From c6da2cfeb05178a11c6d062a06f8078150ee492f Mon Sep 17 00:00:00 2001 From: codeworkx Date: Sat, 2 Jun 2012 13:09:29 +0200 Subject: samsung update 1 --- arch/arm/mvp/Kconfig | 24 + arch/arm/mvp/Makefile | 3 + arch/arm/mvp/commkm/COPYING | 341 ++++ arch/arm/mvp/commkm/Kbuild | 9 + arch/arm/mvp/commkm/Makefile | 1 + arch/arm/mvp/commkm/check_kconfig.c | 91 + arch/arm/mvp/commkm/comm.c | 1457 ++++++++++++++ arch/arm/mvp/commkm/comm.h | 171 ++ arch/arm/mvp/commkm/comm_ev.h | 51 + arch/arm/mvp/commkm/comm_ev_kernel.c | 136 ++ arch/arm/mvp/commkm/comm_os.h | 150 ++ arch/arm/mvp/commkm/comm_os_linux.c | 371 ++++ arch/arm/mvp/commkm/comm_os_linux.h | 699 +++++++ arch/arm/mvp/commkm/comm_os_mod_linux.c | 105 + arch/arm/mvp/commkm/comm_os_mod_ver.h | 38 + arch/arm/mvp/commkm/comm_svc.c | 421 ++++ arch/arm/mvp/commkm/comm_svc.h | 71 + arch/arm/mvp/commkm/comm_transp.h | 90 + arch/arm/mvp/commkm/comm_transp_impl.h | 165 ++ arch/arm/mvp/commkm/comm_transp_mvp.c | 944 +++++++++ arch/arm/mvp/commkm/fatalerror.h | 126 ++ arch/arm/mvp/commkm/include_check.h | 18 + arch/arm/mvp/commkm/mksck.h | 153 ++ arch/arm/mvp/commkm/mksck_sockaddr.h | 50 + arch/arm/mvp/commkm/mvp.h | 48 + arch/arm/mvp/commkm/mvp_assert.h | 125 ++ arch/arm/mvp/commkm/mvp_compiler.h | 56 + arch/arm/mvp/commkm/mvp_compiler_gcc.h | 87 + arch/arm/mvp/commkm/mvp_types.h | 94 + arch/arm/mvp/commkm/mvpkm_comm_ev.h | 53 + arch/arm/mvp/commkm/nottested.h | 54 + arch/arm/mvp/commkm/platdefx.h | 67 + arch/arm/mvp/commkm/qp.h | 332 ++++ arch/arm/mvp/commkm/utils.h | 172 ++ arch/arm/mvp/commkm/vmid.h | 44 + arch/arm/mvp/mvpkm/COPYING | 341 ++++ arch/arm/mvp/mvpkm/Kbuild | 24 + arch/arm/mvp/mvpkm/Makefile | 1 + arch/arm/mvp/mvpkm/actions.h | 57 + arch/arm/mvp/mvpkm/arm_as_macros.h | 91 + arch/arm/mvp/mvpkm/arm_defs.h | 54 + arch/arm/mvp/mvpkm/arm_gcc_inline.h | 206 ++ arch/arm/mvp/mvpkm/arm_inline.h | 179 ++ arch/arm/mvp/mvpkm/arm_types.h | 42 + arch/arm/mvp/mvpkm/atomic.h | 88 + arch/arm/mvp/mvpkm/atomic_arm.h | 329 +++ 
arch/arm/mvp/mvpkm/check_kconfig.c | 91 + arch/arm/mvp/mvpkm/comm_os.h | 150 ++ arch/arm/mvp/mvpkm/comm_os_linux.h | 699 +++++++ arch/arm/mvp/mvpkm/comm_transp.h | 90 + arch/arm/mvp/mvpkm/comm_transp_impl.h | 165 ++ arch/arm/mvp/mvpkm/coproc_defs.h | 351 ++++ arch/arm/mvp/mvpkm/cpufreq_kernel.c | 308 +++ arch/arm/mvp/mvpkm/cpufreq_kernel.h | 47 + arch/arm/mvp/mvpkm/exc_defs.h | 67 + arch/arm/mvp/mvpkm/exc_types.h | 53 + arch/arm/mvp/mvpkm/exitstatus.h | 67 + arch/arm/mvp/mvpkm/fatalerror.h | 126 ++ arch/arm/mvp/mvpkm/include_check.h | 18 + arch/arm/mvp/mvpkm/instr_defs.h | 426 ++++ arch/arm/mvp/mvpkm/lowmemkiller_variant.sh | 83 + arch/arm/mvp/mvpkm/lpae_defs.h | 92 + arch/arm/mvp/mvpkm/lpae_types.h | 124 ++ arch/arm/mvp/mvpkm/mksck.h | 153 ++ arch/arm/mvp/mvpkm/mksck_kernel.c | 2589 ++++++++++++++++++++++++ arch/arm/mvp/mvpkm/mksck_kernel.h | 68 + arch/arm/mvp/mvpkm/mksck_shared.c | 343 ++++ arch/arm/mvp/mvpkm/mksck_shared.h | 189 ++ arch/arm/mvp/mvpkm/mksck_sockaddr.h | 50 + arch/arm/mvp/mvpkm/mmu_defs.h | 218 ++ arch/arm/mvp/mvpkm/mmu_types.h | 226 +++ arch/arm/mvp/mvpkm/montimer_kernel.c | 102 + arch/arm/mvp/mvpkm/montimer_kernel.h | 47 + arch/arm/mvp/mvpkm/monva_common.h | 106 + arch/arm/mvp/mvpkm/mutex.h | 107 + arch/arm/mvp/mvpkm/mutex_kernel.c | 480 +++++ arch/arm/mvp/mvpkm/mutex_kernel.h | 41 + arch/arm/mvp/mvpkm/mvp.h | 48 + arch/arm/mvp/mvpkm/mvp_assert.h | 125 ++ arch/arm/mvp/mvpkm/mvp_balloon.h | 217 ++ arch/arm/mvp/mvpkm/mvp_compiler.h | 56 + arch/arm/mvp/mvpkm/mvp_compiler_gcc.h | 87 + arch/arm/mvp/mvpkm/mvp_math.h | 133 ++ arch/arm/mvp/mvpkm/mvp_timer.h | 72 + arch/arm/mvp/mvpkm/mvp_types.h | 94 + arch/arm/mvp/mvpkm/mvp_version.h | 116 ++ arch/arm/mvp/mvpkm/mvpkm_comm_ev.c | 60 + arch/arm/mvp/mvpkm/mvpkm_comm_ev.h | 53 + arch/arm/mvp/mvpkm/mvpkm_kernel.h | 83 + arch/arm/mvp/mvpkm/mvpkm_main.c | 2690 +++++++++++++++++++++++++ arch/arm/mvp/mvpkm/mvpkm_private.h | 97 + arch/arm/mvp/mvpkm/mvpkm_types.h | 49 + arch/arm/mvp/mvpkm/nottested.h | 54 + 
arch/arm/mvp/mvpkm/platdefx.h | 67 + arch/arm/mvp/mvpkm/psr_defs.h | 117 ++ arch/arm/mvp/mvpkm/qp.h | 332 ++++ arch/arm/mvp/mvpkm/qp_common.c | 337 ++++ arch/arm/mvp/mvpkm/qp_host_kernel.c | 574 ++++++ arch/arm/mvp/mvpkm/qp_host_kernel.h | 44 + arch/arm/mvp/mvpkm/tsc.h | 49 + arch/arm/mvp/mvpkm/utils.h | 172 ++ arch/arm/mvp/mvpkm/ve_defs.h | 72 + arch/arm/mvp/mvpkm/vfp_switch.S | 216 ++ arch/arm/mvp/mvpkm/vmid.h | 44 + arch/arm/mvp/mvpkm/worldswitch.h | 381 ++++ arch/arm/mvp/mvpkm/wscalls.h | 165 ++ arch/arm/mvp/pvtcpkm/COPYING | 341 ++++ arch/arm/mvp/pvtcpkm/Kbuild | 9 + arch/arm/mvp/pvtcpkm/Makefile | 1 + arch/arm/mvp/pvtcpkm/check_kconfig.c | 91 + arch/arm/mvp/pvtcpkm/comm.h | 171 ++ arch/arm/mvp/pvtcpkm/comm_os.h | 150 ++ arch/arm/mvp/pvtcpkm/comm_os_linux.c | 371 ++++ arch/arm/mvp/pvtcpkm/comm_os_linux.h | 699 +++++++ arch/arm/mvp/pvtcpkm/comm_os_mod_linux.c | 105 + arch/arm/mvp/pvtcpkm/comm_os_mod_ver.h | 38 + arch/arm/mvp/pvtcpkm/comm_svc.h | 71 + arch/arm/mvp/pvtcpkm/comm_transp.h | 90 + arch/arm/mvp/pvtcpkm/include_check.h | 18 + arch/arm/mvp/pvtcpkm/pvtcp.c | 587 ++++++ arch/arm/mvp/pvtcpkm/pvtcp.h | 458 +++++ arch/arm/mvp/pvtcpkm/pvtcp_off.c | 81 + arch/arm/mvp/pvtcpkm/pvtcp_off.h | 219 ++ arch/arm/mvp/pvtcpkm/pvtcp_off_io_linux.c | 831 ++++++++ arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c | 2858 +++++++++++++++++++++++++++ arch/arm/mvp/pvtcpkm/pvtcp_off_linux.h | 226 +++ arch/arm/mvp/pvtcpkm/pvtcp_off_linux_shim.S | 70 + 127 files changed, 29994 insertions(+) create mode 100644 arch/arm/mvp/Kconfig create mode 100644 arch/arm/mvp/Makefile create mode 100644 arch/arm/mvp/commkm/COPYING create mode 100644 arch/arm/mvp/commkm/Kbuild create mode 100644 arch/arm/mvp/commkm/Makefile create mode 100644 arch/arm/mvp/commkm/check_kconfig.c create mode 100644 arch/arm/mvp/commkm/comm.c create mode 100644 arch/arm/mvp/commkm/comm.h create mode 100644 arch/arm/mvp/commkm/comm_ev.h create mode 100644 arch/arm/mvp/commkm/comm_ev_kernel.c create mode 100644 
arch/arm/mvp/commkm/comm_os.h create mode 100644 arch/arm/mvp/commkm/comm_os_linux.c create mode 100644 arch/arm/mvp/commkm/comm_os_linux.h create mode 100644 arch/arm/mvp/commkm/comm_os_mod_linux.c create mode 100644 arch/arm/mvp/commkm/comm_os_mod_ver.h create mode 100644 arch/arm/mvp/commkm/comm_svc.c create mode 100644 arch/arm/mvp/commkm/comm_svc.h create mode 100644 arch/arm/mvp/commkm/comm_transp.h create mode 100644 arch/arm/mvp/commkm/comm_transp_impl.h create mode 100644 arch/arm/mvp/commkm/comm_transp_mvp.c create mode 100644 arch/arm/mvp/commkm/fatalerror.h create mode 100644 arch/arm/mvp/commkm/include_check.h create mode 100644 arch/arm/mvp/commkm/mksck.h create mode 100644 arch/arm/mvp/commkm/mksck_sockaddr.h create mode 100644 arch/arm/mvp/commkm/mvp.h create mode 100644 arch/arm/mvp/commkm/mvp_assert.h create mode 100644 arch/arm/mvp/commkm/mvp_compiler.h create mode 100644 arch/arm/mvp/commkm/mvp_compiler_gcc.h create mode 100644 arch/arm/mvp/commkm/mvp_types.h create mode 100644 arch/arm/mvp/commkm/mvpkm_comm_ev.h create mode 100644 arch/arm/mvp/commkm/nottested.h create mode 100644 arch/arm/mvp/commkm/platdefx.h create mode 100644 arch/arm/mvp/commkm/qp.h create mode 100644 arch/arm/mvp/commkm/utils.h create mode 100644 arch/arm/mvp/commkm/vmid.h create mode 100644 arch/arm/mvp/mvpkm/COPYING create mode 100644 arch/arm/mvp/mvpkm/Kbuild create mode 100644 arch/arm/mvp/mvpkm/Makefile create mode 100644 arch/arm/mvp/mvpkm/actions.h create mode 100644 arch/arm/mvp/mvpkm/arm_as_macros.h create mode 100644 arch/arm/mvp/mvpkm/arm_defs.h create mode 100644 arch/arm/mvp/mvpkm/arm_gcc_inline.h create mode 100644 arch/arm/mvp/mvpkm/arm_inline.h create mode 100644 arch/arm/mvp/mvpkm/arm_types.h create mode 100644 arch/arm/mvp/mvpkm/atomic.h create mode 100644 arch/arm/mvp/mvpkm/atomic_arm.h create mode 100644 arch/arm/mvp/mvpkm/check_kconfig.c create mode 100644 arch/arm/mvp/mvpkm/comm_os.h create mode 100644 arch/arm/mvp/mvpkm/comm_os_linux.h create mode 
100644 arch/arm/mvp/mvpkm/comm_transp.h create mode 100644 arch/arm/mvp/mvpkm/comm_transp_impl.h create mode 100644 arch/arm/mvp/mvpkm/coproc_defs.h create mode 100644 arch/arm/mvp/mvpkm/cpufreq_kernel.c create mode 100644 arch/arm/mvp/mvpkm/cpufreq_kernel.h create mode 100644 arch/arm/mvp/mvpkm/exc_defs.h create mode 100644 arch/arm/mvp/mvpkm/exc_types.h create mode 100644 arch/arm/mvp/mvpkm/exitstatus.h create mode 100644 arch/arm/mvp/mvpkm/fatalerror.h create mode 100644 arch/arm/mvp/mvpkm/include_check.h create mode 100644 arch/arm/mvp/mvpkm/instr_defs.h create mode 100644 arch/arm/mvp/mvpkm/lowmemkiller_variant.sh create mode 100644 arch/arm/mvp/mvpkm/lpae_defs.h create mode 100644 arch/arm/mvp/mvpkm/lpae_types.h create mode 100644 arch/arm/mvp/mvpkm/mksck.h create mode 100644 arch/arm/mvp/mvpkm/mksck_kernel.c create mode 100644 arch/arm/mvp/mvpkm/mksck_kernel.h create mode 100644 arch/arm/mvp/mvpkm/mksck_shared.c create mode 100644 arch/arm/mvp/mvpkm/mksck_shared.h create mode 100644 arch/arm/mvp/mvpkm/mksck_sockaddr.h create mode 100644 arch/arm/mvp/mvpkm/mmu_defs.h create mode 100644 arch/arm/mvp/mvpkm/mmu_types.h create mode 100644 arch/arm/mvp/mvpkm/montimer_kernel.c create mode 100644 arch/arm/mvp/mvpkm/montimer_kernel.h create mode 100644 arch/arm/mvp/mvpkm/monva_common.h create mode 100644 arch/arm/mvp/mvpkm/mutex.h create mode 100644 arch/arm/mvp/mvpkm/mutex_kernel.c create mode 100644 arch/arm/mvp/mvpkm/mutex_kernel.h create mode 100644 arch/arm/mvp/mvpkm/mvp.h create mode 100644 arch/arm/mvp/mvpkm/mvp_assert.h create mode 100644 arch/arm/mvp/mvpkm/mvp_balloon.h create mode 100644 arch/arm/mvp/mvpkm/mvp_compiler.h create mode 100644 arch/arm/mvp/mvpkm/mvp_compiler_gcc.h create mode 100644 arch/arm/mvp/mvpkm/mvp_math.h create mode 100644 arch/arm/mvp/mvpkm/mvp_timer.h create mode 100644 arch/arm/mvp/mvpkm/mvp_types.h create mode 100644 arch/arm/mvp/mvpkm/mvp_version.h create mode 100644 arch/arm/mvp/mvpkm/mvpkm_comm_ev.c create mode 100644 
arch/arm/mvp/mvpkm/mvpkm_comm_ev.h create mode 100644 arch/arm/mvp/mvpkm/mvpkm_kernel.h create mode 100644 arch/arm/mvp/mvpkm/mvpkm_main.c create mode 100644 arch/arm/mvp/mvpkm/mvpkm_private.h create mode 100644 arch/arm/mvp/mvpkm/mvpkm_types.h create mode 100644 arch/arm/mvp/mvpkm/nottested.h create mode 100644 arch/arm/mvp/mvpkm/platdefx.h create mode 100644 arch/arm/mvp/mvpkm/psr_defs.h create mode 100644 arch/arm/mvp/mvpkm/qp.h create mode 100644 arch/arm/mvp/mvpkm/qp_common.c create mode 100644 arch/arm/mvp/mvpkm/qp_host_kernel.c create mode 100644 arch/arm/mvp/mvpkm/qp_host_kernel.h create mode 100644 arch/arm/mvp/mvpkm/tsc.h create mode 100644 arch/arm/mvp/mvpkm/utils.h create mode 100644 arch/arm/mvp/mvpkm/ve_defs.h create mode 100644 arch/arm/mvp/mvpkm/vfp_switch.S create mode 100644 arch/arm/mvp/mvpkm/vmid.h create mode 100644 arch/arm/mvp/mvpkm/worldswitch.h create mode 100644 arch/arm/mvp/mvpkm/wscalls.h create mode 100644 arch/arm/mvp/pvtcpkm/COPYING create mode 100644 arch/arm/mvp/pvtcpkm/Kbuild create mode 100644 arch/arm/mvp/pvtcpkm/Makefile create mode 100644 arch/arm/mvp/pvtcpkm/check_kconfig.c create mode 100644 arch/arm/mvp/pvtcpkm/comm.h create mode 100644 arch/arm/mvp/pvtcpkm/comm_os.h create mode 100644 arch/arm/mvp/pvtcpkm/comm_os_linux.c create mode 100644 arch/arm/mvp/pvtcpkm/comm_os_linux.h create mode 100644 arch/arm/mvp/pvtcpkm/comm_os_mod_linux.c create mode 100644 arch/arm/mvp/pvtcpkm/comm_os_mod_ver.h create mode 100644 arch/arm/mvp/pvtcpkm/comm_svc.h create mode 100644 arch/arm/mvp/pvtcpkm/comm_transp.h create mode 100644 arch/arm/mvp/pvtcpkm/include_check.h create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp.c create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp.h create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off.c create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off.h create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off_io_linux.c create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off_linux.h 
create mode 100644 arch/arm/mvp/pvtcpkm/pvtcp_off_linux_shim.S (limited to 'arch/arm/mvp') diff --git a/arch/arm/mvp/Kconfig b/arch/arm/mvp/Kconfig new file mode 100644 index 0000000..4f2c5c7 --- /dev/null +++ b/arch/arm/mvp/Kconfig @@ -0,0 +1,24 @@ +config VMWARE_MVP + bool "Build VMware Mobile Virtualization Platform modules" + select MODULES + select MODULE_UNLOAD + select SYSFS + select NAMESPACES + select NET_NS + select INET + select IPV6 + select TUN + select NETFILTER + help + Say Y here to enable the building of kernel modules + for VMware's Mobile Virtualization Platform + +config VMWARE_MVP_DEBUG + bool "Enable debug for VMware Mobile Virtualization Platform modules" + depends on VMWARE_MVP + select IKCONFIG + select IKCONFIG_PROC + help + Say Y here to enable debug on kernel modules + for VMware's Mobile Virtualization Platform. + This should be enabled for eng or userdebug builds. diff --git a/arch/arm/mvp/Makefile b/arch/arm/mvp/Makefile new file mode 100644 index 0000000..cd38d75 --- /dev/null +++ b/arch/arm/mvp/Makefile @@ -0,0 +1,3 @@ +obj-y += mvpkm/ +obj-y += commkm/ +obj-y += pvtcpkm/ diff --git a/arch/arm/mvp/commkm/COPYING b/arch/arm/mvp/commkm/COPYING new file mode 100644 index 0000000..10828e0 --- /dev/null +++ b/arch/arm/mvp/commkm/COPYING @@ -0,0 +1,341 @@ + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. 
This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. 
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. 
Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. 
It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/arch/arm/mvp/commkm/Kbuild b/arch/arm/mvp/commkm/Kbuild new file mode 100644 index 0000000..de43a5c --- /dev/null +++ b/arch/arm/mvp/commkm/Kbuild @@ -0,0 +1,9 @@ +# Warning: autogenerated +obj-m := commkm.o +commkm-objs := check_kconfig.o comm_ev_kernel.o comm.o comm_os_linux.o comm_os_mod_linux.o comm_svc.o comm_transp_mvp.o + +ccflags-y += -fno-pic -fno-dwarf2-cfi-asm -march=armv7-a -D__linux__ +ccflags-y += -DCOMM_BUILDING_SERVER +ccflags-y += -mfpu=neon -DIN_MODULE -DGPLED_CODE +ccflags-y += --std=gnu89 -O2 -g2 -ggdb -mapcs -fno-optimize-sibling-calls -mno-sched-prolog +ccflags-$(CONFIG_VMWARE_MVP_DEBUG) += -DMVP_DEBUG diff --git a/arch/arm/mvp/commkm/Makefile b/arch/arm/mvp/commkm/Makefile new file mode 100644 index 0000000..16eb389 --- /dev/null +++ b/arch/arm/mvp/commkm/Makefile @@ -0,0 +1 @@ +# Warning: autogenerated diff --git a/arch/arm/mvp/commkm/check_kconfig.c b/arch/arm/mvp/commkm/check_kconfig.c new file mode 100644 index 0000000..0867d74 --- /dev/null +++ b/arch/arm/mvp/commkm/check_kconfig.c @@ -0,0 +1,91 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
#error "MVP requires host kernel version 2.6.35 or newer"
TUN device support (CONFIG_TUN)" +#endif + +#if !defined(CONFIG_NETFILTER) && !defined(PVTCP_DISABLE_NETFILTER) +#error "MVP networking support requires netfilter support (CONFIG_NETFILTER)" +#endif + +/* Force /proc/config.gz support for eng/userdebug builds */ +#ifdef MVP_DEBUG +#if !defined(CONFIG_IKCONFIG) || !defined(CONFIG_IKCONFIG_PROC) +#error "MVP kernel /proc/config.gz support required for debuggability (CONFIG_IKCONFIG_PROC)" +#endif +#endif + +/* Sanity check we're only dealing with the memory hotplug + migrate and/or + * compaction combo */ +#ifdef CONFIG_MIGRATION +#if defined(CONFIG_NUMA) || defined(CONFIG_CPUSETS) || defined(CONFIG_MEMORY_FAILURE) +#error "MVP not tested with migration features other than CONFIG_MEMORY_HOTPLUG and CONFIG_COMPACTION" +#endif +#endif diff --git a/arch/arm/mvp/commkm/comm.c b/arch/arm/mvp/commkm/comm.c new file mode 100644 index 0000000..8fd591c --- /dev/null +++ b/arch/arm/mvp/commkm/comm.c @@ -0,0 +1,1457 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Communication functions based on transport functionality. 
+ */ + +#include "comm.h" +#include "comm_transp_impl.h" + + +/* Constant and macro definitions */ + +#if defined(COMM_INSTRUMENT) +static CommOSAtomic commMaxCoalesceSize; +static CommOSAtomic commPacketsReceived; +static CommOSAtomic commCommittedPacketsReceived; +static CommOSAtomic commOpCalls; +#endif + +#define COMM_DISPATCH_EXTRA_WRITER_WAKEUP 1 + +#define COMM_CHANNEL_MAX_CAPACITY 2048 +#define COMM_CHANNEL_FREE 0x0 +#define COMM_CHANNEL_INITIALIZED 0x1 +#define COMM_CHANNEL_OPENED 0x2 +#define COMM_CHANNEL_ACTIVE 0x4 +#define COMM_CHANNEL_ZOMBIE 0x8 + +#define CommIsFree(chan) \ + ((chan)->lifecycleState == COMM_CHANNEL_FREE) +#define CommIsInitialized(chan) \ + ((chan)->lifecycleState == COMM_CHANNEL_INITIALIZED) +#define CommIsOpened(chan) \ + ((chan)->lifecycleState == COMM_CHANNEL_OPENED) +#define CommIsActive(chan) \ + ((chan)->lifecycleState == COMM_CHANNEL_ACTIVE) +#define CommIsZombie(chan) \ + ((chan)->lifecycleState == COMM_CHANNEL_ZOMBIE) + +#define CommSetFree(chan) \ + SetLifecycleState(chan, COMM_CHANNEL_FREE) +#define CommSetInitialized(chan) \ + SetLifecycleState(chan, COMM_CHANNEL_INITIALIZED) +#define CommSetOpened(chan) \ + SetLifecycleState(chan, COMM_CHANNEL_OPENED) +#define CommSetActive(chan) \ + SetLifecycleState(chan, COMM_CHANNEL_ACTIVE) +#define CommSetZombie(chan) \ + SetLifecycleState(chan, COMM_CHANNEL_ZOMBIE) + +#define CommGlobalLock() CommOS_SpinLock(&commGlobalLock) +#define CommGlobalUnlock() CommOS_SpinUnlock(&commGlobalLock) +#define CommGlobalLockBH() CommOS_SpinLockBH(&commGlobalLock) +#define CommGlobalUnlockBH() CommOS_SpinUnlockBH(&commGlobalLock) + +#define DispatchTrylock(chan) CommOS_MutexTrylock(&(chan)->dispatchMutex) +#define DispatchUnlock(chan) CommOS_MutexUnlock(&(chan)->dispatchMutex) + +#define WriteLock(chan) CommOS_MutexLock(&(chan)->writeMutex) +#define WriteTrylock(chan) CommOS_MutexTrylock(&(chan)->writeMutex) +#define WriteUnlock(chan) CommOS_MutexUnlock(&(chan)->writeMutex) + +#define 
StateLock(chan) CommOS_MutexLock(&(chan)->stateMutex) +#define StateTrylock(chan) CommOS_MutexTrylock(&(chan)->stateMutex) +#define StateUnlock(chan) CommOS_MutexUnlock(&(chan)->stateMutex) + +#define CommHoldInit(chan) CommOS_WriteAtomic(&(chan)->holds, 0) +#define CommHold(chan) CommOS_AddReturnAtomic(&(chan)->holds, 1) +#define CommRelease(chan) CommOS_SubReturnAtomic(&(chan)->holds, 1) +#define CommIsHeld(chan) (CommOS_ReadAtomic(&(chan)->holds) > 0) + +#define PacketLenOverLimit(chan, len) \ + (((len) - sizeof (CommPacket)) > ((chan)->transpArgs.capacity / 4)) + + +/* + * Data structure describing the offload <-> paravirtualized module + * communication channel. + */ + +struct CommChannelPriv { + CommOSAtomic holds; // Active readers and writers + CommTranspInitArgs transpArgs; // Transport initialization arguments + CommTransp transp; // Transport handle + CommOSMutex dispatchMutex; // Dispatch mutex + CommOSMutex writeMutex; // Non-BH write mutex + CommOSMutex stateMutex; // Upper-layer state mutex + CommOSWaitQueue availableWaitQ; // Available write space wait data + unsigned int desiredWriteSpace; // Size of write space needed + const CommImpl *impl; // Implementation + unsigned int implNmbOps; // Number of implementation operations + unsigned int lifecycleState; // Lifecycle state + void *state; // Upper layer-specific state +}; + + +static volatile int running; // Initialized and running. +static CommOSWaitQueue exitWaitQ; // Exit wait queue. +static CommOSSpinlock commGlobalLock; // Global lock. + + +/* Communication channel slots. */ + +static unsigned int commChannelCapacity; // Maximum number of channels. +static unsigned int commChannelSize; // Current size of channel array. +static unsigned int commChannelAllocated; // Nmb. entries currently in use. +static struct CommChannelPriv *commChannels; // Allocated channel array. 
+ + +/** + * @brief Callback function called when the other side created a transport + * handle to which we need to potentially attach. + * @param[in,out] transpArgs arguments used when shared memory area was created. + * @param probeData our callback data, an implementation block. + * @return 0 if successful, -1 otherwise. + * @sideeffects May allocate a channel. + */ + +static int +DefaultTranspListener(CommTranspInitArgs *transpArgs, + void *probeData) +{ + int rc = -1; + const int inBH = 1; + const CommImpl *impl; + + if (!transpArgs || !probeData) { + CommOS_Debug(("%s: NULL args [0x%p, 0x%p].\n", + __FUNCTION__, transpArgs, probeData)); + goto out; + } + + impl = probeData; + CommOS_Debug(("%s: Received attach info [%u,%u,%u:%u].\n", + __FUNCTION__, + transpArgs->capacity, transpArgs->type, + transpArgs->id.d32[0], transpArgs->id.d32[1])); + + if (impl->checkArgs(transpArgs)) { + goto out; + } + transpArgs->mode = COMM_TRANSP_INIT_ATTACH; /* Ensure we attach. */ + + /* We recognized it, so don't let others waste any time. Even if we fail. */ + + rc = 0; + if (Comm_Alloc(transpArgs, impl, inBH, NULL)) { + impl->closeNtf(impl->closeNtfData, transpArgs, inBH); + CommOS_Log(("%s: Can't allocate new channel!\n", __FUNCTION__)); + } + +out: + return rc; +} + + +/** + * @brief Sets the lifecycle state of a channel entry + * @param channel channel to update + * @param newState state to update to + */ + +static inline void +SetLifecycleState(CommChannel channel, + unsigned int newState) +{ + + channel->lifecycleState = newState; +} + + +/* Wait conditions: functions returning 1: true, 0: false, < 0: error. */ + +/** + * @brief Wait condition function to check whether module can be unloaded. 
+ * @param arg1 dummy + * @param arg2 dummy + * @return 1 if no channels are currently allocated, 0 if there are + */ + +static int +ExitCondition(void *arg1, + void *arg2) +{ + unsigned int i; + int rc; + + (void)arg1; + (void)arg2; + CommOS_Debug(("%s: running [%d] " + "commChannelAllocated [%u] commChannelSize [%u].\n", + __FUNCTION__, running, commChannelAllocated, commChannelSize)); + rc = !running && (commChannelAllocated == 0); + if (!rc) { + for (i = 0; i < commChannelCapacity; i++) { + CommOS_Debug(("%s: channel[%u] state [0x%x].\n", + __FUNCTION__, i, commChannels[i].lifecycleState)); + } + } + return rc; +} + + +/** + * @brief Wait condition function to check available write space. + * @param arg1 pointer to CommChannel struct + * @param arg2 size argument + * @return 1 if there is enough write space, 0 if not, -ENOMEM if comm down. + */ + +static int +WriteSpaceCondition(void *arg1, + void *arg2) +{ + CommChannel channel = arg1; + + if (!CommIsActive(channel)) { + return -ENOMEM; + } + return channel->desiredWriteSpace < CommTransp_EnqueueSpace(channel->transp); +} + + +/** + * @brief Registers an implementation block used when attaching to channels + * in response to transport attach events. + * @param impl implementation block. + * @return 0 if successful, non-zero otherwise. + */ + +int +Comm_RegisterImpl(const CommImpl *impl) +{ + CommTranspListener listener = { + .probe = DefaultTranspListener, + .probeData = (void *)impl + }; + + return CommTransp_Register(&listener); +} + + +/** + * @brief Unregisters an implementation block used when attaching to channels + * in response to transport attach events. + * @param impl implementation block. + */ + +void +Comm_UnregisterImpl(const CommImpl *impl) +{ + CommTranspListener listener = { + .probe = DefaultTranspListener, + .probeData = (void *)impl + }; + + CommTransp_Unregister(&listener); +} + + +/** + * @brief Allocates and initializes comm global state. Single-threaded use. 
 * @brief Initiates and finishes comm global state deallocations.
+                    CommOS_ReadAtomic(&commOpCalls)));
+ * @sideeffects Initializes the communications channel with given parameters + */ + +int +Comm_Alloc(const CommTranspInitArgs *transpArgs, + const CommImpl *impl, + int inBH, + CommChannel *newChannel) +{ + unsigned int i; + CommChannel channel = NULL; + int restoreSize = 0; + int modHeld = 0; + int rc = -1; + + if (inBH) { + CommGlobalLock(); + } else { + CommGlobalLockBH(); + } + + if (!running || !transpArgs || !impl) { + goto out; + } + + if (CommOS_ModuleGet(impl->owner)) { + goto out; + } + modHeld = 1; + + for (i = 0; i < commChannelSize; i++) { + /* + * Check if this channel is already allocated. We don't match against + * ANY because those channels are in the process of being opened; after + * that happens, they'll get proper IDs. + */ + + if (!CommIsFree(&commChannels[i]) && + (transpArgs->id.d64 != COMM_TRANSP_ID_64_ANY) && + (transpArgs->id.d64 == commChannels[i].transpArgs.id.d64)) { + goto out; + } + if (!channel && CommIsFree(&commChannels[i])) { + channel = &commChannels[i]; + } + } + if (!channel) { + if (commChannelSize == commChannelCapacity) { + goto out; + } + channel = &commChannels[commChannelSize]; + commChannelSize++; + restoreSize = 1; + } + + if (channel->transp) { /* Inconsistency! */ + if (restoreSize) { + commChannelSize--; + } + goto out; + } + + channel->transpArgs = *transpArgs; + channel->impl = impl; + for (i = 0; impl->operations[i]; i++) { + ; + } + channel->implNmbOps = i; + channel->desiredWriteSpace = -1U; + commChannelAllocated++; + CommSetInitialized(channel); + if (newChannel) { + *newChannel = channel; + } + rc = 0; + CommOS_ScheduleDisp(); + +out: + if (inBH) { + CommGlobalUnlock(); + } else { + CommGlobalUnlockBH(); + } + if (rc && modHeld) { + CommOS_ModulePut(impl->owner); + } + return rc; +} + + +/** + * @brief Zombifies a channel. May fail if channel isn't active. + * @param[in,out] channel channel to zombify. + * @param inBH non-zero if called in bottom half. 
+ * @return zero if channel zombified, non-zero otherwise. + */ + +int +Comm_Zombify(CommChannel channel, + int inBH) +{ + int rc = -1; + + if (!running) { + goto out; + } + if (inBH) { + CommGlobalLock(); + } else { + CommGlobalLockBH(); + } + if (CommIsActive(channel) || CommIsOpened(channel)) { + CommSetZombie(channel); + rc = 0; + } + if (inBH) { + CommGlobalUnlock(); + } else { + CommGlobalUnlockBH(); + } + +out: + if (!rc) { + CommOS_ScheduleDisp(); + } + return rc; +} + + +/** + * @brief Reports whether a channel is active. + * @param channel channel to report on. + * @return non-zero if channel active, zero otherwise. + */ + +int +Comm_IsActive(CommChannel channel) +{ + return channel ? CommIsActive(channel) : 0; +} + + +/** + * @brief Wakes up potential writer on the channel. This function must be + * called on an active channel, with either the dispatch lock taken, or + * the channel ref count incremented. + * @param channel CommChannel structure on which potential writer waits. + */ + +static inline void +WakeUpWriter(CommChannel channel) +{ + if (WriteSpaceCondition(channel, NULL)) { + CommOS_WakeUp(&channel->availableWaitQ); + } +} + + +/** + * @brief Transport event handler for comm channels. + * @param transp transport handle. + * @param event type of event. + * @param data callback data. + * @sideeffects may put the channel into zombie state, or schedule it for I/O. + */ + +static void +TranspEventHandler(CommTransp transp, + CommTranspIOEvent event, + void *data) +{ + CommChannel channel = (CommChannel)data; + + switch (event) { + case COMM_TRANSP_IO_DETACH: + CommOS_Debug(("%s: Detach event. Zombifying channel.\n", __FUNCTION__)); + Comm_Zombify(channel, 1); + break; + + case COMM_TRANSP_IO_IN: + case COMM_TRANSP_IO_INOUT: + /* + * The dispatch threads may not have been started because either: + * a) we're not running in the CommSvc service, or + * b) the Comm client didn't create them explicitly (CommOS_StartIO()). 
+ * + * If so, the CommOS_ScheduleDisp() call is ineffective. This is + * the intended behavior: the client obviously wants to call the Comm + * dispatch function(s) directly. + */ + + CommOS_ScheduleDisp(); + break; + + case COMM_TRANSP_IO_OUT: + CommHold(channel); + if (CommIsActive(channel)) { + WakeUpWriter(channel); + } + CommRelease(channel); + if (CommIsZombie(channel)) { + /* + * After releasing the hold on the channel, we must check if it was + * set to zombie and the dispatcher was supposed to nuke it. If the + * dispatcher had made its run while we were holding the channel, it + * gave up. So schedule it. + */ + + CommOS_ScheduleDisp(); + } + break; + + default: + CommOS_Debug(("%s: Unhandled event [%u, %p, %p].\n", + __FUNCTION__, event, transp, data)); + } +} + + +/** + * @brief Destroys upper layer state, unregisters event handlers and + * detaches from or deletes shared memory. + * @param[in,out] channel CommChannel structure to close. + */ + +static void +CommClose(CommChannel channel) +{ + const CommImpl *impl = channel->impl; + + StateLock(channel); + if (impl->stateDtor && channel->state) { + impl->stateDtor(channel->state); + } + channel->state = NULL; + StateUnlock(channel); + + CommOS_ModulePut(impl->owner); + + if (channel->transp) { + CommTransp_Close(channel->transp); + channel->transp = NULL; + } + + CommGlobalLockBH(); + CommSetFree(channel); + commChannelAllocated--; + if (channel == &commChannels[commChannelSize - 1]) { + commChannelSize--; + } + CommGlobalUnlockBH(); + if (!running && (commChannelAllocated == 0)) { + CommOS_WakeUp(&exitWaitQ); + } +} + + +/** + * @brief Allocates upper layer state, registers transport event handler + * and creates or attaches to shared memory. + * @param[in,out] channel CommChannel structure to open. + * @return zero if successful, -1 otherwise + * @sideeffects Memory may be allocated, event handlers registered and + * QP allocated or attached to. 
+ */ + +static int +CommOpen(CommChannel channel) +{ + int rc = -1; + CommTranspEvent transpEvent = { + .ioEvent = TranspEventHandler, + .ioEventData = channel + }; + const CommImpl *impl; + + if (!channel || !CommIsInitialized(channel)) { + return rc; + } + + if (!running) { /* Ok, toggle it back to FREE. */ + goto out; + } + + impl = channel->impl; + if (impl->stateCtor) { + channel->state = impl->stateCtor(channel); + if (!channel->state) { + goto out; + } + } + + if (!CommTransp_Open(&channel->transp, &channel->transpArgs, &transpEvent)) { + rc = 0; + } else { + channel->transp = NULL; + } + +out: + if (!rc) { + CommSetOpened(channel); + } else { + CommClose(channel); + } + return rc; +} + + +/** + * @brief Retrieves a channel's transport initialization arguments. + * It doesn't lock, the caller must ensure the channel may be accessed. + * @param channel CommChannel structure to get initialization arguments from. + * @return initialization arguments used to allocate/attach to channel. + */ + +CommTranspInitArgs +Comm_GetTranspInitArgs(CommChannel channel) +{ + if (!channel) { + CommTranspInitArgs res = { .capacity = 0 }; + + return res; + } + return channel->transpArgs; +} + + +/** + * @brief Retrieves upper layer state (pointer). It doesn't lock, the caller + * must ensure the channel may be accessed. + * @param channel CommChannel structure to get state from. + * @return pointer to upper layer state. + */ + +void * +Comm_GetState(CommChannel channel) +{ + if (!channel) { + return NULL; + } + return channel->state; +} + + +/** + * @brief Main input processing function operating on a given channel. + * @param channel CommChannel structure to process. + * @return number of processed channels (0 or 1), or -1 if channel closed. + * @sideeffects Lifecycle states are transitioned to and from. Channel may + * be opened or destroyed, waiting writers may be woken up, and input + * may be handed off to operation callbacks. 
+      CommOS_AddReturnAtomic(&commPacketsReceived, 1);
+ */ + + if (memcmp(&packet.opCode, &firstPacket.opCode, + sizeof packet - offsetof(CommPacket, opCode)) || + PacketLenOverLimit(channel, firstPacket.len + dataLen)) { + break; + } + } + + if (dataLen == 0) { + /* + * Received equivalent packet with zero-sized payload. This may + * happen in certain cases, such as pvtcp forwarding zero-sized + * datagrams. So don't break the loop, but keep going for as + * along as we can. + */ + + vec[vecLen].iov_base = NULL; + goto dequeueCommit; + } + + /* The packet has a payload (dataLen > 0). */ + + if (!(vec[vecLen].iov_base = channel->impl->dataAlloc(dataLen))) { + /* + * We treat out-of-(net?-)memory errors as "nothing to read". + * Memory pressure may either subside, in which case a future + * read may be successful, or be severe enough for the kernel + * to oops, anyway. Leave packet uncommitted. + */ + + CommOS_Debug(("%s: COULD NOT ALLOC PAYLOAD BYTES!\n", + __FUNCTION__)); + rc = vecLen == 0 ? 0 : 1; + break; + } + + /* Read payload and commit (packet and payload). */ + + rc = CommTransp_DequeueSegment(channel->transp, + vec[vecLen].iov_base, dataLen); + if (rc != dataLen) { + channel->impl->dataFree(vec[vecLen].iov_base); + CommOS_Log(("%s: BOOG -- COULD NOT DEQUEUE PAYLOAD! [%d != %u]", + __FUNCTION__, rc, dataLen)); + rc = -1; /* Fatal protocol error, close down comm. */ + break; + } + rc = 1; + +dequeueCommit: + CommTransp_DequeueCommit(channel->transp); +#if defined(COMM_INSTRUMENT) + CommOS_AddReturnAtomic(&commCommittedPacketsReceived, 1); +#endif + vec[vecLen].iov_len = dataLen; + if (vecLen > 0) { + firstPacket.len += dataLen; + if (packet.flags) { + /* Update to latest flags _iff_ latter non-zero. 
*/ + + firstPacket.flags = packet.flags; + } + } +#if defined(COMM_INSTRUMENT) + if (firstPacket.len > + CommOS_ReadAtomic(&commMaxCoalesceSize)) { + CommOS_WriteAtomic(&commMaxCoalesceSize, firstPacket.len); + } +#endif + if (COMM_OPF_TEST_ERR(packet.flags)) { + /* If error bit is set, we're done (no more coalescing). */ + + vecLen++; + break; + } + } + + if (rc <= 0) { + if (rc < 0) { + zombify = 1; + rc = 1; + } + goto outUnlockAndFreeIovec; + } + +#if defined(COMM_DISPATCH_EXTRA_WRITER_WAKEUP) + /* Check again if we need to wake up any writers. */ + + WakeUpWriter(channel); +#endif + + if (firstPacket.opCode >= channel->implNmbOps) { + CommOS_Debug(("%s: Ignoring illegal opCode [%u]!\n", + __FUNCTION__, (unsigned int)firstPacket.opCode)); + CommOS_Debug(("%s: Max opCode: %u\n", + __FUNCTION__, channel->implNmbOps)); + goto outUnlockAndFreeIovec; + } + + /* + * NOTE: + * DispatchUnlock() _must_ be called from the operation callback. + * The reason for doing so is that, for better scalability, we want + * it released as soon as possible, BUT: + * - releasing it here, before calling into the operation, doesn't + * let the latter coordinate its own lock acquisition, such as + * potential socket or state locks. + * - alternatively, always releasing the dispatch lock after the + * operation completes, ties up the channel and imposes too much + * serialization between sockets. + * - to prevent the channel from being torn down while an operation + * is in flight (and potentially having released the dispatch lock), + * we increment the ref count on the channel and then release it + * after the function returns. + */ + +#if defined(COMM_INSTRUMENT) + CommOS_AddReturnAtomic(&commOpCalls, 1); +#endif + + CommHold(channel); + channel->impl->operations[firstPacket.opCode](channel, channel->state, + &firstPacket, vec, vecLen); + CommRelease(channel); + goto out; /* No unlocking, see comment above. */ + } + + /* Process state changes. 
*/ + + if (CommIsZombie(channel) && !CommIsHeld(channel)) { + CommTranspInitArgs transpArgs = channel->transpArgs; + void (*closeNtf)(void *, + const CommTranspInitArgs *, + int inBH) = channel->impl->closeNtf; + void *closeNtfData = channel->impl->closeNtfData; + + while (WriteTrylock(channel)) { + /* Take the write lock; kick writers out if necessary. */ + + CommOS_Debug(("%s: Kicking writers out...\n", __FUNCTION__)); + CommOS_WakeUp(&channel->availableWaitQ); + } + WriteUnlock(channel); + + CommOS_Debug(("%s: Nuking zombie channel.\n", __FUNCTION__)); + CommClose(channel); + if (closeNtf) { + closeNtf(closeNtfData, &transpArgs, 0); + } + rc = -1; + } else if (CommIsInitialized(channel) && + (channel->impl->openAtMillis <= + CommOS_GetCurrentMillis())) { + if (!CommOpen(channel)) { + if (channel->transpArgs.mode == COMM_TRANSP_INIT_CREATE) { + /* + * If the attach side doesn't get notified, the entry will + * time out in OPENED and will be collected. + * Note that during the CommOpen(Transp_Open) call, the IDs + * in the transpArgs may have changed. Use those. + */ + + CommTransp_Notify(&channel->impl->ntfCenterID, + &channel->transpArgs); + } else { /* Attach mode */ + packet.len = sizeof packet; + packet.opCode = 0xff; + packet.flags = 0x00; + + /* + * Send out control packet, attach ack, and transition straight + * to ACTIVE. + */ + + rc = CommTransp_EnqueueAtomic(channel->transp, + &packet, sizeof packet); + if (rc == sizeof packet) { + /* Guard against potentially concurrent zombify. */ + + CommGlobalLockBH(); + if (CommIsOpened(channel)) { + CommOS_Debug(("%s: Sent attach ack. Activating channel.\n", + __FUNCTION__)); + CommSetActive(channel); + } + CommGlobalUnlockBH(); + } + } + rc = 1; + } + } else if (CommIsOpened(channel) && + (channel->transpArgs.mode == COMM_TRANSP_INIT_CREATE)) { + /* + * Get control packet (opCode == 0xff), attach ack (flags == 0x0), + * or check whether the channel timed out in OPENED. 
+ */ + + rc = CommTransp_DequeueAtomic(channel->transp, + &packet, sizeof packet); + if (rc == sizeof packet) { + void (*activateNtf)(void *activateNtfData, CommChannel) = NULL; + void *activateNtfData = NULL; + + /* Guard against potentially concurrent zombify. */ + + CommGlobalLockBH(); + if (CommIsOpened(channel) && + (packet.opCode == 0xff) && (packet.flags == 0x0)) { + activateNtf = channel->impl->activateNtf; + activateNtfData = channel->impl->activateNtfData; + + CommSetActive(channel); + CommOS_Debug(("%s: Received attach ack. Activating channel.\n", + __FUNCTION__)); + } + CommHold(channel); + CommGlobalUnlockBH(); + + if (activateNtf) { + /* The callback must be short and 'put' the channel when done. */ + + activateNtf(activateNtfData, channel); + } else { + /* Don't forget to put back the channel if no activate callback. */ + + CommRelease(channel); + } + } else if ((channel->impl->openTimeoutAtMillis <= + CommOS_GetCurrentMillis()) || + !running) { + zombify = 1; + CommOS_Debug(("%s: Zombifying expired opened channel.\n", + __FUNCTION__)); + } + rc = 1; + } + DispatchUnlock(channel); + +out: + if (zombify) { + Comm_Zombify(channel, 0); + } + return rc; + +outUnlockAndFreeIovec: + DispatchUnlock(channel); + for ( ; vecLen; ) { + if (vec[--vecLen].iov_base) { + channel->impl->dataFree(vec[vecLen].iov_base); + vec[vecLen].iov_base = NULL; + } + vec[vecLen].iov_len = 0; + } + goto out; +#undef VEC_SIZE +} + + +/** + * @brief Main input processing function operating on all channels. + * @return number of processed channels. + * @sideeffects Lifecycle states are transitioned to and from. Channels may + * be opened and destroyed, waiting writers may be woken up, and input + * may be handed off to operation callbacks. 
+ */ + +unsigned int +Comm_DispatchAll(void) +{ + unsigned int i; + unsigned int hits; + + for (hits = 0, i = 0; running && (i < commChannelSize); i++) { + hits += !!Comm_Dispatch(&commChannels[i]); + } + return hits; +} + + +/** + * @brief Writes a fully formatted packet (containing payload data, if + * applicable) to the specified channel. + * + * The operation may block until enough write space is available, but no + * more than the specified interval. The operation either writes the full + * amount of bytes, or it fails. Warning: callers must _not_ use the + * _Lock/_Unlock functions to bracket calls to this function. + * @param[in,out] channel channel to write to. + * @param packet packet to write. + * @param[in,out] timeoutMillis interval in milliseconds to wait. + * @return number of bytes written, 0 if it times out, -1 error. + * @sideeffects Data may be written to the channel. + */ + +int +Comm_Write(CommChannel channel, + const CommPacket *packet, + unsigned long long *timeoutMillis) +{ + int rc = -1; + int zombify; + + if (!channel || !timeoutMillis || + !packet || (packet->len < sizeof *packet)) { + return rc; + } + + zombify = (*timeoutMillis >= COMM_MAX_TO); + + WriteLock(channel); + if (!CommIsActive(channel)) { + goto out; + } + + CommTransp_EnqueueReset(channel->transp); + channel->desiredWriteSpace = packet->len; + rc = CommOS_DoWait(&channel->availableWaitQ, WriteSpaceCondition, + channel, NULL, timeoutMillis, + (*timeoutMillis != COMM_MAX_TO_UNINT)); + channel->desiredWriteSpace = -1U; + + if (rc) { /* Don't zombify, if it didn't time out. */ + zombify = 0; + } + if (rc == 1) { /* Enough write space, enqueue the packet. */ + rc = CommTransp_EnqueueAtomic(channel->transp, packet, packet->len); + if (rc != packet->len) { + zombify = 1; + rc = -1; /* Fatal protocol error. 
*/ + } + } + +out: + WriteUnlock(channel); + if (zombify) { + Comm_Zombify(channel, 0); + } + return rc; +} + + +/** + * @brief Writes a packet and associated payload data to the specified channel. + * The operation may block until enough write space is available, but + * not more than the specified interval. + * The operation either writes the full amount of bytes, or it fails. + * If there is not enough data in the vector, padding will be added to + * reach the specified packet length, if the flags parameter requires it. + * Users may call this function successively to write several packets + * from large {io|k}vecs, when the flags parameter indicates it. If this + * is the case, the packet header needs to be updated accordingly in + * between calls, for the different (total) lengths. + * Warning: callers must _not_ use the _Lock/_Unlock functions to bracket + * calls to this function. + * @param[in,out] channel the specified channel. + * @param packet packet to write. + * @param[in,out] vec kvec to write from. + * @param[in,out] vecLen length of kvec. + * @param[in,out] timeoutMillis interval in milliseconds to wait. + * @param[in,out] iovOffset must be set to 0 before first call (internal cookie) + * @return number of bytes written, 0 if it timed out, -1 error. + * @sideeffects data may be written to the channel. 
+ */ + +int +Comm_WriteVec(CommChannel channel, + const CommPacket *packet, + struct kvec **vec, + unsigned int *vecLen, + unsigned long long *timeoutMillis, + unsigned int *iovOffset) +{ + int rc; + int zombify; + unsigned int dataLen; + unsigned int vecDataLen; + unsigned int vecNdx; + unsigned int iovLen; + void *iovBase; + + if (!channel || !timeoutMillis || !iovOffset || + !packet || (packet->len < sizeof *packet) || + (((dataLen = packet->len - sizeof *packet) > 0) && + (!*vec || !*vecLen))) { + return -1; + } + + zombify = (*timeoutMillis >= COMM_MAX_TO); + + WriteLock(channel); + if (!CommIsActive(channel)) { + rc = -1; + goto out; + } + + CommTransp_EnqueueReset(channel->transp); + channel->desiredWriteSpace = packet->len; + rc = CommOS_DoWait(&channel->availableWaitQ, WriteSpaceCondition, + channel, NULL, timeoutMillis, + (*timeoutMillis != COMM_MAX_TO_UNINT)); + channel->desiredWriteSpace = -1U; + + if (rc) { /* Don't zombify, if it didn't time out. */ + zombify = 0; + } + if (rc == 1) { /* Enough write space, enqueue the packet. */ + iovLen = 0; + rc = CommTransp_EnqueueSegment(channel->transp, packet, sizeof *packet); + if (rc != sizeof *packet) { + zombify = 1; + rc = -1; /* Fatal protocol error. */ + goto out; + } + + if (dataLen > 0) { + int done = 0; + + for (vecDataLen = 0, vecNdx = 0; vecNdx < *vecLen; vecNdx++) { + if (vecNdx) { + *iovOffset = 0; + } + iovLen = (*vec)[vecNdx].iov_len - *iovOffset; + iovBase = (*vec)[vecNdx].iov_base + *iovOffset; + + if (!iovLen) { + continue; + } + + vecDataLen += iovLen; + if (vecDataLen >= dataLen) { + iovLen -= (vecDataLen - dataLen); + done = 1; + } + + rc = CommTransp_EnqueueSegment(channel->transp, iovBase, iovLen); + if (rc != iovLen) { + zombify = 1; + rc = -1; /* Fatal protocol error, close down comm. 
*/ + goto out; + } + + if (done) { + CommTransp_EnqueueCommit(channel->transp); + if (vecDataLen == dataLen) { + vecNdx++; + *iovOffset = 0; + } else { + *iovOffset += iovLen; + } + *vecLen -= vecNdx; + *vec += vecNdx; + break; + } + } + + if (!done) { + /* + * We exhausted all the bytes in the given vector, but total length + * in the packet header is more than we sent (was available). + * If so, we pad by sending zero bytes to reach length required. + */ + + static char pad[1024]; + unsigned int delta; + unsigned int toSend; + + while (vecDataLen < dataLen) { + delta = dataLen - vecDataLen; + toSend = delta <= sizeof pad ? delta : sizeof pad; + if (toSend == delta) { + done = 1; + } + vecDataLen += toSend; + + rc = CommTransp_EnqueueSegment(channel->transp, pad, toSend); + if (rc != toSend) { + zombify = 1; + rc = -1; /* Fatal protocol error, close down comm. */ + goto out; + } + + if (done) { + CommTransp_EnqueueCommit(channel->transp); + *vec = NULL; + *vecLen = 0; + *iovOffset = 0; + break; + } + } + } + } else { + CommTransp_EnqueueCommit(channel->transp); + } + rc = (int)packet->len; + } else { + CommOS_Debug(("%s: timed out...\n", __FUNCTION__)); + } + +out: + WriteUnlock(channel); + if (zombify) { + Comm_Zombify(channel, 0); + } + return rc; +} + + +/** + * @brief Releases channel ref count. This function is exported for the upper + * layer's 'activateNtf' callback which may be run asynchronously. The + * callback is protected from concurrent channel releases until it calls + * this function. + * @param[in,out] channel CommChannel structure to release. + */ + +void +Comm_Put(CommChannel channel) +{ + if (channel) { + CommRelease(channel); + } +} + + +/** + * @brief Uses the read lock. This function is exported for the upper layer + * such that it can order acquisition of a different lock (socket) with + * the release of the dispatch lock. + * @param[in,out] channel CommChannel structure to unlock. 
+ */ + +void +Comm_DispatchUnlock(CommChannel channel) +{ + if (channel) { + DispatchUnlock(channel); + } +} + + +/** + * @brief Lock the channel for upper layer state. + * This function is exported for the upper layer to ensure that channel + * isn't closed while updating the layer state. Operations using this + * function are expected to be short, since unlike the _Write functions, + * these callers cannot be signaled. + * @param[in,out] channel CommChannel structure to lock. + * @return zero if successful, -1 otherwise. + */ + +int +Comm_Lock(CommChannel channel) +{ + if (!channel) { + return -1; + } + StateLock(channel); + if (!CommIsActive(channel) && !CommIsZombie(channel)) { + StateUnlock(channel); + return -1; + } + return 0; +} + + +/** + * @brief Uses the writer lock. This function is exported for the upper layer + * to ensure that channel isn't closed while updating the layer state. + * See Comm_Lock for details). + * @param[in,out] channel CommChannel structure to unlock. + */ + +void +Comm_Unlock(CommChannel channel) +{ + if (channel) { + StateUnlock(channel); + } +} + + +/** + * @brief Requests events be posted in-line after the function completes. + * @param channel channel object. + * @return current number of requests for inline event posting, or -1 on error. + */ + +unsigned int +Comm_RequestInlineEvents(CommChannel channel) +{ + if (channel->transp) { + return CommTransp_RequestInlineEvents(channel->transp); + } else { + return (unsigned int)-1; + } +} + + +/** + * @brief Requests events be posted out-of-band after the function completes. + * @param channel channel object. + * @return current number of requests for inline event posting, or -1 on error. 
+ */ + +unsigned int +Comm_ReleaseInlineEvents(CommChannel channel) +{ + if (channel->transp) { + return CommTransp_ReleaseInlineEvents(channel->transp); + } else { + return (unsigned int)-1; + } +} diff --git a/arch/arm/mvp/commkm/comm.h b/arch/arm/mvp/commkm/comm.h new file mode 100644 index 0000000..8291ae4 --- /dev/null +++ b/arch/arm/mvp/commkm/comm.h @@ -0,0 +1,171 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Communication functions based on queue pair transport APIs. + * + * Comm is a shared memory-based mechanism that facilitates the implementation + * of kernel components that require host-to-guest, or guest-to-guest + * communication. + * This facility assumes the availability of a minimal shared memory queue pair + * implementation, such as MVP queue pairs or VMCI queue pairs. The latter must + * provide primitives for queue pair creation and destruction, and reading and + * writing from/to queue pairs. + * Comm assumes that the queue pair (transport) layer is not concerned with + * multi-threading, locking or flow control, and does not require such features. 
+ */ + +#ifndef _COMM_H_ +#define _COMM_H_ + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "comm_os.h" +#include "comm_transp.h" + + +/* Default/maximum Comm timeouts (in milliseconds). */ +#define COMM_MAX_TO 60000ULL +#define COMM_MAX_TO_UNINT (COMM_MAX_TO + 1) + +#define COMM_OPF_SET_ERR(flags) ((flags) |= 128) +#define COMM_OPF_CLEAR_ERR(flags) ((flags) &= 127) +#define COMM_OPF_TEST_ERR(flags) ((flags) & 128) + +#define COMM_OPF_SET_VAL(flags, val) ((flags) |= ((val) & 127)) +#define COMM_OPF_GET_VAL(flags) ((flags) & 127) + +/** + * Packet (header) structure. + * NB: Do not change this structure, especially the first three fields; there + * will be consequences. It may be extended, but it's not recommended: all + * operations carry this header, so it's better kept in its minimal form. + */ + +typedef struct CommPacket { + unsigned int len; // Total length + unsigned char flags; // Operation flags + unsigned char opCode; // Operation to call + unsigned short data16; // Auxiliary data + unsigned long long data64; + unsigned long long data64ex; + union { + struct { + unsigned int data32; + unsigned int data32ex; + }; + unsigned long long data64ex2; + }; +} CommPacket; + + +/* Opaque structure representing a communication channel. */ + +struct CommChannelPriv; +typedef struct CommChannelPriv *CommChannel; + + +/* Input operations associated with a comm channel. */ + +typedef void (*CommOperationFunc)(CommChannel channel, + void *state, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen); + + +/* Helper macros */ + +#define COMM_DEFINE_OP(funcName) \ +void \ +funcName(CommChannel channel, \ + void *state, \ + CommPacket *packet, \ + struct kvec *vec, \ + unsigned int vecLen) + + +/* Comm-based implementations. 
*/ + +typedef struct CommImpl { + struct module *owner; + int (*checkArgs)(CommTranspInitArgs *transpArgs); + void *(*stateCtor)(CommChannel channel); + void (*stateDtor)(void *state); + void *(*dataAlloc)(unsigned int dataLen); + void (*dataFree)(void *data); + const CommOperationFunc *operations; + void (*closeNtf)(void *closeNtfData, + const CommTranspInitArgs *transpArgs, + int inBH); + void *closeNtfData; + void (*activateNtf)(void *activateNtfData, + CommChannel channel); + void *activateNtfData; + unsigned long long openAtMillis; + unsigned long long openTimeoutAtMillis; + CommTranspID ntfCenterID; +} CommImpl; + + +int Comm_Init(unsigned int maxChannels); +int Comm_Finish(unsigned long long *timeoutMillis); +int Comm_RegisterImpl(const CommImpl *impl); +void Comm_UnregisterImpl(const CommImpl *impl); +int Comm_IsActive(CommChannel channel); +CommTranspInitArgs Comm_GetTranspInitArgs(CommChannel channel); +void *Comm_GetState(CommChannel channel); +int Comm_Dispatch(CommChannel channel); +unsigned int Comm_DispatchAll(void); +void Comm_Put(CommChannel channel); +void Comm_DispatchUnlock(CommChannel channel); +int Comm_Lock(CommChannel channel); +void Comm_Unlock(CommChannel channel); +int Comm_Zombify(CommChannel channel, int inBH); + +int +Comm_Alloc(const CommTranspInitArgs *transpArgs, + const CommImpl *impl, + int inBH, + CommChannel *newChannel); + + +int +Comm_Write(CommChannel channel, + const CommPacket *packet, + unsigned long long *timeoutMillis); + +int +Comm_WriteVec(CommChannel channel, + const CommPacket *packet, + struct kvec **vec, + unsigned int *vecLen, + unsigned long long *timeoutMillis, + unsigned int *iovOffset); + +unsigned int Comm_RequestInlineEvents(CommChannel channel); +unsigned int Comm_ReleaseInlineEvents(CommChannel channel); + +#endif // _COMM_H_ diff --git a/arch/arm/mvp/commkm/comm_ev.h b/arch/arm/mvp/commkm/comm_ev.h new file mode 100644 index 0000000..bf629c3 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_ev.h @@ -0,0 +1,51 
@@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief various comm event signaling types and signatures + */ + +#ifndef _COMM_EV_H +#define _COMM_EV_H + +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_MODULE +#include "include_check.h" + +/** + * @name Identifiers of comm event signaling class methods + * @{ + */ +#define MVP_COMM_EV_SIGNATURE 0x4d4d4f43 ///< 'COMM' +#define MVP_COMM_EV_SIGNAL (MVP_OBJECT_CUSTOM_BASE + 0) ///< Signal host +#define MVP_COMM_EV_READ_EVENT_DATA (MVP_OBJECT_CUSTOM_BASE + 1) ///< read event data +#define MVP_COMM_EV_LAST (MVP_OBJECT_CUSTOM_BASE + 2) ///< Number of methods +/**@}*/ + +typedef struct CommEvent { + CommTranspID id; + CommTranspIOEvent event; +} CommEvent; + +#endif diff --git a/arch/arm/mvp/commkm/comm_ev_kernel.c b/arch/arm/mvp/commkm/comm_ev_kernel.c new file mode 100644 index 0000000..0701945 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_ev_kernel.c @@ -0,0 +1,136 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Comm event signaling, host kernel side. + */ + +#include + +#include "mvp_types.h" +#include "comm_os.h" +#include "comm_transp_impl.h" +#include "mksck_sockaddr.h" +#include "comm_ev.h" +#include "mvpkm_comm_ev.h" + +static struct socket *sock; + +/** + * @brief Raises a transport event on the provided event ID (address). This + * function is called from a comm_transp provider, such as comm_transp_mvp, + * when it needs to signal an event on a given channel. + * @param targetEvID opaque event channel ID (interpreted by implementation). + * @param transpID ID of transport to signal. + * @param eventType event type to raise. + * @return 0 if successful, -1 otherwise. 
+ */ + +int +CommTranspEvent_Raise(unsigned int targetEvID, // unused + CommTranspID *transpID, + CommTranspIOEvent eventType) +{ + struct sockaddr_mk guestAddr; + struct msghdr msg; + struct kvec vec[1]; + int rc; + CommEvent event; + + if (!transpID) { + return -1; + } + + guestAddr.mk_family = AF_MKSCK; + guestAddr.mk_addr.addr = Mksck_AddrInit(transpID->d32[0], MKSCK_PORT_COMM_EV); + + memset(&msg, 0, sizeof (struct msghdr)); + msg.msg_name = &guestAddr; + msg.msg_namelen = sizeof (guestAddr); + + event.id = *transpID; + event.event = eventType; + + vec[0].iov_base = &event; + vec[0].iov_len = sizeof (CommEvent); + + rc = kernel_sendmsg(sock, + &msg, + vec, + 1, + sizeof (CommEvent)); + rc = (rc < 0) ? -1 : 0; + return rc; +} + + +/** + * @brief Performs one-time, global initialization of event provider. + * @return 0 if successful, -1 otherwise. + */ +int +CommTranspEvent_Init(void) +{ + struct sockaddr_mk addr = { AF_MKSCK, { .addr = MKSCK_ADDR_UNDEF } }; + int rc; + + rc = sock_create_kern(AF_MKSCK, SOCK_DGRAM, 0, &sock); + if (rc < 0) { + goto out; + } + + rc = kernel_bind(sock, (struct sockaddr *) &addr, sizeof addr); + if (rc < 0) { + sock_release(sock); + sock = NULL; + goto out; + } + + Mvpkm_CommEvRegisterProcessCB(CommTranspEvent_Process); + +out: + if (rc) { + CommOS_Log(("%s: Failed to initialize transport event signaling\n", + __FUNCTION__)); + } else { + CommOS_Log(("%s: Transport event signaling initialization successful\n", + __FUNCTION__)); + } + return rc; +} + + +/** + * @brief Performs global clean-up of event provider. 
+ */ + +void +CommTranspEvent_Exit(void) +{ + Mvpkm_CommEvUnregisterProcessCB(); + if (sock) { + sock_release(sock); + sock = NULL; + } + + CommOS_Debug(("%s: done.\n", __FUNCTION__)); +} diff --git a/arch/arm/mvp/commkm/comm_os.h b/arch/arm/mvp/commkm/comm_os.h new file mode 100644 index 0000000..f98c8d4 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_os.h @@ -0,0 +1,150 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Cross-platform base type definitions and function declarations. + * Includes OS-specific base type definitions and function declarations. + */ + +#ifndef _COMM_OS_H_ +#define _COMM_OS_H_ + +/* For-ever timeout constant (in milliseconds). */ +#define COMM_OS_4EVER_TO ((unsigned long long)(~0UL >> 1)) + +/* Condition function prototype. Returns 1: true, 0: false, < 0: error code. */ +typedef int (*CommOSWaitConditionFunc)(void *arg1, void *arg2); + +/* Dispatch function prototype. Called by input (dispatch) kernel threads. */ +typedef unsigned int (*CommOSDispatchFunc)(void); + +/* Module initialization and exit callback functions. 
*/ +extern int (*commOSModInit)(void *args); +extern void (*commOSModExit)(void); + +/* Macro to assign Init and Exit callbacks. */ +#define COMM_OS_MOD_INIT(init, exit) \ + int (*commOSModInit)(void *args) = init; \ + void (*commOSModExit)(void) = exit + + +/* + * OS-specific implementations must provide the following: + * 1. Types: + * CommOSAtomic + * CommOSSpinlock + * CommOSMutex + * CommOSWaitQueue + * CommOSWork + * CommOSWorkFunc + * CommOSList + * CommOSModule + * struct kvec + * + * 2. Definition, initializers: + * CommOSSpinlock_Define() + * + * 3. Functions: + * void CommOS_Debug(const char *format, ...); + * void CommOS_Log(const char *format, ...); + * void CommOS_WriteAtomic(CommOSAtomic *atomic, int val); + * int CommOS_ReadAtomic(CommOSAtomic *atomic); + * int CommOS_AddReturnAtomic(CommOSAtomic *atomic, int val); + * int CommOS_SubReturnAtomic(CommOSAtomic *atomic, int val); + * void CommOS_SpinlockInit(CommOSSpinlock *lock); + * void CommOS_SpinLockBH(CommOSSpinlock *lock); + * int CommOS_SpinTrylockBH(CommOSSpinlock *lock); + * void CommOS_SpinUnlockBH(CommOSSpinlock *lock); + * void CommOS_SpinLock(CommOSSpinlock *lock); + * int CommOS_SpinTrylock(CommOSSpinlock *lock); + * void CommOS_SpinUnlock(CommOSSpinlock *lock); + * void CommOS_MutexInit(CommOSMutex *mutex); + * void CommOS_MutexLock(CommOSMutex *mutex); + * int CommOS_MutexLockUninterruptible(CommOSMutex *mutex); + * int CommOS_MutexTrylock(CommOSMutex *mutex); + * void CommOS_MutexUnlock(CommOSMutex *mutex); + * void CommOS_WaitQueueInit(CommOSWaitQueue *wq); + * CommOS_DoWait(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc cond, + * void *condArg1, + * void *condArg2, + * unsigned long long *timeoutMillis, + * int interruptible); + * int CommOS_Wait(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc func, + * void *funcArg1, + * void *funcArg2, + * unsigned long long *timeoutMillis); + * int CommOS_WaitUninterruptible(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc func, + * void 
*funcArg1, + * void *funcArg2, + * unsigned long long *timeoutMillis); + * void CommOS_WakeUp(CommOSWaitQueue *wq); + * void *CommOS_KmallocNoSleep(unsigned int size); + * void *CommOS_Kmalloc(unsigned int size); + * void CommOS_Kfree(void *arg); + * void CommOS_Yield(void); + * unsigned long long CommOS_GetCurrentMillis(void); + * void CommOS_ListInit(CommOSList *list); + * int CommOS_ListEmpty(CommOSList *list); + * void CommOS_ListAdd(CommOSList *list, CommOSList *listElem); + * void CommOS_ListAddTail(CommOSList *list, CommOSList *listElem); + * void int CommOS_ListDel(CommOSList *listElem); + * Macros: + * CommOS_ListForEach(*list, *item, itemListFieldName); + * CommOS_ListForEachSafe(*list, *item, *tmp, itemListFieldName); + * void CommOS_ListSplice(CommOSList *list, CommOSList *listToAdd); + * void CommOS_ListSpliceTail(CommOSList *list, CommOSList *listToAdd); + * CommOSModule CommOS_ModuleSelf(void); + * int CommOS_ModuleGet(CommOSModule module); + * void CommOS_ModulePut(CommOSModule module); + * void CommOS_MemBarrier(void); + * + * These cannot be defined here: a) non-pointer type definitions need size + * information, and b) functions may or may not be inlined, or macros may + * be used instead. + */ + + +#ifdef __linux__ +#include "comm_os_linux.h" +#else +#error "Unsupported OS" +#endif + +/* Functions to start and stop the dispatch and aio kernel threads. 
*/ +void CommOS_StopIO(void); +void CommOS_ScheduleDisp(void); +void CommOS_InitWork(CommOSWork *work, CommOSWorkFunc func); +int CommOS_ScheduleAIOWork(CommOSWork *work); +void CommOS_FlushAIOWork(CommOSWork *work); + +int +CommOS_StartIO(const char *dispatchTaskName, + CommOSDispatchFunc dispatchHandler, + unsigned int interval, + unsigned int maxCycles, + const char *aioTaskName); + + +#endif /* _COMM_OS_H_ */ diff --git a/arch/arm/mvp/commkm/comm_os_linux.c b/arch/arm/mvp/commkm/comm_os_linux.c new file mode 100644 index 0000000..74f99f5 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_os_linux.c @@ -0,0 +1,371 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Linux-specific functions/types. 
+ */ + +#include "comm_os.h" + +#define DISPATCH_MAX_CYCLES 8192 + +/* Type definitions */ + +typedef struct workqueue_struct CommOSWorkQueue; + + +/* Static data */ + +static volatile int running; +static int numCpus; +static CommOSWorkQueue *dispatchWQ; +static CommOSDispatchFunc dispatch; +static CommOSWork dispatchWorksNow[NR_CPUS]; +static CommOSWork dispatchWorks[NR_CPUS]; +static unsigned int dispatchInterval = 1; +static unsigned int dispatchMaxCycles = 2048; +static CommOSWorkQueue *aioWQ; + + +/** + * @brief Initializes a workqueue consisting of per-cpu kernel threads. + * @param name workqueue name + * @return workqueue handle if successful, NULL otherwise + */ + +static inline CommOSWorkQueue * +CreateWorkqueue(const char *name) +{ + return create_workqueue(name); +} + + +/** + * @brief Destroys a workqueue and stops its threads. + * @param[in,out] wq workqueue to destroy. + * @return workqueue handle is successful, NULL otherwise. + */ + +static inline void +DestroyWorkqueue(CommOSWorkQueue *wq) +{ + destroy_workqueue(wq); +} + + +/** + * @brief Force execution of a work item. + * @param[in,out] work work item to dequeue. + */ + +static inline void +FlushDelayedWork(CommOSWork *work) +{ + flush_delayed_work(work); +} + + +/** + * @brief Enqueue a work item to a workqueue for execution on a given cpu + * and after the specified interval. + * @param cpu cpu number. If negative, work item is enqueued on current cpu. + * @param[in,out] wq target work queue. + * @param[in,out] work work item to enqueue. + * @param jif delay interval. + * @return zero if successful, non-zero otherwise. + */ + +static inline int +QueueDelayedWorkOn(int cpu, + CommOSWorkQueue *wq, + CommOSWork *work, + unsigned long jif) +{ + if (cpu < 0) { + return !queue_delayed_work(wq, work, jif) ? -1 : 0; + } else { + return !queue_delayed_work_on(cpu, wq, work, jif) ? 
-1 : 0; + } +} + + +/** + * @brief Enqueues a work item to a workqueue for execution on the current cpu + * and after the specified interval. + * @param[in,out] wq target work queue. + * @param[in,out] work work item to enqueue. + * @param jif delay interval. + * @return zero if successful, non-zero otherwise. + */ + +static inline int +QueueDelayedWork(CommOSWorkQueue *wq, + CommOSWork *work, + unsigned long jif) +{ + return QueueDelayedWorkOn(-1, wq, work, jif); +} + + +/** + * @brief Cancels a queued delayed work item and synchronizes with its + * completion. + * @param[in,out] work work item to cancel + */ + +static inline void +WaitForDelayedWork(CommOSWork *work) +{ + cancel_delayed_work_sync(work); +} + + +/** + * @brief Discards work items queued to the specified workqueue. + * @param[in,out] wq work queue to flush. + */ + +static inline void +FlushWorkqueue(CommOSWorkQueue *wq) +{ + flush_workqueue(wq); +} + + +/** + * @brief Schedules dispatcher threads for immediate execution. + */ + +void +CommOS_ScheduleDisp(void) +{ + CommOSWork *work = &dispatchWorksNow[get_cpu()]; + + put_cpu(); + if (running) { + QueueDelayedWork(dispatchWQ, work, 0); + } +} + + +/** + * @brief Default delayed work callback function implementation. + * Calls the input function specified at initialization. + * @param[in,out] work work item. + */ + +static void +DispatchWrapper(CommOSWork *work) +{ + unsigned int misses; + + for (misses = 0; running && (misses < dispatchMaxCycles); ) { + /* We run for at most dispatchMaxCycles worth of channel no-ops. */ + + if (!dispatch()) { + /* No useful work was done, on any of the channels. */ + + misses++; + if ((misses % 32) == 0) { + CommOS_Yield(); + } + } else { + misses = 0; + } + } + + if (running && + (work >= &dispatchWorks[0]) && + (work <= &dispatchWorks[NR_CPUS - 1])) { + /* + * If still running _and_ this was a regular, time-based run, then + * re-arm the timer. 
+ */ + + QueueDelayedWork(dispatchWQ, work, dispatchInterval); + } +} + + +/** + * @brief Initializes work item with specified callback function. + * @param[in,out] work work queue to initialize. + * @param func work item to initialize the queue with. + */ + +void +CommOS_InitWork(CommOSWork *work, + CommOSWorkFunc func) +{ + INIT_DELAYED_WORK(work, (work_func_t)func); +} + + +/** + * @brief Flush execution of a work item + * @param{in,out] work work item to dequeue + */ +void +CommOS_FlushAIOWork(CommOSWork *work) +{ + if (aioWQ && work) { + FlushDelayedWork(work); + } +} + + +/** + * @brief Queue a work item to the AIO workqueue. + * @param[in,out] work work item to enqueue. + * @return zero if work enqueued, non-zero otherwise. + */ + +int +CommOS_ScheduleAIOWork(CommOSWork *work) +{ + if (running && aioWQ && work) { + return QueueDelayedWork(aioWQ, work, 0); + } + return -1; +} + + +/** + * @brief Initializes the base IO system. + * @param dispatchTaskName dispatch thread(s) name. + * @param dispatchFunc dispatch function. + * @param intervalMillis periodic interval in milliseconds to call dispatch. + * The floor is 1 jiffy, regardless of how small intervalMillis is + * @param maxCycles number of cycles to do adaptive polling before scheduling. + * The maximum number of cycles is DISPATCH_MAX_CYCLES. + * @param aioTaskName AIO thread(s) name. If NULL, AIO threads aren't started. + * @return zero is successful, -1 otherwise. + * @sideeffects Dispatch threads, and if applicable, AIO threads are started. + */ + +int +CommOS_StartIO(const char *dispatchTaskName, // IN + CommOSDispatchFunc dispatchFunc, // IN + unsigned int intervalMillis, // IN + unsigned int maxCycles, // IN + const char *aioTaskName) // IN +{ + int rc; + int cpu; + + if (running) { + CommOS_Debug(("%s: I/O tasks already running.\n", __FUNCTION__)); + return 0; + } + + /* + * OK, let's test the handler against NULL. 
Though, the whole concept + * of checking for NULL pointers, outside cases where NULL is meaningful + * to the implementation, is relatively useless: garbage, random pointers + * rarely happen to be all-zeros. + */ + + if (!dispatchFunc) { + CommOS_Log(("%s: a NULL Dispatch handler was passed.\n", __FUNCTION__)); + return -1; + } + dispatch = dispatchFunc; + + if (intervalMillis == 0) { + intervalMillis = 4; + } + if ((dispatchInterval = msecs_to_jiffies(intervalMillis)) < 1) { + dispatchInterval = 1; + } + if (maxCycles > DISPATCH_MAX_CYCLES) { + dispatchMaxCycles = DISPATCH_MAX_CYCLES; + } else if (maxCycles > 0) { + dispatchMaxCycles = maxCycles; + } + CommOS_Debug(("%s: Interval millis %u (jif:%u).\n", __FUNCTION__, + intervalMillis, dispatchInterval)); + CommOS_Debug(("%s: Max cycles %u.\n", __FUNCTION__, dispatchMaxCycles)); + + numCpus = num_present_cpus(); + dispatchWQ = CreateWorkqueue(dispatchTaskName); + if (!dispatchWQ) { + CommOS_Log(("%s: Couldn't create %s task(s).\n", __FUNCTION__, + dispatchTaskName)); + return -1; + } + + if (aioTaskName) { + aioWQ = CreateWorkqueue(aioTaskName); + if (!aioWQ) { + CommOS_Log(("%s: Couldn't create %s task(s).\n", __FUNCTION__, + aioTaskName)); + DestroyWorkqueue(dispatchWQ); + return -1; + } + } else { + aioWQ = NULL; + } + + running = 1; + for (cpu = 0; cpu < numCpus; cpu++) { + CommOS_InitWork(&dispatchWorksNow[cpu], DispatchWrapper); + CommOS_InitWork(&dispatchWorks[cpu], DispatchWrapper); + rc = QueueDelayedWorkOn(cpu, dispatchWQ, + &dispatchWorks[cpu], + dispatchInterval); + if (rc != 0) { + CommOS_StopIO(); + return -1; + } + } + CommOS_Log(("%s: Created I/O task(s) successfully.\n", __FUNCTION__)); + return 0; +} + + +/** + * @brief Stops the base IO system. + * @sideeffects Dispatch threads, and if applicable, AIO threads are stopped. 
+ */ + +void +CommOS_StopIO(void) +{ + int cpu; + + if (running) { + running = 0; + if (aioWQ) { + FlushWorkqueue(aioWQ); + DestroyWorkqueue(aioWQ); + aioWQ = NULL; + } + FlushWorkqueue(dispatchWQ); + for (cpu = 0; cpu < numCpus; cpu++) { + WaitForDelayedWork(&dispatchWorksNow[cpu]); + WaitForDelayedWork(&dispatchWorks[cpu]); + } + DestroyWorkqueue(dispatchWQ); + dispatchWQ = NULL; + CommOS_Log(("%s: I/O tasks stopped.\n", __FUNCTION__)); + } +} diff --git a/arch/arm/mvp/commkm/comm_os_linux.h b/arch/arm/mvp/commkm/comm_os_linux.h new file mode 100644 index 0000000..f92c8bd --- /dev/null +++ b/arch/arm/mvp/commkm/comm_os_linux.h @@ -0,0 +1,699 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Contains linux-specific type definitions and function declarations + */ + +#ifndef _COMM_OS_LINUX_H_ +#define _COMM_OS_LINUX_H_ + +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) +#error "Kernel versions lower than 2.6.20 are not supported" +#endif + +#include +#include +#include +#include +#include +#include + + +/* + * Type definitions. 
+ */
+
+typedef atomic_t CommOSAtomic;
+typedef spinlock_t CommOSSpinlock;
+typedef struct mutex CommOSMutex;
+typedef wait_queue_head_t CommOSWaitQueue;
+typedef struct delayed_work CommOSWork;
+typedef void (*CommOSWorkFunc)(CommOSWork *work);
+typedef struct list_head CommOSList;
+typedef struct module *CommOSModule;
+
+
+/*
+ * Initializers.
+ */
+
+#define CommOSSpinlock_Define DEFINE_SPINLOCK
+
+
+#define COMM_OS_DOLOG(...) printk(KERN_INFO __VA_ARGS__)
+
+
+/**
+ * @brief Logs given arguments in debug builds.
+ */
+
+#if defined(COMM_OS_DEBUG)
+   #define CommOS_Debug(args) COMM_OS_DOLOG args
+#else
+   #define CommOS_Debug(args)
+#endif
+
+
+/**
+ * @brief Logs given arguments.
+ */
+
+#define CommOS_Log(args) COMM_OS_DOLOG args
+
+
+/**
+ * @brief Logs function name and location.
+ */
+
+#if defined(COMM_OS_TRACE)
+#define TRACE(ptr) \
+   do { \
+      CommOS_Debug(("%p:%s: at [%s:%d] with arg ptr [0x%p].\n", current, \
+                    __FUNCTION__, __FILE__, __LINE__, (ptr))); \
+   } while (0)
+#else
+#define TRACE(ptr)
+#endif
+
+
+/**
+ * @brief Write atomic variable
+ * @param[in,out] atomic variable to write
+ * @param val new value
+ */
+
+static inline void
+CommOS_WriteAtomic(CommOSAtomic *atomic,
+                   int val)
+{
+   atomic_set(atomic, val);
+}
+
+
+/**
+ * @brief Reads atomic variable
+ * @param atomic variable to read
+ * @return value
+ */
+
+static inline int
+CommOS_ReadAtomic(CommOSAtomic *atomic)
+{
+   return atomic_read(atomic);
+}
+
+
+/**
+ * @brief Atomically add value to atomic variable, return new value.
+ * @param[in,out] atomic variable
+ * @param val value to add
+ * @return new value
+ */
+
+static inline int
+CommOS_AddReturnAtomic(CommOSAtomic *atomic,
+                       int val)
+{
+   return atomic_add_return(val, atomic);
+}
+
+
+/**
+ * @brief Atomically subtract value from atomic variable, return new value.
+ * @param[in,out] atomic variable
+ * @param val value to subtract
+ * @return new value
+ */
+
+static inline int
+CommOS_SubReturnAtomic(CommOSAtomic *atomic,
+                       int val)
+{
+   return atomic_sub_return(val, atomic);
+}
+
+
+/**
+ * @brief Initializes a given lock.
+ * @param[in,out] lock lock to initialize
+ */
+
+static inline void
+CommOS_SpinlockInit(CommOSSpinlock *lock)
+{
+   spin_lock_init(lock);
+}
+
+
+/**
+ * @brief Locks given lock and disables bottom half processing.
+ * @param[in,out] lock lock to lock
+ */
+
+static inline void
+CommOS_SpinLockBH(CommOSSpinlock *lock)
+{
+   spin_lock_bh(lock);
+}
+
+
+/**
+ * @brief Attempts to lock the given lock and disable BH processing.
+ * Note: the return convention is inverted relative to spin_trylock_bh():
+ * zero means the lock WAS acquired.
+ * @param[in,out] lock lock to lock
+ * @return zero if successful, non-zero otherwise
+ */
+
+static inline int
+CommOS_SpinTrylockBH(CommOSSpinlock *lock)
+{
+   return !spin_trylock_bh(lock);
+}
+
+
+/**
+ * @brief Unlocks given lock and re-enables BH processing.
+ * @param[in,out] lock lock to unlock
+ */
+
+static inline void
+CommOS_SpinUnlockBH(CommOSSpinlock *lock)
+{
+   spin_unlock_bh(lock);
+}
+
+
+/**
+ * @brief Locks the given lock.
+ * @param[in,out] lock lock to lock
+ */
+
+static inline void
+CommOS_SpinLock(CommOSSpinlock *lock)
+{
+   spin_lock(lock);
+}
+
+
+/**
+ * @brief Attempts to lock the given lock.
+ * @param[in,out] lock lock to try-lock
+ * @return zero if successful, non-zero otherwise
+ */
+
+static inline int
+CommOS_SpinTrylock(CommOSSpinlock *lock)
+{
+   return !spin_trylock(lock);
+}
+
+
+/**
+ * @brief Unlocks given lock.
+ * @param[in,out] lock lock to unlock
+ */
+
+static inline void
+CommOS_SpinUnlock(CommOSSpinlock *lock)
+{
+   spin_unlock(lock);
+}
+
+
+/**
+ * @brief Initializes given mutex.
+ * @param[in,out] mutex mutex to initialize
+ */
+
+static inline void
+CommOS_MutexInit(CommOSMutex *mutex)
+{
+   mutex_init(mutex);
+}
+
+
+/**
+ * @brief Acquires mutex.
+ * @param[in,out] mutex mutex to lock
+ * @return zero if successful, non-zero otherwise (interrupted)
+ */
+
+static inline int
+CommOS_MutexLock(CommOSMutex *mutex)
+{
+   return mutex_lock_interruptible(mutex);
+}
+
+
+/**
+ * @brief Acquires mutex in uninterruptible mode.
+ * @param[in,out] mutex mutex to lock
+ */
+
+static inline void
+CommOS_MutexLockUninterruptible(CommOSMutex *mutex)
+{
+   mutex_lock(mutex);
+}
+
+
+/**
+ * @brief Attempts to acquire given mutex.
+ * @param[in,out] mutex mutex to try-lock
+ * @return zero if successful, non-zero otherwise
+ */
+
+static inline int
+CommOS_MutexTrylock(CommOSMutex *mutex)
+{
+   return !mutex_trylock(mutex);
+}
+
+
+/**
+ * @brief Releases a given mutex.
+ * @param[in,out] mutex mutex to unlock
+ */
+
+static inline void
+CommOS_MutexUnlock(CommOSMutex *mutex)
+{
+   mutex_unlock(mutex);
+}
+
+
+/**
+ * @brief Initializes a wait queue.
+ * @param[in,out] wq workqueue to initialize
+ */
+
+static inline void
+CommOS_WaitQueueInit(CommOSWaitQueue *wq)
+{
+   init_waitqueue_head(wq);
+}
+
+
+/**
+ * @brief Puts the caller on a wait queue until either of the following occurs:
+ *        - the condition function (predicate) evaluates to TRUE
+ *        - the specified timeout interval elapsed
+ *        - a signal is pending
+ * @param[in,out] wq wait queue to put item on
+ * @param cond predicate to test
+ * @param condArg1 argument 1 for cond
+ * @param condArg2 argument 2 for cond
+ * @param[in,out] timeoutMillis timeout interval in milliseconds
+ * @param interruptible enable/disable signal pending check
+ * @return 1 if condition was met
+ *         0 if the timeout interval elapsed
+ *         <0, if a signal is pending or other error set by condition
+ * @sideeffect timeoutMillis is updated to time remaining
+ */
+
+static inline int
+CommOS_DoWait(CommOSWaitQueue *wq,
+              CommOSWaitConditionFunc cond,
+              void *condArg1,
+              void *condArg2,
+              unsigned long long *timeoutMillis,
+              int interruptible)
+{
+   int rc;
+   DEFINE_WAIT(wait);
+   long timeout;
+#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
+   long tmpTimeout;
+   long retTimeout;
+   const unsigned int interval = 50;
+#endif
+
+   if (!timeoutMillis) {
+      return -1;
+   }
+   /* Fast path: predicate already satisfied, no need to sleep. */
+   if ((rc = cond(condArg1, condArg2)) != 0) {
+      return rc;
+   }
+
+#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
+   /* Poll in short (50 ms) slices instead of one long sleep. */
+   timeout = msecs_to_jiffies(interval < *timeoutMillis ?
+                              interval : (unsigned int)*timeoutMillis);
+   retTimeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));
+
+   /*
+    * NOTE(review): retTimeout is clamped to 0 below, so the loop condition
+    * (retTimeout >= 0) only exits via the break statements -- looks like an
+    * intentional busy-ish wait for the workaround, but confirm expiration
+    * actually terminates the loop upstream.
+    */
+   for (; retTimeout >= 0; ) {
+      prepare_to_wait(wq, &wait,
+                      (interruptible?TASK_INTERRUPTIBLE:TASK_UNINTERRUPTIBLE));
+      if ((rc = cond(condArg1, condArg2))) {
+         break;
+      }
+      if (interruptible && signal_pending(current)) {
+         rc = -EINTR;
+         break;
+      }
+      if ((tmpTimeout = schedule_timeout(timeout))) {
+         retTimeout -= (timeout - tmpTimeout);
+      } else {
+         retTimeout -= timeout;
+      }
+      if (retTimeout < 0) {
+         retTimeout = 0;
+      }
+   }
+   finish_wait(wq, &wait);
+   if (rc == 0) {
+      /* Re-check once more; avoid reporting expiry if cond just became true. */
+      rc = cond(condArg1, condArg2);
+      if (rc && (retTimeout == 0)) {
+         retTimeout = 1;
+      }
+   }
+   *timeoutMillis = (unsigned long long)jiffies_to_msecs(retTimeout);
+#else // !defined(COMM_OS_LINUX_WAIT_WORKAROUND)
+   timeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));
+
+   for (;;) {
+      prepare_to_wait(wq, &wait,
+                      (interruptible?TASK_INTERRUPTIBLE:TASK_UNINTERRUPTIBLE));
+      if ((rc = cond(condArg1, condArg2)) != 0) {
+         break;
+      }
+      if (interruptible && signal_pending(current)) {
+         rc = -EINTR;
+         break;
+      }
+      if ((timeout = schedule_timeout(timeout)) == 0) {
+         rc = 0;
+         break;
+      }
+   }
+   finish_wait(wq, &wait);
+   if (rc == 0) {
+      /* Re-check once more; avoid reporting expiry if cond just became true. */
+      rc = cond(condArg1, condArg2);
+      if (rc && (timeout == 0)) {
+         timeout = 1;
+      }
+   }
+   *timeoutMillis = (unsigned long long)jiffies_to_msecs(timeout);
+#endif
+
+   return rc;
+}
+
+
+/**
+ * @brief Puts the caller on a wait queue until either of the following occurs:
+ *        - the condition function (predicate) evaluates to TRUE
+ *        - the specified timeout interval elapsed
+ *        - a signal is pending
+ * @param[in,out] wq wait queue to put item on
+ * @param cond predicate to test
+ * @param condArg1 argument 1 for cond
+ * @param condArg2 argument 2 for cond
+ * @param[in,out] timeoutMillis timeout interval in milliseconds
+ * @return 1 if condition was met
+ *         0 if the timeout interval elapsed
+ *         <0, if a signal is pending or other error set by condition
+ * @sideeffect timeoutMillis is updated to time remaining
+ */
+
+static inline int
+CommOS_Wait(CommOSWaitQueue *wq,
+            CommOSWaitConditionFunc cond,
+            void *condArg1,
+            void *condArg2,
+            unsigned long long *timeoutMillis)
+{
+   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 1);
+}
+
+
+/**
+ * @brief Puts the caller on a wait queue until either of the following occurs:
+ *        - the condition function (predicate) evaluates to TRUE
+ *        - the specified timeout interval elapsed
+ * @param[in,out] wq wait queue to put item on
+ * @param cond predicate to test
+ * @param condArg1 argument 1 for cond
+ * @param condArg2 argument 2 for cond
+ * @param[in,out] timeoutMillis timeout interval in milliseconds
+ * @return 1 if condition was met
+ *         0 if the timeout interval elapsed
+ *         <0, error set by condition
+ * @sideeffect timeoutMillis is updated to time remaining
+ */
+
+static inline int
+CommOS_WaitUninterruptible(CommOSWaitQueue *wq,
+                           CommOSWaitConditionFunc cond,
+                           void *condArg1,
+                           void *condArg2,
+                           unsigned long long *timeoutMillis)
+{
+   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 0);
+}
+
+
+/**
+ * @brief Wakes up task(s) waiting on the given wait queue.
+ * @param[in,out] wq wait queue.
+ */
+
+static inline void
+CommOS_WakeUp(CommOSWaitQueue *wq)
+{
+   wake_up(wq);
+}
+
+
+/**
+ * @brief Allocates kernel memory of specified size; does not sleep.
+ * @param size size to allocate.
+ * @return Address of allocated memory or NULL if the allocation fails.
+ */
+
+static inline void *
+CommOS_KmallocNoSleep(unsigned int size)
+{
+   return kmalloc(size, GFP_ATOMIC);
+}
+
+
+/**
+ * @brief Allocates kernel memory of specified size; may sleep.
+ * @param size size to allocate.
+ * @return Address of allocated memory or NULL if the allocation fails.
+ */
+
+static inline void *
+CommOS_Kmalloc(unsigned int size)
+{
+   return kmalloc(size, GFP_KERNEL);
+}
+
+
+/**
+ * @brief Frees previously allocated kernel memory.
+ * @param obj object to free.
+ */
+
+static inline void
+CommOS_Kfree(void *obj)
+{
+   if (obj) {
+      kfree(obj);
+   }
+}
+
+
+/**
+ * @brief Yields the current cpu to other runnable tasks.
+ */
+
+static inline void
+CommOS_Yield(void)
+{
+   cond_resched();
+}
+
+
+/**
+ * @brief Gets the current time in milliseconds.
+ * @return Current time in milliseconds, with precision of at most one tick.
+ */
+
+static inline unsigned long long
+CommOS_GetCurrentMillis(void)
+{
+   return (unsigned long long)jiffies_to_msecs(jiffies);
+}
+
+
+/**
+ * @brief Initializes given list.
+ * @param list list to initialize.
+ */
+
+static inline void
+CommOS_ListInit(CommOSList *list)
+{
+   INIT_LIST_HEAD(list);
+}
+
+
+/**
+ * @brief Tests if list is empty.
+ * @param list list to test.
+ * @return non-zero if empty, zero otherwise.
+ */
+
+#define CommOS_ListEmpty(list) list_empty((list))
+
+
+/**
+ * @brief Adds given element to beginning of list.
+ * @param list list to add to.
+ * @param elem element to add.
+ */
+
+#define CommOS_ListAdd(list, elem) list_add((elem), (list))
+
+
+/**
+ * @brief Adds given element to end of list.
+ * @param list list to add to.
+ * @param elem element to add.
+ */
+
+#define CommOS_ListAddTail(list, elem) list_add_tail((elem), (list))
+
+
+/**
+ * @brief Deletes given element from its list.
+ * Re-initializes the element so a subsequent delete is harmless.
+ * @param elem element to delete.
+ */
+
+#define CommOS_ListDel(elem) \
+   do { \
+      list_del((elem)); \
+      INIT_LIST_HEAD((elem)); \
+   } while (0)
+
+
+/**
+ * @brief Iterates over a list.
+ * @param list list to iterate over.
+ * @param[out] item stores next element.
+ * @param itemListFieldName name in the item structure storing the list head.
+ */
+
+#define CommOS_ListForEach(list, item, itemListFieldName) \
+   list_for_each_entry((item), (list), itemListFieldName)
+
+
+/**
+ * @brief Iterates safely over a list.
+ * @param list list to iterate over.
+ * @param[out] item stores next element. May be deleted in the loop.
+ * @param[out] tmpItem saves iteration element.
+ * @param itemListFieldName name in the item structure storing the list head.
+ */
+
+#define CommOS_ListForEachSafe(list, item, tmpItem, itemListFieldName) \
+   list_for_each_entry_safe((item), (tmpItem), (list), itemListFieldName)
+
+
+/**
+ * @brief Combines two lists, adds second list to beginning of first one.
+ * @param list list to add to.
+ * @param list2 list to add.
+ */
+
+#define CommOS_ListSplice(list, list2) list_splice((list2), (list))
+
+
+/**
+ * @brief Combines two lists, adds second list to end of first one.
+ * @param list list to add to.
+ * @param list2 list to add.
+ */
+
+#define CommOS_ListSpliceTail(list, list2) list_splice_tail((list2), (list))
+
+
+/**
+ * @brief Gets current module handle.
+ * @return module handle.
+ */
+
+static inline CommOSModule
+CommOS_ModuleSelf(void)
+{
+   return THIS_MODULE;
+}
+
+
+/**
+ * @brief Retains module.
+ * @param[in,out] module to retain.
+ * @return zero if successful, non-zero otherwise.
+ */
+
+static inline int
+CommOS_ModuleGet(CommOSModule module)
+{
+   int rc = 0;
+
+   if (!module) {
+      goto out;
+   }
+   if (!try_module_get(module)) {
+      rc = -1;
+   }
+
+out:
+   return rc;
+}
+
+
+/**
+ * @brief Releases module.
+ * @param[in,out] module to release.
+ */
+
+static inline void
+CommOS_ModulePut(CommOSModule module)
+{
+   if (module) {
+      module_put(module);
+   }
+}
+
+
+/**
+ * @brief Inserts r/w memory barrier.
+ */ + +#define CommOS_MemBarrier smp_mb + +#endif /* _COMM_OS_LINUX_H_ */ diff --git a/arch/arm/mvp/commkm/comm_os_mod_linux.c b/arch/arm/mvp/commkm/comm_os_mod_linux.c new file mode 100644 index 0000000..8470de6 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_os_mod_linux.c @@ -0,0 +1,105 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Linux-specific module loading, unloading functions. + */ + +#include "comm_os.h" +#include "comm_os_mod_ver.h" + +#include + + +/* Module parameters -- passed as one 'name=value'-list string. */ + +static char modParams[256]; +module_param_string(COMM_OS_MOD_SHORT_NAME, modParams, sizeof modParams, 0644); + + +/** + * @brief Module initialization entry point. Calls the commOSModInit + * function pointer to perform upper layer initialization. + * @return zero if successful, non-zero otherwise. 
+ */
+
+static int __init
+ModInit(void)
+{
+   int rc;
+
+   if (!commOSModInit) {
+      CommOS_Log(("%s: Can't find \'init\' function for module \'" \
+                  COMM_OS_MOD_SHORT_NAME_STRING "\'.\n", __FUNCTION__));
+      return -1;
+   }
+
+   CommOS_Debug(("%s: Module parameters: [%s].\n", __FUNCTION__, modParams));
+
+   rc = (*commOSModInit)(modParams);
+   if (rc == 0) {
+      CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
+                  "\' has been successfully initialized.\n", __FUNCTION__));
+   } else {
+      CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
+                  "\' could not be initialized [%d].\n", __FUNCTION__, rc));
+   }
+
+   /* Kernel init functions must report failure as a negative errno-style value. */
+   return rc > 0 ? -rc : rc;
+}
+
+
+/**
+ * @brief Module exit function. Calls the commOSModExit function pointer
+ *        to perform upper layer cleanup.
+ */
+
+static void __exit
+ModExit(void)
+{
+   if (!commOSModExit) {
+      CommOS_Log(("%s: Can't find \'fini\' function for module \'" \
+                  COMM_OS_MOD_SHORT_NAME_STRING "\'.\n", __FUNCTION__));
+      return;
+   }
+
+   (*commOSModExit)();
+   CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
+               "\' has been stopped.\n", __FUNCTION__));
+}
+
+
+module_init(ModInit);
+module_exit(ModExit);
+
+/* Module information. */
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION(COMM_OS_MOD_NAME_STRING);
+MODULE_VERSION(COMM_OS_MOD_VERSION_STRING);
+MODULE_LICENSE("GPL v2");
+/*
+ * Starting with SLE10sp2, Novell requires that IHVs sign a support agreement
+ * with them and mark their kernel modules as externally supported via a
+ * change to the module header. If this isn't done, the module will not load
+ * by default (i.e., neither mkinitrd nor modprobe will accept it).
+ */ +MODULE_INFO(supported, "external"); diff --git a/arch/arm/mvp/commkm/comm_os_mod_ver.h b/arch/arm/mvp/commkm/comm_os_mod_ver.h new file mode 100644 index 0000000..059854c --- /dev/null +++ b/arch/arm/mvp/commkm/comm_os_mod_ver.h @@ -0,0 +1,38 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Version definitions for the Comm module. + */ + +#ifndef _COMM_OS_MOD_VER_H_ +#define _COMM_OS_MOD_VER_H_ + +#define COMM_OS_MOD_NAME_STRING "VMware communication module" +#define COMM_OS_MOD_SHORT_NAME comm +#define COMM_OS_MOD_SHORT_NAME_STRING "comm" + +#define COMM_OS_MOD_VERSION 1.0.0.0 +#define COMM_OS_MOD_VERSION_COMMAS 1,0,0,0 +#define COMM_OS_MOD_VERSION_STRING "1.0.0.0" + +#endif /* _COM_OS_MOD_VER_H_ */ diff --git a/arch/arm/mvp/commkm/comm_svc.c b/arch/arm/mvp/commkm/comm_svc.c new file mode 100644 index 0000000..18f62bd --- /dev/null +++ b/arch/arm/mvp/commkm/comm_svc.c @@ -0,0 +1,421 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; see the file COPYING. If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#line 5
+
+/**
+ * @file
+ *
+ * @brief Communication functions based on transport functionality.
+ */
+
+#include "comm_os.h"
+#include "comm_os_mod_ver.h"
+#include "comm_svc.h"
+
+
+/*
+ * Initialization of module entry and exit callbacks expected by module
+ * loading/unloading functions in comm_os.
+ */
+
+static int Init(void *args);
+static void Exit(void);
+
+COMM_OS_MOD_INIT(Init, Exit);
+
+static volatile int running; // Initialized and running.
+
+
+/**
+ * @brief Allocates and initializes comm global state.
+ *        Starts input dispatch and aio threads.
+ * @param argsIn arguments
+ * @return zero if successful, non-zero otherwise.
+ */
+
+static int
+Init(void *argsIn)
+{
+   int rc = -1;
+   unsigned int maxChannels = 8;
+   /*
+    * Infinite timeout, 1 polling cycle
+    * see kernel/time.c: msecs_to_jiffies()
+    */
+   unsigned int pollingMillis = (unsigned int)-1;
+   unsigned int pollingCycles = 1;
+   const char *args = argsIn;
+
+   if (args && *args) {
+      /* coverity[secure_coding] */
+      sscanf(args,
+             "max_channels:%u,poll_millis:%u,poll_cycles:%u",
+             &maxChannels, &pollingMillis, &pollingCycles);
+      CommOS_Debug(("%s: arguments [%s].\n", __FUNCTION__, args));
+   }
+
+   rc = Comm_Init(maxChannels);
+   if (rc) {
+      goto out;
+   }
+
+   rc = CommOS_StartIO(COMM_OS_MOD_SHORT_NAME_STRING "-disp",
+                       Comm_DispatchAll, pollingMillis, pollingCycles,
+                       COMM_OS_MOD_SHORT_NAME_STRING "-aio");
+   if (rc) {
+      unsigned long long timeout = 0;
+
+      Comm_Finish(&timeout); /* Nothing started, guaranteed to succeed. */
+      goto out;
+   }
+   running = 1;
+   rc = 0;
+
+out:
+   return rc;
+}
+
+
+/**
+ * @brief Attempts to close all channels.
+ *        Retries up to 10 times, waiting 2 seconds per attempt.
+ * @return zero if successful, non-zero otherwise.
+ */
+
+static int
+Halt(void)
+{
+   unsigned int maxTries = 10;
+   int rc = -1;
+
+   if (!running) {
+      rc = 0;
+      goto out;
+   }
+
+   for ( ; maxTries; maxTries--) {
+      unsigned long long timeout = 2000ULL;
+
+      CommOS_Debug(("%s: Attempting to halt...\n", __FUNCTION__));
+      if (!Comm_Finish(&timeout)) {
+         running = 0;
+         rc = 0;
+         break;
+      }
+   }
+
+out:
+   return rc;
+}
+
+
+/**
+ * @brief Stops the comm_rt module.
+ *        If Halt() call successful, stops input dispatch and aio threads.
+ */
+
+static void
+Exit(void)
+{
+   if (!Halt()) {
+      CommOS_StopIO();
+   }
+}
+
+
+/**
+ * @brief Registers an implementation block used when attaching to channels
+ *        in response to transport attach events.
+ * @param impl implementation block.
+ * @return 0 if successful, non-zero otherwise.
+ */
+
+int
+CommSvc_RegisterImpl(const CommImpl *impl)
+{
+   return Comm_RegisterImpl(impl);
+}
+
+
+/**
+ * @brief Unregisters an implementation block used when attaching to channels
+ *        in response to transport attach events.
+ * @param impl implementation block.
+ */
+
+void
+CommSvc_UnregisterImpl(const CommImpl *impl)
+{
+   Comm_UnregisterImpl(impl);
+}
+
+
+/**
+ * @brief Finds a free entry and initializes it with the information provided.
+ *        May be called from BH. It doesn't call potentially blocking functions.
+ * @param transpArgs transport initialization arguments.
+ * @param impl implementation block.
+ * @param inBH non-zero if called in bottom half.
+ * @param[out] newChannel newly allocated channel.
+ * @return zero if successful, non-zero otherwise.
+ * @sideeffects Initializes the communications channel with given parameters
+ */
+
+int
+CommSvc_Alloc(const CommTranspInitArgs *transpArgs,
+              const CommImpl *impl,
+              int inBH,
+              CommChannel *newChannel)
+{
+   return Comm_Alloc(transpArgs, impl, inBH, newChannel);
+}
+
+
+/**
+ * @brief Zombifies a channel. May fail if channel isn't active.
+ * @param channel channel to zombify.
+ * @param inBH non-zero if called in bottom half.
+ * @return zero if channel zombified, non-zero otherwise.
+ */
+
+int
+CommSvc_Zombify(CommChannel channel,
+                int inBH)
+{
+   return Comm_Zombify(channel, inBH);
+}
+
+
+/**
+ * @brief Reports whether a channel is active.
+ * @param channel channel to report on.
+ * @return non-zero if channel active, zero otherwise.
+ */
+
+int
+CommSvc_IsActive(CommChannel channel)
+{
+   return Comm_IsActive(channel);
+}
+
+
+/**
+ * @brief Retrieves a channel's transport initialization arguments.
+ *        It doesn't lock, the caller must ensure the channel may be accessed.
+ * @param channel CommChannel structure to get initialization arguments from.
+ * @return initialization arguments used to allocate/attach to channel.
+ */
+
+CommTranspInitArgs
+CommSvc_GetTranspInitArgs(CommChannel channel)
+{
+   return Comm_GetTranspInitArgs(channel);
+}
+
+
+/**
+ * @brief Retrieves upper layer state (pointer). It doesn't lock, the caller
+ *        must ensure the channel may be accessed.
+ * @param channel CommChannel structure to get state from.
+ * @return pointer to upper layer state.
+ */
+
+void *
+CommSvc_GetState(CommChannel channel)
+{
+   return Comm_GetState(channel);
+}
+
+
+/**
+ * @brief Writes a fully formatted packet (containing payload data, if
+ *        applicable) to the specified channel.
+ *
+ *        The operation may block until enough write space is available, but no
+ *        more than the specified interval. The operation either writes the full
+ *        amount of bytes, or it fails. Warning: callers must _not_ use the
+ *        _Lock/_Unlock functions to bracket calls to this function.
+ * @param[in,out] channel channel to write to.
+ * @param packet packet to write.
+ * @param[in,out] timeoutMillis interval in milliseconds to wait.
+ * @return number of bytes written, 0 if it times out, -1 error.
+ * @sideeffects Data may be written to the channel.
+ */
+
+int
+CommSvc_Write(CommChannel channel,
+              const CommPacket *packet,
+              unsigned long long *timeoutMillis)
+{
+   return Comm_Write(channel, packet, timeoutMillis);
+}
+
+
+/**
+ * @brief Writes a packet and associated payload data to the specified channel.
+ *
+ *        The operation may block until enough write space is available, but not
+ *        more than the specified interval. The operation either writes the full
+ *        amount of bytes, or it fails. Users may call this function successively
+ *        to write several packets from large {io|k}vecs. If that's the case, the
+ *        packet header needs to be updated in between calls, for the different
+ *        (total) lengths. Warning: callers must _not_ use the _Lock/_Unlock
+ *        functions to bracket calls to this function.
+ * @param[in,out] channel the specified channel
+ * @param packet packet to write
+ * @param[in,out] vec kvec to write from
+ * @param[in,out] vecLen length of kvec
+ * @param[in,out] timeoutMillis interval in milliseconds to wait
+ * @param[in,out] iovOffset must be set to 0 before first call (internal cookie)
+ * @return number of bytes written, 0 if it timed out, -1 error
+ * @sideeffects data may be written to the channel
+ */
+
+int
+CommSvc_WriteVec(CommChannel channel,
+                 const CommPacket *packet,
+                 struct kvec **vec,
+                 unsigned int *vecLen,
+                 unsigned long long *timeoutMillis,
+                 unsigned int *iovOffset)
+{
+   return Comm_WriteVec(channel, packet, vec, vecLen, timeoutMillis, iovOffset);
+}
+
+
+/**
+ * @brief Releases channel ref count. This function is exported for the upper
+ *        layer's 'activateNtf' callback which may be run asynchronously. The
+ *        callback is protected from concurrent channel releases until it calls
+ *        this function.
+ * @param[in,out] channel CommChannel structure to release.
+ */
+
+void
+CommSvc_Put(CommChannel channel)
+{
+   Comm_Put(channel);
+}
+
+
+/**
+ * @brief Uses the read lock. This function is exported for the upper layer
+ *        such that it can order acquisition of a different lock (socket) with
+ *        the release of the dispatch lock.
+ * @param[in,out] channel CommChannel structure to unlock.
+ */
+
+void
+CommSvc_DispatchUnlock(CommChannel channel)
+{
+   Comm_DispatchUnlock(channel);
+}
+
+
+/**
+ * @brief Lock the channel.
+ *
+ *        Uses the writer lock. This function is exported for the upper layer
+ *        to ensure that channel isn't closed while updating the layer state.
+ *        It also guarantees that if the lock is taken, the entry is either ACTIVE
+ *        or ZOMBIE. Operations using this function are expected to be short,
+ *        since unlike the _Write functions, these callers cannot be signaled.
+ * @param[in,out] channel CommChannel structure to lock.
+ * @return zero if successful, -1 otherwise.
+ */
+
+int
+CommSvc_Lock(CommChannel channel)
+{
+   return Comm_Lock(channel);
+}
+
+
+/**
+ * @brief Unlock the channel.
+ *
+ *        Uses the writer lock. This function is exported for the upper layer
+ *        to ensure that channel isn't closed while updating the layer state.
+ *        See Comm_WriteLock for details).
+ * @param[in,out] channel CommChannel structure to unlock.
+ */
+
+void
+CommSvc_Unlock(CommChannel channel)
+{
+   Comm_Unlock(channel);
+}
+
+
+/**
+ * @brief Schedules a work item on the AIO thread(s).
+ * @param[in,out] work work item to be scheduled.
+ * @return zero if successful, -1 otherwise.
+ */
+
+int
+CommSvc_ScheduleAIOWork(CommOSWork *work)
+{
+   return CommOS_ScheduleAIOWork(work);
+}
+
+
+/**
+ * @brief Requests events be posted in-line after the function completes.
+ * @param channel channel object.
+ * @return current number of requests for inline event posting, or -1 on error.
+ */
+
+unsigned int
+CommSvc_RequestInlineEvents(CommChannel channel)
+{
+   return Comm_RequestInlineEvents(channel);
+}
+
+
+/**
+ * @brief Requests events be posted out-of-band after the function completes.
+ * @param channel channel object.
+ * @return current number of requests for inline event posting, or -1 on error.
+ */
+
+unsigned int
+CommSvc_ReleaseInlineEvents(CommChannel channel)
+{
+   return Comm_ReleaseInlineEvents(channel);
+}
+
+
+#if defined(__linux__)
+EXPORT_SYMBOL(CommSvc_RegisterImpl);
+EXPORT_SYMBOL(CommSvc_UnregisterImpl);
+EXPORT_SYMBOL(CommSvc_Alloc);
+EXPORT_SYMBOL(CommSvc_Zombify);
+EXPORT_SYMBOL(CommSvc_IsActive);
+EXPORT_SYMBOL(CommSvc_GetTranspInitArgs);
+EXPORT_SYMBOL(CommSvc_GetState);
+EXPORT_SYMBOL(CommSvc_Write);
+EXPORT_SYMBOL(CommSvc_WriteVec);
+EXPORT_SYMBOL(CommSvc_Put);
+EXPORT_SYMBOL(CommSvc_DispatchUnlock);
+EXPORT_SYMBOL(CommSvc_Lock);
+EXPORT_SYMBOL(CommSvc_Unlock);
+EXPORT_SYMBOL(CommSvc_ScheduleAIOWork);
+EXPORT_SYMBOL(CommSvc_RequestInlineEvents);
+EXPORT_SYMBOL(CommSvc_ReleaseInlineEvents);
+#endif // defined(__linux__)
diff --git a/arch/arm/mvp/commkm/comm_svc.h b/arch/arm/mvp/commkm/comm_svc.h
new file mode 100644
index 0000000..c4f3292
--- /dev/null
+++ b/arch/arm/mvp/commkm/comm_svc.h
@@ -0,0 +1,71 @@
+/*
+ * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications
+ *
+ * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; see the file COPYING. If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#line 5
+
+/**
+ * @file
+ *
+ * @brief Communication functions exported by the comm_rt module.
+ */
+
+#ifndef _COMM_SVC_H_
+#define _COMM_SVC_H_
+
+#define INCLUDE_ALLOW_MODULE
+#define INCLUDE_ALLOW_PV
+#define INCLUDE_ALLOW_GPL
+#include "include_check.h"
+
+#include "comm.h"
+
+int CommSvc_RegisterImpl(const CommImpl *impl);
+void CommSvc_UnregisterImpl(const CommImpl *impl);
+int CommSvc_Zombify(CommChannel channel, int inBH);
+int CommSvc_IsActive(CommChannel channel);
+CommTranspInitArgs CommSvc_GetTranspInitArgs(CommChannel channel);
+void *CommSvc_GetState(CommChannel channel);
+void CommSvc_Put(CommChannel channel);
+void CommSvc_DispatchUnlock(CommChannel channel);
+int CommSvc_Lock(CommChannel channel);
+void CommSvc_Unlock(CommChannel channel);
+int CommSvc_ScheduleAIOWork(CommOSWork *work);
+
+int
+CommSvc_Alloc(const CommTranspInitArgs *transpArgs,
+              const CommImpl *impl,
+              int inBH,
+              CommChannel *newChannel);
+
+int
+CommSvc_Write(CommChannel channel,
+              const CommPacket *packet,
+              unsigned long long *timeoutMillis);
+
+int
+CommSvc_WriteVec(CommChannel channel,
+                 const CommPacket *packet,
+                 struct kvec **vec,
+                 unsigned int *vecLen,
+                 unsigned long long *timeoutMillis,
+                 unsigned int *iovOffset);
+
+unsigned int CommSvc_RequestInlineEvents(CommChannel channel);
+unsigned int CommSvc_ReleaseInlineEvents(CommChannel channel);
+
+#endif // _COMM_SVC_H_
diff --git a/arch/arm/mvp/commkm/comm_transp.h b/arch/arm/mvp/commkm/comm_transp.h
new file mode 100644
index 0000000..6cc58ae
--- /dev/null
+++ b/arch/arm/mvp/commkm/comm_transp.h
@@ -0,0 +1,90 @@
+/*
+ * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications
+ *
+ * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Generic shared memory transport API. + */ + +#ifndef _COMM_TRANSP_H_ +#define _COMM_TRANSP_H_ + +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/* + * Common shared memory identifier. + * External handle that makes sense to both hypervisor and guest. + */ + +#define COMM_TRANSP_ID_8_ANY ((unsigned char)-1) +#define COMM_TRANSP_ID_32_ANY ((unsigned int)-1) +#define COMM_TRANSP_ID_64_ANY ((unsigned long long)-1) + + +typedef struct CommTranspID { + union { + unsigned char d8[8]; + unsigned int d32[2]; + unsigned long long d64; + }; +} CommTranspID; + + +/* Basic initialization arguments. */ + +typedef enum CommTranspInitMode { + COMM_TRANSP_INIT_CREATE = 0x0, + COMM_TRANSP_INIT_ATTACH = 0x1 +} CommTranspInitMode; + +typedef struct CommTranspInitArgs { + unsigned int capacity; // Shared memory capacity. + unsigned int type; // Type / implementation using this area. + CommTranspID id; // ID (name) of shared memory area. + CommTranspInitMode mode; // Init mode (above). +} CommTranspInitArgs; + + +/** + * @brief Generate a type id from description (protocol) string. This function + * uses djb2, a string hashing algorithm by Dan Bernstein. 
+ * (see http://www.cse.yorku.ca/~oz/hash.html) + * @param str string to hash + * @return 32-bit hash value + */ + +static inline unsigned int +CommTransp_GetType(const char *str) +{ + unsigned int hash = 5381; + int c; + + while ((c = *str++)) { + hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + } + return hash; +} + +#endif // _COMM_TRANSP_H_ diff --git a/arch/arm/mvp/commkm/comm_transp_impl.h b/arch/arm/mvp/commkm/comm_transp_impl.h new file mode 100644 index 0000000..113cd21 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_transp_impl.h @@ -0,0 +1,165 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Generic shared memory transport private API. + */ + +#ifndef _COMM_TRANSP_IMPL_H_ +#define _COMM_TRANSP_IMPL_H_ + +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "comm_transp.h" + + +/* Shared memory opaque descriptor/handle. Only meaningful locally. */ + +typedef struct CommTranspPriv *CommTransp; + + +/* Asynchronous signaling initialization arguments. 
*/ + +typedef enum CommTranspIOEvent { + COMM_TRANSP_IO_DETACH = 0x0, + COMM_TRANSP_IO_IN = 0x1, + COMM_TRANSP_IO_OUT = 0x2, + COMM_TRANSP_IO_INOUT = 0x3 +} CommTranspIOEvent; + +typedef struct CommTranspEvent { + void (*ioEvent)(CommTransp transp, CommTranspIOEvent event, void *data); + void *ioEventData; +} CommTranspEvent; + + +/* + * Mechanism to detect and optionally attach to, created shared memory regions. + */ + +typedef struct CommTranspListener { + int (*probe)(CommTranspInitArgs *transpArgs, void *probeData); + void *probeData; +} CommTranspListener; + + + +/* + * Function prototypes. + */ + +int CommTranspEvent_Init(void); +void CommTranspEvent_Exit(void); +int CommTranspEvent_Process(CommTranspID *transpID, CommTranspIOEvent event); +int +CommTranspEvent_Raise(unsigned int peerEvID, + CommTranspID *transpID, + CommTranspIOEvent event); + +int CommTransp_Init(void); +void CommTransp_Exit(void); + +int CommTransp_Register(const CommTranspListener *listener); +void CommTransp_Unregister(const CommTranspListener *listener); +int +CommTransp_Notify(const CommTranspID *notificationCenterID, + CommTranspInitArgs *transpArgs); + +int +CommTransp_Open(CommTransp *transp, + CommTranspInitArgs *transpArgs, + CommTranspEvent *transpEvent); +void CommTransp_Close(CommTransp transp); + +int CommTransp_EnqueueSpace(CommTransp transp); +int CommTransp_EnqueueReset(CommTransp transp); +int CommTransp_EnqueueCommit(CommTransp transp); +int +CommTransp_EnqueueSegment(CommTransp transp, + const void *buf, + unsigned int bufLen); + +int CommTransp_DequeueSpace(CommTransp transp); +int CommTransp_DequeueReset(CommTransp transp); +int CommTransp_DequeueCommit(CommTransp transp); +int +CommTransp_DequeueSegment(CommTransp transp, + void *buf, + unsigned int bufLen); + +unsigned int CommTransp_RequestInlineEvents(CommTransp transp); +unsigned int CommTransp_ReleaseInlineEvents(CommTransp transp); + + +/** + * @brief Enqueues data into the transport object, data is available 
for + * reading immediately. + * @param transp handle to the transport object. + * @param buf bytes to enqueue. + * @param bufLen number of bytes to enqueue. + * @return number of bytes enqueued on success, < 0 otherwise. + */ + +static inline int +CommTransp_EnqueueAtomic(CommTransp transp, + const void *buf, + unsigned int bufLen) +{ + int rc; + + CommTransp_EnqueueReset(transp); + rc = CommTransp_EnqueueSegment(transp, buf, bufLen); + if (CommTransp_EnqueueCommit(transp)) { + rc = -1; + } + return rc; +} + + +/** + * @brief Dequeues data from the transport object into a buffer. + * @param transp handle to the transport object. + * @param[out] buf buffer to copy to. + * @param bufLen number of bytes to dequeue. + * @return number of bytes dequeued on success, < 0 otherwise, + */ + +static inline int +CommTransp_DequeueAtomic(CommTransp transp, + void *buf, + unsigned int bufLen) +{ + int rc; + + CommTransp_DequeueReset(transp); + rc = CommTransp_DequeueSegment(transp, buf, bufLen); + if (CommTransp_DequeueCommit(transp)) { + rc = -1; + } + return rc; +} + +#endif // _COMM_TRANSP_IMPL_H_ diff --git a/arch/arm/mvp/commkm/comm_transp_mvp.c b/arch/arm/mvp/commkm/comm_transp_mvp.c new file mode 100644 index 0000000..f755de9 --- /dev/null +++ b/arch/arm/mvp/commkm/comm_transp_mvp.c @@ -0,0 +1,944 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Generic shared memory transport API. + */ +#include + +#include "comm_os.h" +#include "comm_transp_impl.h" + +#include "mvp_types.h" +#include "qp.h" + + +/* + * Opaque CommTransp structure. See comm_transp.h + */ + +struct CommTranspPriv { + QPHandle *qp; + CommTranspEvent event; + unsigned int peerEvID; + unsigned int writeSize; + unsigned int readSize; + uint32 backRef; + CommOSWork work; + CommOSAtomic raiseInline; +}; + +/* + * Transport table object accounting + */ + +typedef struct TranspTableEntry { + CommOSAtomic holds; + CommTransp transp; + CommOSWaitQueue wq; +} TranspTableEntry; + +TranspTableEntry transpTable[QP_MAX_QUEUE_PAIRS]; +static CommOSSpinlock_Define(transpTableLock); + +/** + * @brief Destroy the transport object + * @param transp transport object to destroy + * @sideeffects detaches from queue pair + */ + +static void +DestroyTransp(CommTransp transp) +{ + CommTranspID transpID; + int32 rc; + + if (!transp) { + CommOS_Debug(("Failed to close channel: Bad handle\n")); + return; + } + + CommOS_Log(("%s: Detaching channel [%u:%u]\n", + __FUNCTION__, + transp->qp->id.context, + transp->qp->id.resource)); + + transpID.d32[0] = transp->qp->id.context; + transpID.d32[1] = transp->qp->id.resource; + +#if !defined(COMM_BUILDING_SERVER) + /* + * Tell the host to detach, will block in the host + * until the host has unmapped memory. Once the + * host has unmapped, it is safe to free. 
+ */ + CommTranspEvent_Raise(transp->peerEvID, + &transpID, + COMM_TRANSP_IO_DETACH); +#endif + + rc = QP_Detach(transp->qp); + +#if defined(COMM_BUILDING_SERVER) + /* + * Wake up waiters now that unmapping is complete + */ + CommOS_WakeUp(&transpTable[transp->backRef].wq); +#endif + + CommOS_Kfree(transp); + if (rc != QP_SUCCESS) { + CommOS_Log(("%s: Failed to detach. rc: %d\n", __FUNCTION__, rc)); + } else { + CommOS_Log(("%s: Channel detached.\n", __FUNCTION__)); + } +} + + +/** + * @brief Initialize the transport object table + */ + +static void +TranspTableInit(void) +{ + uint32 i; + CommOS_SpinLock(&transpTableLock); + for (i = 0; i < QP_MAX_QUEUE_PAIRS; i++) { + CommOS_WriteAtomic(&transpTable[i].holds, -1); + transpTable[i].transp = NULL; + } + CommOS_SpinUnlock(&transpTableLock); +} + + +/** + * @brief Add a transport object into the table + * @param transp handle to the transport object + * @return 0 on success, -1 otherwise + * @sideeffects increments entry refcount + */ + +static inline int32 +TranspTableAdd(CommTransp transp) +{ + uint32 i; + + if (!transp) { + return -1; + } + + CommOS_SpinLock(&transpTableLock); + for (i = 0; i < QP_MAX_QUEUE_PAIRS; i++) { + if ((transpTable[i].transp) == NULL) { + transpTable[i].transp = transp; + CommOS_WriteAtomic(&transpTable[i].holds, 1); + CommOS_WaitQueueInit(&transpTable[i].wq); + transp->backRef = i; + break; + } + } + CommOS_SpinUnlock(&transpTableLock); + + return 0; +} + +/** + * @brief retrieve a transport object and increment its ref count + * @param id transport id to retrieve + * @return transport object, or NULL if not found + * @sideeffects increments entry ref count + */ + +static inline CommTransp +TranspTableGet(CommTranspID *id) +{ + CommTransp transp; + uint32 i; + + if (!id) { + return NULL; + } + + for (i = 0; i < QP_MAX_QUEUE_PAIRS; i++) { + transp = transpTable[i].transp; + if (transp && + (transp->qp->id.context == id->d32[0]) && + (transp->qp->id.resource == id->d32[1])) { + 
CommOS_AddReturnAtomic(&transpTable[i].holds, 1); + return transp; + } + } + CommOS_Debug(("%s: couldn't find transport object\n", __FUNCTION__)); + + return NULL; +} + +/** + * @brief Puts back a previously TranspGet-ed transport object. + * @param transp the transport object. + * @sideeffects decrements the transport reference count. + * frees object if refcount now zero + */ + +static inline void +TranspTablePut(CommTransp transp) +{ + int32 holds; + int32 backRef; + if (!transp) { + return; + } + + backRef = transp->backRef; + BUG_ON(backRef >= QP_MAX_QUEUE_PAIRS); + + holds = CommOS_SubReturnAtomic(&transpTable[backRef].holds, 1); + if (holds > 0) { + return; + } + BUG_ON(holds < 0); + + CommOS_SpinLock(&transpTableLock); + CommOS_WriteAtomic(&transpTable[backRef].holds, -1); + transpTable[backRef].transp = NULL; + CommOS_SpinUnlock(&transpTableLock); + DestroyTransp(transp); +} + + +/** + * @brief Puts back a previously TranspGet-ed transport object. + * @param transp the transport object. + * @sideeffects decrements the transport reference count. + * asserts that remaining count > 0 + */ + +static inline void +TranspTablePutNF(CommTransp transp) +{ + int32 holds; + int32 backRef; + if (!transp) { + return; + } + + backRef = transp->backRef; + BUG_ON(backRef >= QP_MAX_QUEUE_PAIRS); + + holds = CommOS_SubReturnAtomic(&transpTable[backRef].holds, 1); + BUG_ON(holds <= 0); +} + + +/** + * @brief Raises INOUT event in-line or out-of-band. Note that this function + * expects the transport object to be held prior to being called. + * @param arg work item of transport object. + */ + +static void +RaiseEvent(CommOSWork *arg) +{ +#if !defined(__linux__) +#error "RaiseEvent() is only supported on linux. Port 'container_of'!" 
+#endif + CommTransp transp = container_of(arg, struct CommTranspPriv, work); + CommTranspID transpID = {{ + .d32 = { + [0] = transp->qp->id.context, + [1] = transp->qp->id.resource + } + }}; + + CommTranspEvent_Raise(transp->peerEvID, + &transpID, + COMM_TRANSP_IO_INOUT); + TranspTablePut(transp); +} + + +/** + * @brief Requests events be posted in-line after the function completes. + * @param transp transport object. + * @return current number of requests for inline event posting. + * @sideeffects posts an event on the first transition to in-line processing. + */ + +unsigned int +CommTransp_RequestInlineEvents(CommTransp transp) +{ + unsigned int res = CommOS_AddReturnAtomic(&transp->raiseInline, 1); + if (res == 1) { + /* On the first (effective) transition, make sure an event is raised. */ + + CommOS_AddReturnAtomic(&transpTable[transp->backRef].holds, 1); + RaiseEvent(&transp->work); + } + return res; +} + + +/** + * @brief Requests events be posted out-of-band after the function completes. + * @param transp transport object. + * @return current number of requests for inline event posting. + */ + +unsigned int +CommTransp_ReleaseInlineEvents(CommTransp transp) +{ + return CommOS_SubReturnAtomic(&transp->raiseInline, 1); +} + + +/* + * Comm Offload server callbacks. + */ + +#if defined(COMM_BUILDING_SERVER) + +#define COMM_MAX_LISTENERS QP_MAX_LISTENERS + +static int32 NotifyCB(const QPInitArgs *args); +static void DetachCB(void *data); + +static CommOSSpinlock_Define(listenersLock); +static CommTranspListener listeners[COMM_MAX_LISTENERS]; +static uint32 numListeners = 0; + + +/** + * @brief Notify callback when guests attach to queue pairs. Notifies any + * registered listeners (e.g. Comm layer). + * @param args Initialization arguments used by the guest to initialize + * its queue pair + * @return 0 on success, <0 otherwise. see qp.h for error codes. 
+ */ + +static int32 +NotifyCB(const QPInitArgs* args) +{ + CommTranspInitArgs transpArgs; + uint32 i; + int32 rc = -1; + + if (!args) { + return QP_ERROR_INVALID_ARGS; + } + + transpArgs.id.d32[0] = args->id.context; + transpArgs.id.d32[1] = args->id.resource; + transpArgs.capacity = args->capacity; + transpArgs.type = args->type; + + CommOS_SpinLock(&listenersLock); + for (i = 0; i < COMM_MAX_LISTENERS; i++) { + if (listeners[i].probe && + (listeners[i].probe(&transpArgs, listeners[i].probeData) == 0)) { + CommOS_Debug(("%s: Delivered notify event to listener %u\n", + __FUNCTION__, + i)); + rc = 0; + break; + } + } + CommOS_SpinUnlock(&listenersLock); + return rc; +} + + +/** + * @brief Detach callback when guests detach from queue pairs. Notifies + * any registered listeners (e.g. CommComm layer). + * @param data Transport object passed when the callback was registered + */ + +static void +DetachCB(void *data) +{ + CommTransp transp = data; + if (!transp || !(transp->event.ioEvent)) { + return; + } + CommOS_Debug(("%s: Guest detached from [%u:%u]\n", + __FUNCTION__, + transp->qp->id.context, + transp->qp->id.resource)); + transp->event.ioEvent(transp, COMM_TRANSP_IO_DETACH, transp->event.ioEventData); +} +#endif + + +/** + * @brief Performs one-time initialization of mvp transport provider. + * @return 0 on success, < 0 otherwise. + */ + +int +CommTransp_Init(void) +{ + int32 rc; + TranspTableInit(); + + rc = CommTranspEvent_Init(); + +#if defined(COMM_BUILDING_SERVER) + if (!rc) { + QP_RegisterListener(NotifyCB); + } +#endif + return rc; +} + + +/** + * @brief Performs clean-up of mvp transport provider. 
+ */ + +void +CommTransp_Exit(void) +{ + CommTranspEvent_Exit(); +#if defined(COMM_BUILDING_SERVER) + QP_UnregisterListener(NotifyCB); +#endif +} + +#if defined(COMM_BUILDING_SERVER) + +/** + * @brief Checks for a successful detach from Comm + * @param arg1 back reference index for channel in transport table + * @param arg2 ignored + * @return 1 if detach completed, 0 otherwise + */ + +static int +DetachCondition(void *arg1, void *arg2) +{ + uint32 backRef = (uint32)arg1; + + return (CommOS_ReadAtomic(&transpTable[backRef].holds) == -1); +} +#endif + + +/** + * @brief Processes a raised signal event. This is a callback function called + * from a comm_transp_ev plugin when a signal is received. Delivers an event + * to one or more channels. If id->d32[1] == COMM_TRANSP_ID_32_ANY, the event + * will be delivered to all registered channels associated with vmID + * id->d32[0]. + * @param id identifies a transport object to signal. + * @param event type of event. + * @return 0 if delivered to at least one channel, -1 on failure. + */ + +int +CommTranspEvent_Process(CommTranspID *id, + CommTranspIOEvent event) +{ + int rc = 0; + unsigned int delivered = 0; + unsigned int backRef; + int i = 0; + + CommTransp transp; + uint32 raiseOnAllChannels = (id->d32[1] == COMM_TRANSP_ID_32_ANY); + uint32 channels = raiseOnAllChannels ? QP_MAX_QUEUE_PAIRS : 1; + + while (channels--) { + if (raiseOnAllChannels) { + id->d32[1] = i++; + } + transp = TranspTableGet(id); + if (transp) { + if (transp->event.ioEvent) { + transp->event.ioEvent(transp, event, transp->event.ioEventData); + } + backRef = transp->backRef; + TranspTablePut(transp); + +#if defined(COMM_BUILDING_SERVER) + /* + * Wait for unmap on IO_DETACH, return to monitor. 
+ */ + if (event == COMM_TRANSP_IO_DETACH) { + unsigned long long timeout = 30000; + + rc = CommOS_Wait(&transpTable[backRef].wq, + DetachCondition, + (void*)backRef, + NULL, + &timeout); + switch (rc) { + case 1: // Memory successfully unmapped + rc = 0; + break; + default: // Timed out or other error. + return -1; + } + } +#endif + delivered++; + } + } + + rc = (delivered > 0) ? 0 : -1; + return rc; +} + + +/** + * @brief Register a listener to be notified when guests attach to the Comm + * offload server + * @param listener the listener to be notified + * @return 0 on success, -1 on failure + */ + +int +CommTransp_Register(const CommTranspListener *listener) +{ + int32 rc = -1; +#if defined(COMM_BUILDING_SERVER) + uint32 i; + + if (!listener) { + return -1; + } + + CommOS_SpinLock(&listenersLock); + for (i = 0; i < COMM_MAX_LISTENERS; i++) { + if ((listeners[i].probe == NULL) && + (listeners[i].probeData == NULL)) { + listeners[i] = *listener; + numListeners++; + rc = 0; + CommOS_Debug(("%s: Registered listener %u\n", __FUNCTION__, i)); + break; + } + } + CommOS_SpinUnlock(&listenersLock); +#endif + return rc; +} + + +/** + * @brief Unregisters a listener from the transport event notification system + * @param listener listener to unregister + * @return 0 on success + */ + +void +CommTransp_Unregister(const CommTranspListener *listener) +{ +#if defined(COMM_BUILDING_SERVER) + uint32 i; + + if (!listener || !listener->probe) { + return; + } + + + CommOS_SpinLock(&listenersLock); + for (i = 0; i < COMM_MAX_LISTENERS; i++) { + if ((listeners[i].probe == listener->probe) && + (listeners[i].probeData == listener->probeData)) { + listeners[i].probe = NULL; + listeners[i].probeData = NULL; + numListeners--; + CommOS_Debug(("%s: Unregistered listener %u\n", __FUNCTION__, i)); + } + } + CommOS_SpinUnlock(&listenersLock); +#endif +} + + +/** + * @brief Allocates and initializes a transport object + * @param[in,out] transp handle to the transport to allocate and initialize 
+ * @param transpArgs initialization arguments (see pvtcpTransp.h) + * @param transpEvent event callback to be delivered when events occur (e.g. + * detach events) + * @return 0 on success, <0 otherwise. See qp.h for error codes. + * @sideeffects Allocates memory + */ + +int +CommTransp_Open(CommTransp *transp, + CommTranspInitArgs *transpArgs, + CommTranspEvent *transpEvent) +{ + int32 rc = -1; + QPHandle *qp = NULL; + CommTransp transpOut = NULL; + QPInitArgs qpInitArgs; + + if (!transp || !transpArgs) { + return -1; + } + + CommOS_Log(("%s: Attaching to [%u:%u]. Capacity: %u\n", + __FUNCTION__, + transpArgs->id.d32[1], + transpArgs->id.d32[0], + transpArgs->capacity)); + + qpInitArgs.id.context = transpArgs->id.d32[0]; + qpInitArgs.id.resource = transpArgs->id.d32[1]; + qpInitArgs.capacity = transpArgs->capacity; + qpInitArgs.type = transpArgs->type; + + if (!(transpOut = CommOS_Kmalloc(sizeof *transpOut))) { + rc = -1; + goto out; + } + + /* + * Attach to the queue pair + */ + rc = QP_Attach(&qpInitArgs, &qp); + if (rc < 0) { + rc = -1; + goto out; + } + + transpOut->qp = qp; + + /* + * Reassign ID so Comm knows what ID was actually given + */ + transpArgs->id.d32[0] = qp->id.context; + transpArgs->id.d32[1] = qp->id.resource; + + if (transpEvent) { + transpOut->event = *transpEvent; + } else { + transpOut->event.ioEvent = NULL; + transpOut->event.ioEventData = NULL; + } + +#if defined(COMM_BUILDING_SERVER) + CommOS_Debug(("%s: Registering detach CB on id %u...\n", + __FUNCTION__, transpArgs->id.d32[1])); + QP_RegisterDetachCB(transpOut->qp, DetachCB, transpOut); +#endif + + transpOut->peerEvID = COMM_TRANSP_ID_32_ANY; + transpOut->writeSize = 0; + transpOut->readSize = 0; + CommOS_InitWork(&transpOut->work, RaiseEvent); + CommOS_WriteAtomic(&transpOut->raiseInline, 0); + + if (TranspTableAdd(transpOut)) { + CommOS_Log(("%s: Exceeded max limit of transport objects!\n", + __FUNCTION__)); + DestroyTransp(transpOut); + rc = -1; + goto out; + } + + *transp = 
transpOut; + rc = 0; + + CommOS_Log(("%s: Channel attached.\n", __FUNCTION__)); + +out: + if (rc && transpOut) { + CommOS_Log(("%s: Failed to attach: %d\n", __FUNCTION__, rc)); + CommOS_Kfree(transpOut); + } + + return rc; +} + + +/** + * @brief Tear down the transport channel, destroy the object if the refcount + * drops to zero + * @param transp handle to the transport channel + * @sideeffects decrements the entry's refcount + */ + +void +CommTransp_Close(CommTransp transp) { + if (!transp) { + return; + } + CommOS_FlushAIOWork(&transp->work); + TranspTablePut(transp); +} + + +/** + * @brief Returns available space for enqueue, in bytes + * @param transp handle to the transport object + * @return available space in the queue for enqueue operations, <0 + * on error conditions. see qp.h for error codes. + */ + +int +CommTransp_EnqueueSpace(CommTransp transp) +{ + if (!transp) { + return -1; + } + return QP_EnqueueSpace(transp->qp); +} + + +/** + * @brief Discards any pending enqueues + * @param transp handle to the transport object + * @return 0 on success, <0 otherwise. see qp.h for error codes + */ + +int +CommTransp_EnqueueReset(CommTransp transp) +{ + if (!transp) { + return -1; + } + transp->writeSize = 0; + return QP_EnqueueReset(transp->qp); +} + + +/** + * @brief Enqueues a segment of data into the transport object + * @param transp handle to the transport object + * @param buf data to enqueue + * @param bufLen number of bytes to enqueue + * @return number of bytes enqueued on success, <0 otherwise. see qp.h + * for error codes + */ + +int +CommTransp_EnqueueSegment(CommTransp transp, + const void *buf, + unsigned int bufLen) +{ + int rc; + + if (!transp) { + return -1; + } + rc = QP_EnqueueSegment(transp->qp, (void*)buf, bufLen); + if (rc >= 0) { + transp->writeSize += (unsigned int)rc; + } else { + transp->writeSize = 0; + } + return rc; +} + + +/** + * @brief Commits any previous EnqueueSegment operations to the transport + * object. 
+ * @param transp handle to the transport object. + * @return 0 on success, < 0 otherwise. + */ + +int +CommTransp_EnqueueCommit(CommTransp transp) +{ + int rc; + + if (!transp) { + return -1; + } + + rc = QP_EnqueueCommit(transp->qp); + if (rc >= 0) { + const unsigned int fudge = 4; + int writable = CommTransp_EnqueueSpace(transp); + + if ((writable >= 0) && + ((transp->writeSize + (unsigned int)writable + fudge) >= + transp->qp->queueSize)) { + /* + * If bytes written since last commit + writable space 'almost' + * equal write queue size, then signal. The 'almost' fudge factor + * accounts for a possibly inaccurate CommTransp_EnqueueSpace() + * return value. Most of the time, this is inconsequential. In + * rare, borderline occasions, it results in a few extra signals. + * The scheme essentially means this: if this is the first packet + * to be write-committed, we signal. Otherwise, the remote end is + * supposed to keep going for as long as it can read. + * + */ + + BUG_ON(transp->backRef >= QP_MAX_QUEUE_PAIRS); + CommOS_AddReturnAtomic(&transpTable[transp->backRef].holds, 1); + if (CommOS_ReadAtomic(&transp->raiseInline)) { + RaiseEvent(&transp->work); + } else if (CommOS_ScheduleAIOWork(&transp->work)) { + TranspTablePutNF(transp); + } + } + } else { + rc = -1; + } + transp->writeSize = 0; + return rc; +} + + +/** + * @brief Returns any available bytes for dequeue + * @param transp handle to the transport object + * @return available bytes for dequeue, <0 otherwise. 
see qp.h for error codes + */ + +int +CommTransp_DequeueSpace(CommTransp transp) +{ + if (!transp) { + return -1; + } + return QP_DequeueSpace(transp->qp); +} + + +/** + * @brief Discards any pending dequeues + * @param transp handle to the transport object + * @return 0 on success, <0 otherwise, see qp.h for error codes + */ + +int +CommTransp_DequeueReset(CommTransp transp) +{ + if (!transp) { + return -1; + } + transp->readSize = 0; + return QP_DequeueReset(transp->qp); +} + + +/** + * @brief Dequeues a segment of data from the consumer queue into + * a buffer + * @param transp handle to the transport object + * @param[out] buf buffer to copy to + * @param bufLen number of bytes to dequeue + * @return number of bytes dequeued on success, <0 otherwise, + * see qp.h for error codes + */ + +int +CommTransp_DequeueSegment(CommTransp transp, + void *buf, + unsigned bufLen) +{ + int rc; + + if (!transp) { + return -1; + } + rc = QP_DequeueSegment(transp->qp, buf, bufLen); + if (rc >= 0) { + transp->readSize += (unsigned int)rc; + } else { + transp->readSize = 0; + } + return rc; +} + + +/** + * @brief Commits any previous DequeueSegment operations to the + * transport object. + * @param transp handle to the transport object. + * @return 0 on success, < 0 otherwise. + */ + +int +CommTransp_DequeueCommit(CommTransp transp) +{ + int rc; + + if (!transp) { + return -1; + } + rc = QP_DequeueCommit(transp->qp); + if (rc >= 0) { + int readable = CommTransp_DequeueSpace(transp); + const unsigned int limit = transp->qp->queueSize / 2; + + if ((readable >= 0) && + (transp->readSize + (unsigned int)readable >= limit) && + ((unsigned int)readable < limit)) { + /* + * Minimize the number of likely 'peer write OK' signalling: + * only do it, if reading crossed half-way down. 
+ * + */ + + BUG_ON(transp->backRef >= QP_MAX_QUEUE_PAIRS); + CommOS_AddReturnAtomic(&transpTable[transp->backRef].holds, 1); + if (CommOS_ReadAtomic(&transp->raiseInline)) { + RaiseEvent(&transp->work); + } else if (CommOS_ScheduleAIOWork(&transp->work)) { + TranspTablePut(transp); + } + } + } else { + rc = -1; + } + /* coverity[deref_after_free] */ + transp->readSize = 0; + return rc; +} + + +/** + * @brief Notify any registered listeners for the given queue pair + * @param notificationCenterID noop, unused on MVP + * @param transpArgs initialization arguments used by the guest for this + * channel + * @sideeffects the host may attach to the queue pair + */ + +int +CommTransp_Notify(const CommTranspID *notificationCenterID, + CommTranspInitArgs *transpArgs) +{ + QPInitArgs args; + + args.id.context = transpArgs->id.d32[0]; + args.id.resource = transpArgs->id.d32[1]; + args.capacity = transpArgs->capacity; + args.type = transpArgs->type; + + CommOS_Debug(("%s: d32[0]: %u d32[1]: %u\n", + __FUNCTION__, + transpArgs->id.d32[0], + transpArgs->id.d32[1])); + QP_Notify(&args); + return 0; +} diff --git a/arch/arm/mvp/commkm/fatalerror.h b/arch/arm/mvp/commkm/fatalerror.h new file mode 100644 index 0000000..9676ff3 --- /dev/null +++ b/arch/arm/mvp/commkm/fatalerror.h @@ -0,0 +1,126 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief fatal error handlers. They all post fatal errors regardless of build + * type. + */ + +#ifndef _FATALERROR_H +#define _FATALERROR_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mvp_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum FECode { + FECodeMisc, ///< generic FATAL() call of sorts + FECodeOOM, ///< FATAL_OOM() call of sorts + FECodeAssert, ///< ASSERT() call of sorts + FECodeNR, ///< NOT_REACHED() call of sorts + FECodeNI, ///< NOT_IMPLEMENTED() call of sorts + FECodeNT, ///< NOT_TESTED() call of sorts + FECodeCF ///< COMPILE_FAIL() call of sorts +}; +typedef enum FECode FECode; + +#define FATAL() FatalError(__FILE__, __LINE__, FECodeMisc, 0, NULL) +#define FATAL_IF(x) do { if (UNLIKELY(x)) FATAL(); } while (0) +#define FATAL_OOM() FatalError(__FILE__, __LINE__, FECodeOOM, 0, NULL) +#define FATAL_OOM_IF(x) do { if (UNLIKELY(x)) FATAL_OOM(); } while (0) + +extern _Bool FatalError_hit; + +void NORETURN FatalError(char const *file, + int line, + FECode feCode, + int bugno, + char const *fmt, + ...) 
FORMAT(printf,5,6); + +#define FATALERROR_COMMON(printFunc, \ + printFuncV, \ + file, \ + line, \ + feCode, \ + bugno, \ + fmt) { \ + va_list ap; \ + \ + printFunc("FatalError: %s:%d, code %d, bugno %d\n", \ + file, line, feCode, bugno); \ + if (fmt != NULL) { \ + va_start(ap, fmt); \ + printFuncV(fmt, ap); \ + va_end(ap); \ + } \ + } + +#if defined IN_HOSTUSER || defined IN_GUESTUSER || defined IN_WORKSTATION + +#define FATALERROR_POSIX_USER \ +void \ +FatalError_VErrPrintf(const char *fmt, va_list ap) \ +{ \ + vfprintf(stderr, fmt, ap); \ +} \ +\ +void \ +FatalError_ErrPrintf(const char *fmt, ...) \ +{ \ + va_list ap; \ + va_start(ap, fmt); \ + FatalError_VErrPrintf(fmt, ap); \ + va_end(ap); \ +} \ +\ +void NORETURN \ +FatalError(char const *file, \ + int line, \ + FECode feCode, \ + int bugno, \ + const char *fmt, \ + ...) \ +{ \ + FATALERROR_COMMON(FatalError_ErrPrintf, FatalError_VErrPrintf, file, line, feCode, bugno, fmt); \ + exit(EXIT_FAILURE); \ +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/arch/arm/mvp/commkm/include_check.h b/arch/arm/mvp/commkm/include_check.h new file mode 100644 index 0000000..2eeafe7 --- /dev/null +++ b/arch/arm/mvp/commkm/include_check.h @@ -0,0 +1,18 @@ +/* + * Linux 2.6.32 and later Kernel module for Empty File Placeholder + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. 
If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ diff --git a/arch/arm/mvp/commkm/mksck.h b/arch/arm/mvp/commkm/mksck.h new file mode 100644 index 0000000..e9e10bc --- /dev/null +++ b/arch/arm/mvp/commkm/mksck.h @@ -0,0 +1,153 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +#ifndef _MKSCK_H +#define _MKSCK_H + +/** + * @file + * + * @brief The monitor-kernel socket interface definitions. + * + * The monitor kernel socket interface was created for (what the name + * says) communications between the monitor and host processes. On the + * monitor side a special API is introduced, see mksck_vmm.h. On the + * host side the API is the standard Berkeley socket interface. Host + * process to host process or monitor to monitor communication is not + * supported. + * + * A generic address consists of two 16 bit fields: the vm id and the + * port id. Both hosts (vmx) and monitors (vmm) get their vm id + * automatically. The host vm id is assigned at the time the host + * process opens the mvpkm file descriptor, while the monitor vm id is + * assigned when the vmx.c:SetupWorldSwitchPage() calls + * Mvpkm_SetupIds(). 
As a vmx may create multiple monitors to service + * an MP guest, a vmx vm id may be associated with multiple monitor vm + * ids. A monitor id, however, has a single associated vmx host id, + * the id of its canonical vmx. + * + * Sockets on the host get their addresses either by explicit user + * call (the bind command) or implicitly by (issuing a send command + * first). At an explicit bind the user may omit one or both fields by + * providing MKSCK_VMID_UNDEF/MKSCK_PORT_UNDEF respectively. An + * implicit bind behaves as if both fields were omitted in an explicit + * bind. The default value of the vmid field is the vmid computed from + * the thread group id while that of a port is a new number. It is not + * invalid to bind a host process socket with a vm id different from + * the vmid computed from the tgid. + * + * Sockets of the monitor are automatically assigned a vmid, that of their + * monitor, at the time of their creation. The port id can be assigned by the + * user or left to the implementation to assign an unused one (by specifying + * MKSCK_PORT_UNDEF at @ref Mksck_Open). + * + * Host unconnected sockets may receive from any monitor sender, may send to any + * monitor socket. A socket can be connected to a peer address, that enables the + * use of the send command. + * + * One of many special predefined port (both host and monitor) is + * MKSCK_PORT_MASTER. It is used for initialization. + * + * Monitor sockets have to send their peer address explicitly (by + * Mksck_SetPeer()) or implicitly by receiving first. After the peer + * is set, monitor sockets may send or receive only to/from their + * peer. + */ + + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "vmid.h" + +/* + * The interface limits the size of transferable packets. 
+ */ +#define MKSCK_XFER_MAX 1024 + +#define MKSCK_ADDR_UNDEF (uint32)0xffffffff + +#define MKSCK_PORT_UNDEF (uint16)0xffff +#define MKSCK_PORT_MASTER (MKSCK_PORT_UNDEF-1) +#define MKSCK_PORT_HOST_FB (MKSCK_PORT_UNDEF-2) +#define MKSCK_PORT_BALLOON (MKSCK_PORT_UNDEF-3) +#define MKSCK_PORT_HOST_HID (MKSCK_PORT_UNDEF-4) +#define MKSCK_PORT_CHECKPOINT (MKSCK_PORT_UNDEF-5) +#define MKSCK_PORT_COMM_EV (MKSCK_PORT_UNDEF-6) +#define MKSCK_PORT_HIGH (MKSCK_PORT_UNDEF-7) + +#define MKSCK_VMID_UNDEF VMID_UNDEF +#define MKSCK_VMID_HIGH (MKSCK_VMID_UNDEF-1) + +#define MKSCK_DETACH 3 + +typedef uint16 Mksck_Port; +typedef VmId Mksck_VmId; + +/** + * @brief Page descriptor for typed messages. Each page describes a region of + * the machine address space with base mpn and size 2^(12 + order) bytes. + */ +typedef struct { + uint32 mpn : 20; ///< Base MPN of region described by page + uint32 order : 12; ///< Region is 2^(12 + order) bytes. +} Mksck_PageDesc; + +/** + * @brief Typed message template macro. Allows us to avoid having two message + * types, one with page descriptor vector (for VMM), one without (for + * VMX). + * + * @param type C type of uninterpreted component of the message (following the + * page descriptor vector). + * @param pages number of page descriptors in vector. 
+ */ +#define MKSCK_DESC_TYPE(type,pages) \ + struct { \ + type umsg; \ + Mksck_PageDesc page[pages]; \ + } + +/** + * @brief The monitor kernel socket interface address format + */ +typedef union { + uint32 addr; ///< the address + struct { /* The address is decomposed to two shorts */ + Mksck_Port port; ///< port unique within a vmid + Mksck_VmId vmId; ///< unique vmid + }; +} Mksck_Address; + +static inline uint32 +Mksck_AddrInit(Mksck_VmId vmId, Mksck_Port port) +{ + Mksck_Address aa; + aa.vmId = vmId; + aa.port = port; + return aa.addr; +} +#endif diff --git a/arch/arm/mvp/commkm/mksck_sockaddr.h b/arch/arm/mvp/commkm/mksck_sockaddr.h new file mode 100644 index 0000000..82df240 --- /dev/null +++ b/arch/arm/mvp/commkm/mksck_sockaddr.h @@ -0,0 +1,50 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Host user space definitions for mksck sockets. 
+ */ + +#ifndef _MKSCK_SOCKADDR_H_ +#define _MKSCK_SOCKADDR_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mksck.h" + +/* no one ever uses DECnet anymore? */ +#define AF_MKSCK AF_DECnet +#define PF_MKSCK PF_DECnet + +/* Address structure used by the host user socket interface. */ +struct sockaddr_mk { + sa_family_t mk_family; + Mksck_Address mk_addr; +}; + +#endif diff --git a/arch/arm/mvp/commkm/mvp.h b/arch/arm/mvp/commkm/mvp.h new file mode 100644 index 0000000..a57f8cc --- /dev/null +++ b/arch/arm/mvp/commkm/mvp.h @@ -0,0 +1,48 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief top-level include for all basic includes. + * This file should not define anything of its own. 
+ */ + +#ifndef _MVP_H +#define _MVP_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mvp_compiler.h" +#include "utils.h" +#include "mvp_assert.h" +#include "mvp_types.h" +#include "platdefx.h" + +#endif diff --git a/arch/arm/mvp/commkm/mvp_assert.h b/arch/arm/mvp/commkm/mvp_assert.h new file mode 100644 index 0000000..cbc5ed8 --- /dev/null +++ b/arch/arm/mvp/commkm/mvp_assert.h @@ -0,0 +1,125 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief ASSERT() and related macros. 
+ */ + +#ifndef _MVP_ASSERT_H +#define _MVP_ASSERT_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define ASSERT(_x) ASSERT_BUG((_x),0) + +#ifndef NDEBUG +#define ASSERT_BUG(_x,_tkt) do { \ + if (UNLIKELY(!(_x))) { \ + FatalError(__FILE__, __LINE__, FECodeAssert, _tkt, NULL); \ + } \ +} while (0) + +#define ASSERTF(_x, ...) do { \ + if (UNLIKELY(!(_x))) { \ + FatalError(__FILE__, \ + __LINE__, \ + FECodeAssert, \ + 0, \ + __VA_ARGS__); \ + } \ +} while (0) +#else + +#define ASSERT_BUG(_x,_tkt) (void)sizeof((int)(_x)) +#define ASSERTF(_x, ...) ASSERT_BUG(_x, 0) + +#endif + +/* + * Compile-time assertions. + * + * ASSERT_ON_COMPILE does not use the common + * switch (0) { case 0: case (e): ; } trick because some compilers (e.g. MSVC) + * generate code for it. + * + * The implementation uses both enum and typedef because the typedef alone is + * insufficient; gcc allows arrays to be declared with non-constant expressions + * (even in typedefs, where it makes no sense). + */ +#ifdef __COVERITY__ +#define ASSERT_ON_COMPILE(e) ASSERT(e) +#else +#define ASSERT_ON_COMPILE(e) \ + do { \ + enum { AssertOnCompileMisused = ((e) ? 1 : -1) }; \ + typedef char AssertOnCompileFailed[AssertOnCompileMisused]; \ + } while (0) +#endif + +/* + * To put an ASSERT_ON_COMPILE() outside a function, wrap it + * in MY_ASSERTS(). The first parameter must be unique in + * each .c file where it appears. 
For example, + * + * MY_ASSERTS(FS3_INT, + * ASSERT_ON_COMPILE(sizeof(FS3_DiskLock) == 128); + * ASSERT_ON_COMPILE(sizeof(FS3_DiskLockReserved) == DISK_BLOCK_SIZE); + * ASSERT_ON_COMPILE(sizeof(FS3_DiskBlock) == DISK_BLOCK_SIZE); + * ASSERT_ON_COMPILE(sizeof(Hardware_DMIUUID) == 16); + * ) + * + * Caution: ASSERT() within MY_ASSERTS() is silently ignored. + * The same goes for anything else not evaluated at compile time. + */ + +#define MY_ASSERTS(name, assertions) \ + static inline void name(void) { \ + assertions \ + } + +#define KNOWN_BUG(_tkt) + +#define NOT_IMPLEMENTED() NOT_IMPLEMENTED_JIRA(0) +#define NOT_IMPLEMENTED_JIRA(_tkt,...) FatalError(__FILE__, __LINE__, FECodeNI, _tkt, NULL) + +#define NOT_IMPLEMENTED_IF(_x) NOT_IMPLEMENTED_IF_JIRA((_x),0) +#define NOT_IMPLEMENTED_IF_JIRA(_x,_tkt,...) do { if (UNLIKELY(_x)) NOT_IMPLEMENTED_JIRA(_tkt); } while (0) +/* + * All sites tagged with this are @knownjira{MVP-1855}. + */ +#define NOT_IMPLEMENTEDF(...) FatalError(__FILE__, __LINE__, FECodeNI, 0, __VA_ARGS__) + +#define NOT_REACHED() FatalError(__FILE__, __LINE__, FECodeNR, 0, NULL) + +#include "fatalerror.h" +#include "nottested.h" + +#endif diff --git a/arch/arm/mvp/commkm/mvp_compiler.h b/arch/arm/mvp/commkm/mvp_compiler.h new file mode 100644 index 0000000..21af455 --- /dev/null +++ b/arch/arm/mvp/commkm/mvp_compiler.h @@ -0,0 +1,56 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Compiler-related definitions and directives. + */ + +#ifndef _MVP_COMPILER_H_ +#define _MVP_COMPILER_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#ifdef __GNUC__ +#include "mvp_compiler_gcc.h" +#else /* __GNUC__ */ +#include "mvp_compiler_other.h" +#endif /* __GNUC__ */ + +/** + * @brief Find last set bit. + * + * @param n unsigned 32-bit integer. + * + * @return 0 if n == 0 otherwise 32 - the number of leading zeroes in n. + */ +#define FLS(n) (32 - CLZ(n)) + +#endif /// ifndef _MVP_COMPILER_H_ diff --git a/arch/arm/mvp/commkm/mvp_compiler_gcc.h b/arch/arm/mvp/commkm/mvp_compiler_gcc.h new file mode 100644 index 0000000..fbc96e3 --- /dev/null +++ b/arch/arm/mvp/commkm/mvp_compiler_gcc.h @@ -0,0 +1,87 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. 
If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief common definitions for GCC + */ + +#ifndef _MVP_COMPILER_GCC_H +#define _MVP_COMPILER_GCC_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @brief Count leading zeroes. + * + * @param n unsigned 32-bit integer. + * + * @return 32 if n == 0 otherwise 31 - the bit position of the most significant 1 + * in n. + */ +#ifdef __COVERITY__ +static inline int +CLZ(unsigned int n) +{ + unsigned int r = 0; + + while (n) { + r++; + n >>= 1; + } + + return 32 - r; +} +#else +#define CLZ(n) __builtin_clz(n) +#endif + +#define PACKED __attribute__ ((packed)) +#define ALLOC __attribute__ ((malloc, warn_unused_result)) +#define UNUSED __attribute__ ((unused)) +#define PURE __attribute__ ((pure)) +#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) +#define FORMAT(x,y,z) __attribute__ ((format(x,y,z))) +#define LIKELY(x) __builtin_expect(!!(x), 1) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) + +/* + * For debug builds, we want to omit __attribute__((noreturn)) so that gcc will + * keep stack linkages and then we will have useful core dumps. For non-debug + * builds, we don't care about the stack frames and want the little bit of + * optimization that noreturn gives us. 
+ */ +#if defined(__COVERITY__) || !defined(MVP_DEBUG) +#define NORETURN __attribute__((noreturn)) +#else +#define NORETURN +#endif + +#endif diff --git a/arch/arm/mvp/commkm/mvp_types.h b/arch/arm/mvp/commkm/mvp_types.h new file mode 100644 index 0000000..ba5c04c --- /dev/null +++ b/arch/arm/mvp/commkm/mvp_types.h @@ -0,0 +1,94 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief basic type definitions. + * These may need to be conditionalized for different compilers/platforms. 
+ */ + +#ifndef _MVPTYPES_H +#define _MVPTYPES_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; +typedef unsigned long long uint64; + +typedef signed char int8; +typedef short int16; +typedef int int32; +typedef long long int64; + +typedef uint32 CVA; // whatever we are compiling the code as +typedef uint32 GVA; // guest virtual addresses +typedef uint32 MVA; // monitor virtual addresses +typedef uint32 HKVA; // host kernel virtual addresses +typedef uint32 HUVA; // host user virtual addresses +typedef uint64 PA; // (guest) physical addresses (40-bit) +typedef uint32 MA; // (host) machine addresses + +typedef uint32 PPN; // PA/PAGE_SIZE +typedef uint32 MPN; // MA/PAGE_SIZE + +typedef uint64 cycle_t; + +/** + * @brief Page segment. + * + * Specifies a segment within a single page. 
+ */ +typedef struct { + uint16 off; + uint16 len; +} PageSeg; + +/* + * GCC's argument checking for printf-like functions + * + * fmtPos is the position of the format string argument, beginning at 1 + * varPos is the position of the variable argument, beginning at 1 + */ + +#if defined(__GNUC__) +# define PRINTF_DECL(fmtPos, varPos) __attribute__((__format__(__printf__, fmtPos, varPos))) +#else +# define PRINTF_DECL(fmtPos, varPos) +#endif + +#if defined(__GNUC__) +# define SCANF_DECL(fmtPos, varPos) __attribute__((__format__(__scanf__, fmtPos, varPos))) +#else +# define SCANF_DECL(fmtPos, varPos) +#endif + +#endif /* _MVPTYPES_H */ diff --git a/arch/arm/mvp/commkm/mvpkm_comm_ev.h b/arch/arm/mvp/commkm/mvpkm_comm_ev.h new file mode 100644 index 0000000..b220a9b --- /dev/null +++ b/arch/arm/mvp/commkm/mvpkm_comm_ev.h @@ -0,0 +1,53 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +/** + * @file + * + * @brief mvpkm kernel hooks for comm event signaling + */ + +#ifndef _MVPKM_COMM_EV_H +#define _MVPKM_COMM_EV_H + +extern int (*CommTranspEvProcess)(CommTranspID* id, CommTranspIOEvent event); + +/** + * @brief Forward any guest signal requests to the commkm module + * @param id transport channel id + * @param event comm event type + */ + +static inline void +Mvpkm_CommEvSignal(CommTranspID *id, CommTranspIOEvent event) +{ + if (CommTranspEvProcess) { + CommTranspEvProcess(id, event); + } +} + +void +Mvpkm_CommEvRegisterProcessCB(int (*commProcessFunc)(CommTranspID*, + CommTranspIOEvent)); +void Mvpkm_CommEvUnregisterProcessCB(void); + + + +#endif diff --git a/arch/arm/mvp/commkm/nottested.h b/arch/arm/mvp/commkm/nottested.h new file mode 100644 index 0000000..c5c1e26 --- /dev/null +++ b/arch/arm/mvp/commkm/nottested.h @@ -0,0 +1,54 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief NOT_TESTED() and related. 
+ */ + +#ifndef _NOTTESTED_H +#define _NOTTESTED_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include <stdbool.h> + +#ifdef NOT_TESTED_ENABLED +#define NotTestedEnabled true +#else +#define NotTestedEnabled false +#endif + +#define NOT_TESTED() NOT_TESTED_JIRA(0) +#define NOT_TESTED_JIRA(_tkt,...) NotTested(_tkt, __FILE__, __LINE__) + +void NotTested(int tkt, char const *file, int line); + +#endif diff --git a/arch/arm/mvp/commkm/platdefx.h b/arch/arm/mvp/commkm/platdefx.h new file mode 100644 index 0000000..42953e6 --- /dev/null +++ b/arch/arm/mvp/commkm/platdefx.h @@ -0,0 +1,67 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Basic platform definitions needed various places. 
+ */ + +#ifndef _PLATDEFX_H +#define _PLATDEFX_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define PAGE_ORDER 12 + +#ifndef PAGE_SIZE +#define PAGE_SIZE (1UL << PAGE_ORDER) +#endif +#if PAGE_SIZE != 4096 +#error bad page size PAGE_SIZE +#endif + +#define PA_2_PPN(_pa) ((_pa) / PAGE_SIZE) +#define PPN_2_PA(_ppn) ((_ppn) * PAGE_SIZE) + +#define VMM_DOMAIN 0x0 +#define VMM_DOMAIN_NO_ACCESS 0x3 +#define VMM_DOMAIN_CLIENT 0x1 +#define VMM_DOMAIN_MANAGER 0x4 + +#define INVALID_CVA (-(CVA)1) +#define INVALID_GVA (-(GVA)1) +#define INVALID_MVA (-(MVA)1) +#define INVALID_HKVA (-(HKVA)1) +#define INVALID_HUVA (-(HUVA)1) + +#define INVALID_MPN (((MPN)-1) >> ARM_L2D_SMALL_ORDER) +#define INVALID_PPN (((PPN)-1) >> ARM_L2D_SMALL_ORDER) + +#endif diff --git a/arch/arm/mvp/commkm/qp.h b/arch/arm/mvp/commkm/qp.h new file mode 100644 index 0000000..d4a50ec --- /dev/null +++ b/arch/arm/mvp/commkm/qp.h @@ -0,0 +1,332 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. 
If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MVP Queue Pairs function and structure declarations + * + * MVP Queue Pairs: + * + * Queue pairs are intended to be a generic bulk data transport mechanism + * between the guest and host kernels. The queue pair abstraction is based + * on two ring buffers (queues) placed on a shared memory region mapped + * into both guest and host kernel address spaces. + * + * NOTE: Queue pairs are SINGLE-READER, SINGLE-WRITER. Any caller is + * responsible for multi-reader/writer serialization!!! + * + * There are a maximum of QP_MAX_QUEUE_PAIRS in the system, with a maximum + * size of QP_MAX_CAPACITY per pair. Each queue pair is identified by + * an ID. + * + * Each peer follows a producer-consumer model in which one side is the + * producer on one queue, and the other side is the consumer on that queue + * (and vice-versa for its pair). + * + * Data is enqueued and dequeued into the pair in transactional stages, + * meaning each enqueue/dequeue can be followed by zero or more + * enqueue/dequeues, but the enqueue/dequeue is not visible to the peer + * until it has been committed with the *Commit() function. + * In PVTCP, for example, this is used to enqueue a short header, then + * followed by 'segments' of iovecs, then followed by a commit. This + * model prevents a peer from reading the header, expecting a payload, + * but not being able to read the payload because it hasn't been + * enqueued yet. + * + * Queue Pair setup: + * + * Before data can be passed, the guest and host kernel must perform + * the following connection handshake: + * + * 1). A host kernel service registers a listener with the queue pair + * subsystem with a callback to be called when guests create + * and attach to a shared memory region. + * + * 2). Guest initiates an QP_Attach() operation to a shared memory region + * keyed by ID. 
This step allocates memory, maps it into the host + * address space, and optionally notifies any host services who are + * listening for attach requests from the guest (see previous step). + * Host listeners are provided with a copy of the initialization + * arguments used by the guest (id, size, service type). All registered + * listeners are iterated over until one of them handles the attach + * request and acknowledges with QP_SUCCESS. + * + * 3). The registered host callback is called, notifying the host that + * the guest has attached. + * + * 4). The host can now QP_Attach() to the shared memory region with the same + * arguments as the guest. The queue pair is now well formed and enqueues + * and dequeues can proceed on either side. + * + * Queue Pair teardown: + * + * 1). As before, teardowns are initiated by the guest. Hosts can register + * a callback to be called upon detach. Guests initiate a teardown + * through a call to QP_Detach(). + * + * 2). Registered hosts are notified through the aforementioned callback. + * 3). The host service can call QP_Detach() at its own leisure. Memory + * is freed, the queue pair is destroyed. + * + * If at any point the guest unexpectedly shuts down, the host will be + * notified at monitor shutdown time. Memory is freed, and the queue + * pair is destroyed. + * + */ + +#ifndef _QP_H +#define _QP_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +//#define QP_DEBUG 1 + +typedef enum QPState { + QP_STATE_FREE = 0x1, ///< No peers, not memory-backed + QP_STATE_CONNECTED, ///< Both peers attached , memory backed + QP_STATE_GUEST_ATTACHED, ///< Guest allocated memory, host not yet attached + QP_STATE_MAX // leave this at the end! 
+} QPState; + +typedef struct QPId { + uint32 context; + uint32 resource; +} QPId; + +/* + * Initialization arguments for each queue pair + */ +typedef struct QPInitArgs { + QPId id; ///< Shared memory region ID + uint32 capacity; ///< Total size of shared region in bytes + uint32 type; ///< Type of queue pair (PVTCP, other)... +} QPInitArgs; + +/* + * Placed on the shared region, two per region + */ +typedef struct QHandle { + volatile uint32 head; ///< queue head offset + volatile uint32 tail; ///< queue tail offset + volatile uint32 phantom_head; ///< queue shadow head offset + volatile uint32 phantom_tail; ///< queue shadow tail offset + uint8 data[0]; ///< start of data, runs off + // the struct +} QHandle; + +/* + * Local to each peer + */ +typedef struct QPHandle { + QPId id; ///< shared memory region ID + uint32 capacity; ///< size of region in bytes + QHandle *produceQ; ///< producer queue + QHandle *consumeQ; ///< consumer queue + uint32 queueSize; ///< size of each queue in bytes + uint32 type; ///< type of queue pair + + /* + * Following fields unused by guest + */ + QPState state; + void (*peerDetachCB)(void* data); ///< detach notification callback + void *detachData; ///< data for the detach cb + struct page **pages; ///< page pointers for shared region +} QPHandle; + +/* + * QP Error codes + */ +#define QP_SUCCESS 0 +#define QP_ERROR_NO_MEM (-1) +#define QP_ERROR_INVALID_HANDLE (-2) +#define QP_ERROR_INVALID_ARGS (-3) +#define QP_ERROR_ALREADY_ATTACHED (-4) + +/* + * Hard-coded limits + */ +#define QP_MIN_CAPACITY (PAGE_SIZE * 2) +#define QP_MAX_CAPACITY (1024*1024) // 1M +#define QP_MAX_QUEUE_PAIRS 32 +#define QP_MAX_ID QP_MAX_QUEUE_PAIRS +#define QP_MAX_LISTENERS QP_MAX_QUEUE_PAIRS +#define QP_MAX_PAGES (QP_MAX_CAPACITY/PAGE_SIZE) // 256 pages + +#define QP_INVALID_ID 0xFFFFFFFF +#define QP_INVALID_SIZE 0xFFFFFFFF +#define QP_INVALID_REGION 0xFFFFFFFF +#define QP_INVALID_TYPE 0xFFFFFFFF + +#ifdef __KERNEL__ +/** + * @brief Utility function to 
sanity check arguments + * @param args argument structure to check + * @return true if arguments are sane, false otherwise + */ +static inline +_Bool QP_CheckArgs(QPInitArgs *args) /* capacity must be a power of two within [QP_MIN_CAPACITY, QP_MAX_CAPACITY]; resource id must be in range or the explicit QP_INVALID_ID wildcard */ +{ + if (!args || + !is_power_of_2(args->capacity) || + (args->capacity < QP_MIN_CAPACITY) || + (args->capacity > QP_MAX_CAPACITY) || + !(args->id.resource < QP_MAX_ID || args->id.resource == QP_INVALID_ID) || + (args->type == QP_INVALID_TYPE)) { + return false; + } else { + return true; + } +} +#endif /* __KERNEL__ (is_power_of_2 is a kernel helper) */ + + +/** + * @brief Utility function to sanity check a queue pair handle + * @param qp handle to the queue pair + * @return true if the handle is sane, false otherwise + */ +static inline +_Bool QP_CheckHandle(QPHandle *qp) +{ +#ifdef MVP_DEBUG + if (!(qp) || + !(qp->produceQ) || + !(qp->consumeQ) || + (qp->state >= (uint32)QP_STATE_MAX) || + !(qp->queueSize < (QP_MAX_CAPACITY/2))) { + return false; + } else { + return true; + } +#else + return true; /* validation compiled out in non-MVP_DEBUG builds */ +#endif +} + + +/** + * @brief Initializes an invalid handle + * @param[in, out] qp handle to the queue pair + */ +static inline void +QP_MakeInvalidQPHandle(QPHandle *qp) /* resets every field to its QP_INVALID_* sentinel and clears the detach callback; silently ignores a NULL qp */ +{ + if (!qp) { + return; + } + + qp->id.context = QP_INVALID_ID; + qp->id.resource = QP_INVALID_ID; + qp->capacity = QP_INVALID_SIZE; + qp->produceQ = NULL; + qp->consumeQ = NULL; + qp->queueSize = QP_INVALID_SIZE; + qp->type = QP_INVALID_TYPE; + qp->state = QP_STATE_FREE; + qp->peerDetachCB = NULL; + qp->detachData = NULL; +} + +/* + * Host only + */ +typedef int32 (*QPListener)(const QPInitArgs*); +int32 QP_RegisterListener(const QPListener); +int32 QP_UnregisterListener(const QPListener); +int32 QP_RegisterDetachCB(QPHandle *qp, void (*callback)(void*), void *data); + + +/* + * Host and guest specific implementations, see qp_host.c and qp_guest.c + */ +int32 QP_Attach(QPInitArgs *args, QPHandle** qp); +int32 QP_Detach(QPHandle* qp); +int32 QP_Notify(QPInitArgs *args); + +/* + * Common implementation, see qp_common.c + */ +int32 QP_EnqueueSpace(QPHandle *qp); +int32
QP_EnqueueSegment(QPHandle *qp, const void *buf, size_t length); +int32 QP_EnqueueCommit(QPHandle *qp); +int32 QP_EnqueueReset(QPHandle *qp); + +static inline int32 +QP_EnqueueAtomic(QPHandle *qp, const void *buf, size_t length) /* reset, enqueue one segment, commit only on success; returns QP_EnqueueSegment's result */ +{ + int32 rc; + QP_EnqueueReset(qp); + rc = QP_EnqueueSegment(qp, buf, length); + if (rc < 0) { + return rc; + } else { + QP_EnqueueCommit(qp); + } + return rc; +} + +int32 QP_DequeueSpace(QPHandle *qp); +int32 QP_DequeueSegment(QPHandle *qp, const void *buf, size_t length); /* NOTE(review): dequeue copies OUT of the queue INTO buf, so this const qualifier looks wrong (contrast QP_EnqueueSegment) — confirm against the definition in qp_common.c */ +int32 QP_DequeueReset(QPHandle *qp); +int32 QP_DequeueCommit(QPHandle *qp); + +static inline int32 +QP_DequeueAtomic(QPHandle *qp, const void *buf, size_t length) /* NOTE(review): same const-on-output-buffer concern as QP_DequeueSegment; otherwise mirrors QP_EnqueueAtomic: reset, dequeue one segment, commit only on success */ +{ + int32 rc; + QP_DequeueReset(qp); + rc = QP_DequeueSegment(qp, buf, length); + if (rc < 0) { + return rc; + } else { + QP_DequeueCommit(qp); + } + return rc; +} + +/* + * HVC methods and signatures + */ +#define MVP_QP_SIGNATURE 0x53525051 ///< 'QPRS' +#define MVP_QP_ATTACH (MVP_OBJECT_CUSTOM_BASE + 0) ///< attach to a queue pair +#define MVP_QP_DETACH (MVP_OBJECT_CUSTOM_BASE + 1) ///< detach from a queue pair +#define MVP_QP_NOTIFY (MVP_OBJECT_CUSTOM_BASE + 2) ///< notify host of attach +#define MVP_QP_LAST (MVP_OBJECT_CUSTOM_BASE + 3) ///< Number of methods + +/* + * Debug macros + */ +#ifdef QP_DEBUG + #ifdef IN_MONITOR + #define QP_DBG(...) Log(__VA_ARGS__) + #else + #define QP_DBG(...) printk(KERN_INFO __VA_ARGS__) + #endif +#else + #define QP_DBG(...) /* expands to nothing when QP_DEBUG is unset */ +#endif + +#endif diff --git a/arch/arm/mvp/commkm/utils.h b/arch/arm/mvp/commkm/utils.h new file mode 100644 index 0000000..b5f1e18 --- /dev/null +++ b/arch/arm/mvp/commkm/utils.h @@ -0,0 +1,172 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief General architecture-independent definitions, typedefs, and macros. + */ + +#ifndef _UTILS_H +#define _UTILS_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define MAX_FILENAME 128 + +// Round address up to given size boundary +// Note: ALIGN() conflicts with Linux + +#define MVP_ALIGN(_v, _n) (((_v) + (_n) - 1) & -(_n)) + +#define ALIGNVA(_addr, _size) MVP_ALIGN(_addr, _size) + +#define alignof(t) offsetof(struct { char c; typeof(t) x; }, x) + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) +#define MAX(x,y) ((x) > (y) ? 
(x) : (y)) + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#define KB(_X_) ((_X_)*1024U) +#define MB(_X_) (KB(_X_)*1024) +#define GB(_X_) (MB(_X_)*1024) + +#define NELEM(x) (sizeof(x)/sizeof((x)[0])) + +/* + * x in [low,high) + * args evaluated once + */ +#define RANGE(x,low,high) \ + ({ \ + typeof(x) _x = (x); \ + typeof(x) _low = (typeof(x))(low); \ + typeof(x) _high =(typeof(x))(high); \ + (_Bool)( (_low <= _x) && (_x < _high)); \ + }) + +#define OBJECTS_PER_PAGE(_type) (PAGE_SIZE / sizeof(_type)) + +#define MA_2_MPN(_ma) ((MPN)((_ma) / PAGE_SIZE)) +#define MPN_2_MA(_mpn) ((MA)((_mpn) * PAGE_SIZE)) + +#define VA_2_VPN(_va) ((_va) / PAGE_SIZE) +#define VPN_2_vA(_vpn) ((_vpn) * PAGE_SIZE) + +/* + * The following convenience macro can be used in a following situation + * + * send(..., &foo, sizeof(foo)) --> send(..., PTR_N_SIZE(foo)) + */ + +#define PTR_N_SIZE(_var) &(_var), sizeof(_var) + + +/* + * + * BIT-PULLING macros + * + */ +#define MVP_BIT(val,n) ( ((val)>>(n))&1) +#define MVP_BITS(val,m,n) (((val)<<(31-(n))) >> ((31-(n))+(m)) ) +#define MVP_EXTRACT_FIELD(w, m, n) MVP_BITS((w), (m), ((m) + (n) - 1)) +#define MVP_MASK(m, n) (MVP_EXTRACT_FIELD(~(uint32)0U, (m), (n)) << (m)) +#define MVP_UPDATE_FIELD(old_val, field_val, m, n) \ + (((old_val) & ~MVP_MASK((m), (n))) | (MVP_EXTRACT_FIELD((field_val), 0, (n)) << (m))) + +/* + * + * 64BIT-PULLING macros + * + */ +#define MVP_BITS64(val,m,n) (((val)<<(63-(n))) >> ((63-(n))+(m)) ) +#define MVP_EXTRACT_FIELD64(w, m, n) MVP_BITS64((w), (m), ((m) + (n) - 1)) +#define MVP_MASK64(m, n) (MVP_EXTRACT_FIELD64(~(uint64)0ULL, (m), (n)) << (m)) +#define MVP_UPDATE_FIELD64(old_val, field_val, m, n) \ + (((old_val) & ~MVP_MASK64((m), (n))) | (MVP_EXTRACT_FIELD64(((uint64)(field_val)), 0ULL, (n)) << (m))) + +/* + * + * BIT-CHANGING macros + * + */ +#define MVP_SETBIT(val,n) ((val)|=(1<<(n))) +#define MVP_CLRBIT(val,n) ((val)&=(~(1<<(n)))) + +/* + * Fixed bit-width sign extension. 
+ */ +#define MVP_SIGN_EXTEND(val,width) \ + (((val) ^ (1 << ((width) - 1))) - (1 << ((width) - 1))) + + +/* + * Assembler helpers. + */ +#define _MVP_HASH # +#define MVP_HASH() _MVP_HASH + +#define _MVP_STRINGIFY(...) #__VA_ARGS__ +#define MVP_STRINGIFY(...) _MVP_STRINGIFY(__VA_ARGS__) + +#ifndef __ASSEMBLER__ + +#include <stdbool.h> /* restored: header name lost in extraction; true/false used below */ +#include <stddef.h> /* restored: header name lost in extraction — verify against original source */ + +/* + * Constant equivalents of build-flags. + * + * Test these when possible instead of using #ifdef so that your code + * gets parsed. + */ +#ifdef MVP_DEBUG +static const _Bool mvpDebug = true; +#else +static const _Bool mvpDebug = false; +#endif + +#ifdef MVP_STATS +static const _Bool mvpStats = true; +#else +static const _Bool mvpStats = false; +#endif + +#ifdef MVP_DEVEL +static const _Bool mvpDevel = true; +#else +static const _Bool mvpDevel = false; +#endif + +#endif + +#endif diff --git a/arch/arm/mvp/commkm/vmid.h b/arch/arm/mvp/commkm/vmid.h new file mode 100644 index 0000000..f24a650 --- /dev/null +++ b/arch/arm/mvp/commkm/vmid.h @@ -0,0 +1,44 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Guest Communications + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */ +#line 5 + +#ifndef _VMID_H +#define _VMID_H + +/** + * @file + * + * @brief The vmid definition + */ + + + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define VMID_UNDEF (uint16)0xffff +typedef uint16 VmId; + +#endif diff --git a/arch/arm/mvp/mvpkm/COPYING b/arch/arm/mvp/mvpkm/COPYING new file mode 100644 index 0000000..10828e0 --- /dev/null +++ b/arch/arm/mvp/mvpkm/COPYING @@ -0,0 +1,341 @@ + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/arch/arm/mvp/mvpkm/Kbuild b/arch/arm/mvp/mvpkm/Kbuild new file mode 100644 index 0000000..fc2fe96 --- /dev/null +++ b/arch/arm/mvp/mvpkm/Kbuild @@ -0,0 +1,24 @@ +# Warning: autogenerated +obj-m := mvpkm.o +mvpkm-objs := check_kconfig.o cpufreq_kernel.o mksck_kernel.o montimer_kernel.o mutex_kernel.o mvpkm_comm_ev.o mvpkm_main.o qp_host_kernel.o qp_common.o mksck_shared.o vfp_switch.o + +ccflags-y += -fno-pic -fno-dwarf2-cfi-asm -march=armv7-a -D__linux__ +ccflags-y += -mfpu=neon -DLIB_ARM_VERSION=7 -DIN_MODULE -DGPLED_CODE +ccflags-y += --std=gnu89 -O2 -g2 -ggdb -mapcs -fno-optimize-sibling-calls -mno-sched-prolog +ccflags-$(CONFIG_VMWARE_MVP_DEBUG) += -DMVP_DEBUG + +asflags-y += -mfpu=neon -DLIB_ARM_VERSION=7 -DIN_MODULE -DGPLED_CODE +asflags-y += -mfloat-abi=softfp + +LOWMEMKILLER_PATH := $(srctree)/drivers/staging/android/lowmemorykiller.c +ifeq ($(wildcard $(LOWMEMKILLER_PATH)),) +$(error "Unable to find lowmemorykiller.c at $(LOWMEMKILLER_PATH)") +endif +LOWMEMKILLER_MD5 := $(shell md5sum $(LOWMEMKILLER_PATH) | cut -f1 -d\ ) +LOWMEMKILLER_SUPPORT := $(srctree)/arch/arm/mvp/mvpkm/lowmemkiller_variant.sh +LOWMEMKILLER_SHRINK_MD5 := $(shell $(SHELL) $(LOWMEMKILLER_SUPPORT) $(LOWMEMKILLER_PATH) | cut -f1 -d\ ) +LOWMEMKILLER_VARIANT := $(shell $(SHELL) $(LOWMEMKILLER_SUPPORT) $(LOWMEMKILLER_PATH) | cut -f2 -d\ ) +ccflags-y += \ + -DLOWMEMKILLER_VARIANT=$(LOWMEMKILLER_VARIANT) \ + -DLOWMEMKILLER_SHRINK_MD5=$(LOWMEMKILLER_SHRINK_MD5) \ + -DLOWMEMKILLER_MD5=$(LOWMEMKILLER_MD5) diff --git a/arch/arm/mvp/mvpkm/Makefile b/arch/arm/mvp/mvpkm/Makefile new file mode 100644 index 0000000..16eb389 --- /dev/null +++ b/arch/arm/mvp/mvpkm/Makefile @@ -0,0 +1 @@ +# Warning: autogenerated diff --git a/arch/arm/mvp/mvpkm/actions.h b/arch/arm/mvp/mvpkm/actions.h new file mode 100644 index 0000000..0e89892 --- /dev/null +++ b/arch/arm/mvp/mvpkm/actions.h @@ -0,0 +1,57 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 
2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Bit definitions for instrBActions. + */ + +#ifndef _ACTIONS_H +#define _ACTIONS_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define L2_ACTION_GDB 0 ///< drop into guest debugger GDB +#define L2_ACTION_MKSCK 1 ///< scan the mksck pipes for incoming messages +#define L2_ACTION_ABORT 2 ///< abort the monitor cleanly +#define L2_ACTION_HALT 3 ///< halt the monitor +#define L2_ACTION_FIQ 6 ///< the VCPU's FIQ pin is active +#define L2_ACTION_IRQ 7 ///< the VCPU's IRQ pin is active +#define L2_ACTION_CKPT 8 ///< do a checkpoint +#define L2_ACTION_WFI 9 ///< wait for interrupt +#define L2_ACTION_TIMER 10 ///< timer event +#define L2_ACTION_BALLOON 11 ///< balloon trigger + +#define ACTION_GDB (1 << L2_ACTION_GDB) +#define ACTION_MKSCK (1 << L2_ACTION_MKSCK) +#define ACTION_ABORT (1 << L2_ACTION_ABORT) +#define ACTION_HALT (1 << L2_ACTION_HALT) +#define ACTION_IRQ (1 << L2_ACTION_IRQ) +#define ACTION_FIQ (1 << L2_ACTION_FIQ) +#define ACTION_CKPT (1 << L2_ACTION_CKPT) +#define ACTION_WFI (1 << L2_ACTION_WFI) +#define ACTION_TIMER (1 << L2_ACTION_TIMER) +#define ACTION_BALLOON (1 << L2_ACTION_BALLOON) + +#endif diff --git 
a/arch/arm/mvp/mvpkm/arm_as_macros.h b/arch/arm/mvp/mvpkm/arm_as_macros.h new file mode 100644 index 0000000..5a0b7fc --- /dev/null +++ b/arch/arm/mvp/mvpkm/arm_as_macros.h @@ -0,0 +1,91 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Macro definitions meta-ops to be used in assembler files + * + * This header contains asm macro definitions to be used in asm + * files only. 
This is intended to be the equivalent of arm_gcc_inline.h + */ + +#ifndef _ARM_AS_MACROS_H_ +#define _ARM_AS_MACROS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "coproc_defs.h" + +/** + * @name The following macros re-arrange the order of the mcr/mrc operands + * making it suitable to be used with the macros defined in coproc_defs.h + * + * @par For example + * mcr_p15 DOMAIN_CONTROL, r3 + * @par replaces + * mcr p15, 0, r3, c3, c0, 0 + * @{ + */ +.macro mcr_p15 op1, op2, op3, op4, reg, cond=al + mcr\cond p15, \op1, \reg, \op2, \op3, \op4 +.endm + +.macro mrc_p15 op1, op2, op3, op4, reg, cond=al + mrc\cond p15, \op1, \reg, \op2, \op3, \op4 +.endm + +.macro mcrr_p15 op1, op2, reg1, reg2 + mcrr p15, \op1, \reg1, \reg2, \op2 +.endm + +.macro mrrc_p15 op1, op2, reg1, reg2 + mrrc p15, \op1, \reg1, \reg2, \op2 +.endm +/*@}*/ + +/** + * @name Our toolchain does not include support for the VE instructions yet. + * @{ + */ +.macro hvc imm16 + .word ARM_INSTR_HVC_A1_ENC(\imm16) +.endm + +.macro eret + .word ARM_INSTR_ERET_A1_ENC(ARM_INSTR_COND_AL) +.endm + +.macro msr_ext rm, rn + .word ARM_INSTR_MSR_EXT_A1_ENC(ARM_INSTR_COND_AL, \rm, \rn) +.endm + +.macro mrs_ext rd, rm + .word ARM_INSTR_MRS_EXT_A1_ENC(ARM_INSTR_COND_AL, \rd, \rm) +.endm +/*@}*/ + +#endif /// ifndef _ARM_AS_MACROS_H_ diff --git a/arch/arm/mvp/mvpkm/arm_defs.h b/arch/arm/mvp/mvpkm/arm_defs.h new file mode 100644 index 0000000..2c39f6a --- /dev/null +++ b/arch/arm/mvp/mvpkm/arm_defs.h @@ -0,0 +1,54 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Umbrella header file for all ARM-related definitions. By + * including this you gain access to all such definitions in + * lib/arm and are guaranteed a stable include. + */ + +#ifndef _ARM_DEFS_H_ +#define _ARM_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define ARM_V4 4 +#define ARM_V5 5 +#define ARM_V6 6 +#define ARM_V7 7 +#define ARM_V8 8 + +#include "coproc_defs.h" +#include "exc_defs.h" +#include "instr_defs.h" +#include "mmu_defs.h" +#include "lpae_defs.h" +#include "ve_defs.h" +#include "psr_defs.h" + +#endif /// _ARM_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/arm_gcc_inline.h b/arch/arm/mvp/mvpkm/arm_gcc_inline.h new file mode 100644 index 0000000..33ffe69 --- /dev/null +++ b/arch/arm/mvp/mvpkm/arm_gcc_inline.h @@ -0,0 +1,206 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief GCC inline stubs for ARM assembler instructions. + */ + +#ifndef _ARM_GCC_INLINE_H_ +#define _ARM_GCC_INLINE_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "coproc_defs.h" + +/* + * Macros for accessing CP10. + */ +#define _ARM_CP10_MRCMCR_STR(_op1,_cr1,_cr2,_op2,_var) \ + " p10, " #_op1 ","#_var"," #_cr1 "," #_cr2 "," #_op2 "\n\t" + +#define _ARM_MRC_CP10(_op1,_cr1,_cr2,_op2,_var) \ + asm volatile ("mrc" _ARM_CP10_MRCMCR_STR(_op1,_cr1,_cr2,_op2,%0) \ + : "=r" (_var) ) + +#define ARM_MRC_CP10(_cp_reg,_var) _ARM_MRC_CP10(_cp_reg,_var) + +#define _ARM_MCR_CP10(_op1,_cr1,_cr2,_op2,_val) \ + asm volatile ("mcr" _ARM_CP10_MRCMCR_STR(_op1,_cr1,_cr2,_op2,%0) \ + : \ + : "r" (_val) ) + +#define ARM_MCR_CP10(_cp_reg,_val) _ARM_MCR_CP10(_cp_reg,_val) + + +/* + * Macros for accessing CP15. 
+ */ +#define _ARM_CP15_MRCMCR_STR(_op1,_cr1,_cr2,_op2,_var) \ + " p15, " #_op1 ","#_var"," #_cr1 "," #_cr2 "," #_op2 "\n\t" + +#define ARM_CP15_MRCMCR_STR(_cp_reg,_var) _ARM_CP15_MRCMCR_STR(_cp_reg,_var) + +#ifdef __COVERITY__ +static uint32 __cp15; +#define _ARM_MRC_CP15(_op1,_cr1,_cr2,_op2,_var) \ + (_var) = (uint32)__cp15 +#else +#define _ARM_MRC_CP15(_op1,_cr1,_cr2,_op2,_var) \ + asm volatile ("mrc" _ARM_CP15_MRCMCR_STR(_op1,_cr1,_cr2,_op2,%0) \ + : "=r" (_var) \ + : \ + : "memory") +#endif + +#define ARM_MRC_CP15(_cp_reg,_var) _ARM_MRC_CP15(_cp_reg,_var) + + +#ifdef __COVERITY__ +#define _ARM_MCR_CP15(_op1,_cr1,_cr2,_op2,_val) \ + __cp15 = (_val) +#else +#define _ARM_MCR_CP15(_op1,_cr1,_cr2,_op2,_val) \ + asm volatile ("mcr" _ARM_CP15_MRCMCR_STR(_op1,_cr1,_cr2,_op2,%0) \ + : \ + : "r" (_val)\ + : "memory") +#endif + +#define ARM_MCR_CP15(_cp_reg,_val) _ARM_MCR_CP15(_cp_reg,_val) + +#define _ARM_MRRC_CP15(_op,_cr,_val1,_val2) \ + asm volatile ("mrrc p15, " #_op ",%0,%1," #_cr "\n\t" \ + : "=r" (_val1), "=r" (_val2) \ + : \ + : "memory") + +#define ARM_MRRC_CP15(_cp_reg,_val1,_val2) _ARM_MRRC_CP15(_cp_reg,_val1,_val2) + +#define ARM_MRRC64_CP15(_cp_reg,_val) \ + _ARM_MRRC_CP15(_cp_reg,_val,*((uint8 *)&(_val) + 4)) + +#define _ARM_MCRR_CP15(_op,_cr,_val1,_val2) \ + asm volatile ("mcrr p15, " #_op ",%0,%1," #_cr "\n\t" \ + : \ + : "r" (_val1), "r" (_val2) \ + : "memory") + +#define ARM_MCRR_CP15(_cp_reg,_val1,_val2) _ARM_MCRR_CP15(_cp_reg,_val1,_val2) + +#define ARM_MCRR64_CP15(_cp_reg,_val) \ + _ARM_MCRR_CP15(_cp_reg,_val,*((uint8 *)&(_val) + 4)) + +#define DMB() asm volatile ("dmb" : : : "memory") +#define DSB() asm volatile ("dsb" : : : "memory") +#define ISB() asm volatile ("isb" : : : "memory") + +/** + * @name 64-bit multiplies + * @{ + */ + +// rdhi:rdlo = rm * rs + rdhi + rdlo +#define ARM_UMAAL(rdlo,rdhi,rm,rs) asm ("umaal %0,%1,%2,%3" \ + : "+r" (rdlo), "+r" (rdhi) \ + : "r" (rm), "r" (rs)) + +// rdhi:rdlo += rm * rs +#define ARM_UMLAL(rdlo,rdhi,rm,rs) 
asm ("umlal %0,%1,%2,%3" \ + : "+r" (rdlo), "+r" (rdhi) \ + : "r" (rm), "r" (rs)) + +// rdhi:rdlo = rm * rs +#define ARM_UMULL(rdlo,rdhi,rm,rs) asm ("umull %0,%1,%2,%3" \ + : "=r" (rdlo), "=r" (rdhi) \ + : "r" (rm), "r" (rs)) +/*@}*/ + +/** + * @brief Disable interrupts (IRQ + FIQ) + * + * @return CPSR status prior to disabling - suitable for passing to + * ARM_RestoreInterrupts() to restore IRQ/FIQ levels to + * pre-call values + */ +static inline uint32 +ARM_DisableInterrupts(void) +{ + register uint32 status; + + asm volatile ("mrs %0, cpsr \n\t" + "orr r1, %0, %1 \n\t" + "msr cpsr_c, r1 \n\t" + : "=&r" (status) + : "i" (ARM_PSR_I | ARM_PSR_F) + : "r1", "memory"); + + return status; +} + +/** + * @brief Restore interrupts + * + * @param status return value from a previous call to ARM_DisableInterrupts() + */ +static inline void +ARM_RestoreInterrupts(uint32 status) +{ + asm volatile ("msr cpsr_c, %0 \n\t" : : "r" (status) : "memory"); +} + +/** + * @brief Read current CPSR value + * + * @return current CPSR value + */ +static inline uint32 +ARM_ReadCPSR(void) +{ + uint32 status; + + asm volatile ("mrs %0, cpsr \n\t" : "=r" (status)); + + return status; +} + +/** + * @brief Read current stack pointer + * + * @return stack pointer value + */ +static inline uint32 +ARM_ReadSP(void) +{ + uint32 sp; + + asm volatile ("mov %0, sp \n\t" : "=r" (sp)); + + return sp; +} + +#endif /// ifndef _ARM_GCC_INLINE_H_ diff --git a/arch/arm/mvp/mvpkm/arm_inline.h b/arch/arm/mvp/mvpkm/arm_inline.h new file mode 100644 index 0000000..3689a7f --- /dev/null +++ b/arch/arm/mvp/mvpkm/arm_inline.h @@ -0,0 +1,179 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Inline stubs for ARM assembler instructions. + */ + +#ifndef _ARM_INLINE_H_ +#define _ARM_INLINE_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "arm_types.h" +#include "arm_defs.h" + +/* + * Compiler specific include - we get the actual inline assembler macros here. + */ +#include "arm_gcc_inline.h" + +/* + * Some non-compiler specific helper functions for inline assembler macros + * included above. + */ + +/** + * @brief Predicate giving whether interrupts are currently enabled + * + * @return TRUE if enabled, FALSE otherwise + */ +static inline _Bool +ARM_InterruptsEnabled(void) +{ + return !(ARM_ReadCPSR() & ARM_PSR_I); +} + +/** + * @brief Read current TTBR0 base machine address + * + * @return machine address given by translation table base register 0 + */ +static inline MA +ARM_ReadTTBase0(void) +{ + MA ttbase; + + ARM_MRC_CP15(TTBASE0_POINTER, ttbase); + + return ttbase & ARM_CP15_TTBASE_MASK; +} + +/** + * @brief Read VFP/Adv.SIMD Extension System Register + * + * @param specReg which VFP/Adv. SIMD Extension System Register + * + * @return Read value + */ +static inline uint32 +ARM_ReadVFPSystemRegister(uint8 specReg) +{ + uint32 value = 0; + + /* + * VMRS is the instruction used to read VFP System Registers. 
+ * VMRS is the new UAL-syntax equivalent for the FMRX instruction. + * At the end of the day, all these are just synonyms for MRC + * instructions on CP10, as the VFP system registers sit in CP10 + * and MRC is the Co-processor register read instruction. + * We use the primitive MRC synonym for VMRS here as VMRS/FMRX + * don't seem to be working when used inside asm volatile blocks, + * as, for some reason, the inline assembler seems to be setting + * the VFP mode to soft-float. Moreover, we WANT the monitor code + * to be compiled with soft-float so that the compiler doesn't use + * VFP instructions for the monitor's own use, such as for 64-bit + * integer operations, etc., since we pass-through the use of the + * underlying hardware's VFP/SIMD state to the guest. + */ + + switch (specReg) { + case ARM_VFP_SYSTEM_REG_FPSID: + ARM_MRC_CP10(VFP_FPSID, value); + break; + case ARM_VFP_SYSTEM_REG_MVFR0: + ARM_MRC_CP10(VFP_MVFR0, value); + break; + case ARM_VFP_SYSTEM_REG_MVFR1: + ARM_MRC_CP10(VFP_MVFR1, value); + break; + case ARM_VFP_SYSTEM_REG_FPEXC: + ARM_MRC_CP10(VFP_FPEXC, value); + break; + case ARM_VFP_SYSTEM_REG_FPSCR: + ARM_MRC_CP10(VFP_FPSCR, value); + break; + case ARM_VFP_SYSTEM_REG_FPINST: + ARM_MRC_CP10(VFP_FPINST, value); + break; + case ARM_VFP_SYSTEM_REG_FPINST2: + ARM_MRC_CP10(VFP_FPINST2, value); + break; + default: + NOT_IMPLEMENTED_JIRA(1849); + break; + } + + return value; +} + +/** + * @brief Write to VFP/Adv.SIMD Extension System Register + * + * @param specReg which VFP/Adv. SIMD Extension System Register + * @param value desired value to be written to the System Register + */ +static inline void +ARM_WriteVFPSystemRegister(uint8 specReg, uint32 value) +{ + /* + * VMSR is the instruction used to write to VFP System Registers. + * VMSR is the new UAL-syntax equivalent for the FMXR instruction. 
+ * At the end of the day, all these are just synonyms for MCR + * instructions on CP10, as the VFP system registers sit in CP10 + * and MCR is the Co-processor register write instruction. + * We use the primitive MCR synonym for VMSR here as VMSR/FMXR + * don't seem to be working when used inside asm volatile blocks, + * as, for some reason, the inline assembler seems to be setting + * the VFP mode to soft-float. Moreover, we WANT the monitor code + * to be compiled with soft-float so that the compiler doesn't use + * VFP instructions for the monitor's own use, such as for 64-bit + * integer operations, etc., since we pass-through the use of the + * underlying hardware's VFP/SIMD state to the guest. + */ + + switch (specReg) { + case ARM_VFP_SYSTEM_REG_FPEXC: + ARM_MCR_CP10(VFP_FPEXC, value); + break; + case ARM_VFP_SYSTEM_REG_FPSCR: + ARM_MCR_CP10(VFP_FPSCR, value); + break; + case ARM_VFP_SYSTEM_REG_FPINST: + ARM_MCR_CP10(VFP_FPINST, value); + break; + case ARM_VFP_SYSTEM_REG_FPINST2: + ARM_MCR_CP10(VFP_FPINST2, value); + break; + default: + NOT_IMPLEMENTED_JIRA(1849); + break; + } +} + +#endif /// ifndef _ARM_INLINE_H_ diff --git a/arch/arm/mvp/mvpkm/arm_types.h b/arch/arm/mvp/mvpkm/arm_types.h new file mode 100644 index 0000000..2075860 --- /dev/null +++ b/arch/arm/mvp/mvpkm/arm_types.h @@ -0,0 +1,42 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Umbrella header file for all ARM-related types. + */ + +#ifndef _ARM_TYPES_H_ +#define _ARM_TYPES_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "exc_types.h" +#include "mmu_types.h" +#include "lpae_types.h" + +#endif /// _ARM_TYPES_H_ diff --git a/arch/arm/mvp/mvpkm/atomic.h b/arch/arm/mvp/mvpkm/atomic.h new file mode 100644 index 0000000..987860f --- /dev/null +++ b/arch/arm/mvp/mvpkm/atomic.h @@ -0,0 +1,88 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief bus-atomic operators. + * + * The 'atm' argument is the atomic memory cell being operated on and the + * remainder of the arguments are the values being applied to the atomic cell + * which is assumed to be located in shared normal memory. 
The operation is + * both atomic and visible to the default share-ability domain upon completion. + * + * The design of each macro is such that the compiler should check types + * correctly. For those macros that return a value, the return type should be + * the same as the 'atm' argument (with the exception of ATOMIC_SETIF which + * returns an int value of 0 or 1). + * + * Those names ending in 'M' return the modified value of 'atm'. + * Those names ending in 'O' return the original value of 'atm'. + * Those names ending in 'V' return void (ie, nothing). + */ + +#ifndef _ATOMIC_H +#define _ATOMIC_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#include "include_check.h" + +/* + * Wrap type 't' in an atomic struct. + * Eg, 'static ATOMIC(uint8) counter;'. + * + * The function macros use the atm_Normal member to clone the atom's type + * when the volatile semantic is not required. They use the atm_Volatl member + * when the volatile semantic is required. + */ +#define ATOMIC(t) union { t atm_Normal; t volatile atm_Volatl; } + +/* + * Static atomic variable initialization. + * Eg, 'static ATOMIC(uint8) counter = ATOMIC_INI(35);'. + */ +#define ATOMIC_INI(v) { .atm_Normal = v } + +/* + * Some commonly used atomic types. + */ +typedef ATOMIC(int32) AtmSInt32 __attribute__ ((aligned (4))); +typedef ATOMIC(uint32) AtmUInt32 __attribute__ ((aligned (4))); +typedef ATOMIC(uint64) AtmUInt64 __attribute__ ((aligned (8))); + +/* + * Architecture-dependent implementations. 
+ */ +#if defined(__COVERITY__) +#include "atomic_coverity.h" +#elif defined(__arm__) +#include "atomic_arm.h" +#elif defined(__i386) || defined(__x86_64) +#include "atomic_x86.h" +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/atomic_arm.h b/arch/arm/mvp/mvpkm/atomic_arm.h new file mode 100644 index 0000000..447aa55 --- /dev/null +++ b/arch/arm/mvp/mvpkm/atomic_arm.h @@ -0,0 +1,329 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief bus-atomic operators, ARM implementation. + * Do not include directly, include 'atomic.h' instead. + * Memory where the atomic reside must be shared. + * + * These operations assume that the exclusive access monitor is cleared during + * abort entry but they do not assume that cooperative scheduling (e.g. Linux + * schedule()) clears the monitor and hence the use of "clrex" when required. 
+ */ + +#ifndef _ATOMIC_ARM_H +#define _ATOMIC_ARM_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#include "include_check.h" + +#include "mvp_assert.h" + +/** + * @brief Atomic Add + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return the original value of 'atm' + */ +#define ATOMIC_ADDO(atm,modval) ATOMIC_OPO_PRIVATE(atm,modval,add) + +/** + * @brief Atomic Add + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return nothing + */ +#define ATOMIC_ADDV(atm,modval) ATOMIC_OPV_PRIVATE(atm,modval,add) + +/** + * @brief Atomic And + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return the original value of 'atm' + */ +#define ATOMIC_ANDO(atm,modval) ATOMIC_OPO_PRIVATE(atm,modval,and) + +/** + * @brief Atomic And + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return nothing + */ +#define ATOMIC_ANDV(atm,modval) ATOMIC_OPV_PRIVATE(atm,modval,and) + +/** + * @brief Retrieve an atomic value + * @param atm atomic cell to operate on + * @return the value of 'atm' + */ +#define ATOMIC_GETO(atm) ({ \ + typeof((atm).atm_Normal) _oldval; \ + switch (sizeof _oldval) { \ + case 4: \ + asm volatile ("ldrex %0, [%1]\n" \ + "clrex" \ + : "=&r" (_oldval) \ + : "r" (&((atm).atm_Volatl))); \ + break; \ + case 8: \ + asm volatile ("ldrexd %0, %H0, [%1]\n" \ + "clrex" \ + : "=&r" (_oldval) \ + : "r" (&((atm).atm_Volatl))); \ + break; \ + default: \ + FATAL(); \ + } \ + _oldval; \ +}) + +/** + * @brief Atomic Or + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return the original value of 'atm' + */ +#define ATOMIC_ORO(atm,modval) ATOMIC_OPO_PRIVATE(atm,modval,orr) + +/** + * 
@brief Atomic Or + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return nothing + */ +#define ATOMIC_ORV(atm,modval) ATOMIC_OPV_PRIVATE(atm,modval,orr) + +/** + * @brief Atomic Conditional Write, ie, + * set 'atm' to 'newval' iff it was 'oldval'. + * @param atm atomic cell to operate on + * @param newval value to possibly write to atomic cell + * @param oldval value that atomic cell must equal + * @return 0 if failed; 1 if successful + */ +#define ATOMIC_SETIF(atm,newval,oldval) ({ \ + int _failed; \ + typeof((atm).atm_Normal) _newval = newval; \ + typeof((atm).atm_Normal) _oldval = oldval; \ + ASSERT_ON_COMPILE(sizeof _newval == 4); \ + asm volatile ("1: ldrex %0, [%1] \n" \ + " cmp %0, %2 \n" \ + " mov %0, #2 \n" \ + " IT eq \n" \ + " strexeq %0, %3, [%1] \n" \ + " cmp %0, #1 \n" \ + " beq 1b \n" \ + " clrex" \ + : "=&r" (_failed) \ + : "r" (&((atm).atm_Volatl)), \ + "r" (_oldval), \ + "r" (_newval) \ + : "cc", "memory"); \ + !_failed; \ +}) + + +/** + * @brief Atomic Write (unconditional) + * @param atm atomic cell to operate on + * @param newval value to write to atomic cell + * @return the original value of 'atm' + */ +#define ATOMIC_SETO(atm,newval) ({ \ + int _failed; \ + typeof((atm).atm_Normal) _newval = newval; \ + typeof((atm).atm_Normal) _oldval; \ + switch (sizeof _newval) { \ + case 4: \ + asm volatile ("1: ldrex %0, [%2]\n" \ + " strex %1, %3, [%2]\n" \ + " teq %1, #0\n" \ + " bne 1b" \ + : "=&r" (_oldval), \ + "=&r" (_failed) \ + : "r" (&((atm).atm_Volatl)), \ + "r" (_newval) \ + : "cc", "memory"); \ + break; \ + case 8: \ + asm volatile ("1: ldrexd %0, %H0, [%2]\n" \ + " strexd %1, %3, %H3, [%2]\n"\ + " teq %1, #0\n" \ + " bne 1b" \ + : "=&r" (_oldval), \ + "=&r" (_failed) \ + : "r" (&((atm).atm_Volatl)), \ + "r" (_newval) \ + : "cc", "memory"); \ + break; \ + default: \ + FATAL(); \ + } \ + _oldval; \ +}) + +/** + * @brief Atomic Write (unconditional) + * @param atm atomic cell to operate on + * @param 
newval value to write to atomic cell + * @return nothing + */ +#define ATOMIC_SETV(atm,newval) do { ATOMIC_SETO((atm),(newval)); } while (0) + +/** + * @brief Atomic Subtract + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return the original value of 'atm' + */ +#define ATOMIC_SUBO(atm,modval) ATOMIC_OPO_PRIVATE(atm,modval,sub) + +/** + * @brief Atomic Subtract + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @return nothing + */ +#define ATOMIC_SUBV(atm,modval) ATOMIC_OPV_PRIVATE(atm,modval,sub) + +/** + * @brief Atomic Generic Binary Operation + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @param op ARM instruction (add, and, orr, etc) + * @return the original value of 'atm' + */ +#define ATOMIC_OPO_PRIVATE(atm,modval,op) ({ \ + int _failed; \ + typeof((atm).atm_Normal) _modval = modval; \ + typeof((atm).atm_Normal) _oldval; \ + typeof((atm).atm_Normal) _newval; \ + ASSERT_ON_COMPILE(sizeof _modval == 4); \ + asm volatile ("1: ldrex %0, [%3]\n" \ + #op " %1, %0, %4\n" \ + " strex %2, %1, [%3]\n" \ + " teq %2, #0\n" \ + " bne 1b" \ + : "=&r" (_oldval), \ + "=&r" (_newval), \ + "=&r" (_failed) \ + : "r" (&((atm).atm_Volatl)), \ + "r" (_modval) \ + : "memory"); \ + _oldval; \ +}) + +/** + * @brief Atomic Generic Binary Operation + * @param atm atomic cell to operate on + * @param modval value to apply to atomic cell + * @param op ARM instruction (add, and, orr, etc) + * @return nothing + */ +#define ATOMIC_OPV_PRIVATE(atm,modval,op) do { \ + int _failed; \ + typeof((atm).atm_Normal) _modval = modval; \ + typeof((atm).atm_Normal) _sample; \ + ASSERT_ON_COMPILE(sizeof _modval == 4); \ + asm volatile ("1: ldrex %0, [%2]\n" \ + #op " %0, %3\n" \ + " strex %1, %0, [%2]\n" \ + " teq %1, #0\n" \ + " bne 1b" \ + : "=&r" (_sample), \ + "=&r" (_failed) \ + : "r" (&((atm).atm_Volatl)), \ + "r" (_modval) \ + : "memory"); \ +} while (0) + +/** 
+ * @brief Single-copy atomic word write. + * + * ARMv7 defines world-aligned word writes to be single-copy atomic. See + * A3-26 ARM DDI 0406A. + * + * @param p word aligned location to write to + * @param val word-sized value to write to p + */ +#define ATOMIC_SINGLE_COPY_WRITE32(p,val) \ + do { \ + ASSERT(sizeof(val) == 4); \ + ASSERT((MVA)(p) % sizeof(val) == 0); \ + asm volatile("str %0, [%1]" \ + : \ + : "r" (val), "r" (p) \ + : "memory"); \ + } while (0); + + +/** + * @brief Single-copy atomic word read. + * + * ARMv7 defines world-aligned word reads to be single-copy atomic. See + * A3-26 ARM DDI 0406A. + * + * @param p word aligned location to read from + * + * @return word-sized value from p + */ +#define ATOMIC_SINGLE_COPY_READ32(p) ({ \ + ASSERT((MVA)(p) % sizeof(uint32) == 0); \ + uint32 _val; \ + asm volatile("ldr %0, [%1]" \ + : "=r" (_val) \ + : "r" (p) \ + ); \ + _val; \ +}) + +/** + * @brief Single-copy atomic double word write. + * + * LPAE defines double world-aligned double word writes to be single-copy + * atomic. See 6.7 ARM PRD03-GENC-008469 13.0. + * + * @param p double word aligned location to write to + * @param val double word-sized value to write to p + */ +#define ATOMIC_SINGLE_COPY_WRITE64(p,val) \ + do { \ + ASSERT(sizeof(val) == 8); \ + ASSERT((MVA)(p) % sizeof(val) == 0); \ + asm volatile("mov r0, %0 \n" \ + "mov r1, %1 \n" \ + "strd r0, r1, [%2]" \ + : \ + : "r" ((uint32)(val)), \ + "r" (((uint64)(val)) >> 32),\ + "r" (p) \ + : "r0", "r1", "memory"); \ + } while (0); + +#endif diff --git a/arch/arm/mvp/mvpkm/check_kconfig.c b/arch/arm/mvp/mvpkm/check_kconfig.c new file mode 100644 index 0000000..bab1b6e --- /dev/null +++ b/arch/arm/mvp/mvpkm/check_kconfig.c @@ -0,0 +1,91 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * @brief Check for required kernel configuration + * + * Check to make sure that the kernel options that the MVP hypervisor requires + * have been enabled in the kernel that this kernel module is being built + * against. + */ +#include <linux/version.h> + /* + * Minimum kernel version + * - network namespace support is only really functional starting in 2.6.29 + * - Android Gingerbread requires 2.6.35 + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) +#error "MVP requires a host kernel newer than 2.6.35" +#endif + +/* module loading ability */ +#ifndef CONFIG_MODULES +#error "MVP requires kernel loadable module support be enabled (CONFIG_MODULES)" +#endif +#ifndef CONFIG_MODULE_UNLOAD +#error "MVP requires kernel module unload support be enabled (CONFIG_MODULE_UNLOAD)" +#endif + +/* sysfs */ +#ifndef CONFIG_SYSFS +#error "MVP requires sysfs support (CONFIG_SYSFS)" +#endif + +/* network traffic isolation */ +#ifndef CONFIG_NAMESPACES +#error "MVP networking support requires namespace support (CONFIG_NAMESPACES)" +#endif +#ifndef CONFIG_NET_NS +#error "MVP networking support requires Network Namespace support to be enabled (CONFIG_NET_NS)" +#endif + +/* TCP/IP networking */ +#ifndef CONFIG_INET +#error "MVP networking requires IPv4 support (CONFIG_INET)" +#endif +#ifndef 
CONFIG_IPV6 +#error "MVP networking requires IPv6 support (CONFIG_IPV6)" +#endif + +/* VPN support */ +#if !defined(CONFIG_TUN) && !defined(CONFIG_TUN_MODULE) +#error "MVP VPN support requires TUN device support (CONFIG_TUN)" +#endif + +#if !defined(CONFIG_NETFILTER) && !defined(PVTCP_DISABLE_NETFILTER) +#error "MVP networking support requires netfilter support (CONFIG_NETFILTER)" +#endif + +/* Force /proc/config.gz support for eng/userdebug builds */ +#ifdef MVP_DEBUG +#if !defined(CONFIG_IKCONFIG) || !defined(CONFIG_IKCONFIG_PROC) +#error "MVP kernel /proc/config.gz support required for debuggability (CONFIG_IKCONFIG_PROC)" +#endif +#endif + +/* Sanity check we're only dealing with the memory hotplug + migrate and/or + * compaction combo */ +#ifdef CONFIG_MIGRATION +#if defined(CONFIG_NUMA) || defined(CONFIG_CPUSETS) || defined(CONFIG_MEMORY_FAILURE) +#error "MVP not tested with migration features other than CONFIG_MEMORY_HOTPLUG and CONFIG_COMPACTION" +#endif +#endif diff --git a/arch/arm/mvp/mvpkm/comm_os.h b/arch/arm/mvp/mvpkm/comm_os.h new file mode 100644 index 0000000..cf858f6 --- /dev/null +++ b/arch/arm/mvp/mvpkm/comm_os.h @@ -0,0 +1,150 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
/**
 * @file
 *
 * @brief Cross-platform base type definitions and function declarations.
 *        Includes OS-specific base type definitions and function
 *        declarations.
 */

#ifndef _COMM_OS_H_
#define _COMM_OS_H_

/* For-ever timeout constant (in milliseconds): largest positive long value. */
#define COMM_OS_4EVER_TO ((unsigned long long)(~0UL >> 1))

/*
 * Condition function prototype, used as a wait predicate.
 * Returns 1: true, 0: false, < 0: error code.
 */
typedef int (*CommOSWaitConditionFunc)(void *arg1, void *arg2);

/*
 * Dispatch function prototype. Called by input (dispatch) kernel threads;
 * returns the number of items processed in the pass.
 */
typedef unsigned int (*CommOSDispatchFunc)(void);

/* Module initialization and exit callback function pointers. */
extern int (*commOSModInit)(void *args);
extern void (*commOSModExit)(void);

/*
 * Macro to assign the Init and Exit callbacks. Expands to the definitions
 * of the two pointers above; use exactly once per module.
 */
#define COMM_OS_MOD_INIT(init, exit) \
   int (*commOSModInit)(void *args) = init; \
   void (*commOSModExit)(void) = exit


/*
 * OS-specific implementations must provide the following:
 * 1. Types:
 *    CommOSAtomic
 *    CommOSSpinlock
 *    CommOSMutex
 *    CommOSWaitQueue
 *    CommOSWork
 *    CommOSWorkFunc
 *    CommOSList
 *    CommOSModule
 *    struct kvec
 *
 * 2. Definition, initializers:
 *    CommOSSpinlock_Define()
 *
 * 3.
 *    Functions:
 *       void CommOS_Debug(const char *format, ...);
 *       void CommOS_Log(const char *format, ...);
 *       void CommOS_WriteAtomic(CommOSAtomic *atomic, int val);
 *       int CommOS_ReadAtomic(CommOSAtomic *atomic);
 *       int CommOS_AddReturnAtomic(CommOSAtomic *atomic, int val);
 *       int CommOS_SubReturnAtomic(CommOSAtomic *atomic, int val);
 *       void CommOS_SpinlockInit(CommOSSpinlock *lock);
 *       void CommOS_SpinLockBH(CommOSSpinlock *lock);
 *       int CommOS_SpinTrylockBH(CommOSSpinlock *lock);
 *       void CommOS_SpinUnlockBH(CommOSSpinlock *lock);
 *       void CommOS_SpinLock(CommOSSpinlock *lock);
 *       int CommOS_SpinTrylock(CommOSSpinlock *lock);
 *       void CommOS_SpinUnlock(CommOSSpinlock *lock);
 *       void CommOS_MutexInit(CommOSMutex *mutex);
 *       int CommOS_MutexLock(CommOSMutex *mutex);
 *       void CommOS_MutexLockUninterruptible(CommOSMutex *mutex);
 *       int CommOS_MutexTrylock(CommOSMutex *mutex);
 *       void CommOS_MutexUnlock(CommOSMutex *mutex);
 *       void CommOS_WaitQueueInit(CommOSWaitQueue *wq);
 *       int CommOS_DoWait(CommOSWaitQueue *wq,
 *                         CommOSWaitConditionFunc cond,
 *                         void *condArg1,
 *                         void *condArg2,
 *                         unsigned long long *timeoutMillis,
 *                         int interruptible);
 *       int CommOS_Wait(CommOSWaitQueue *wq,
 *                       CommOSWaitConditionFunc func,
 *                       void *funcArg1,
 *                       void *funcArg2,
 *                       unsigned long long *timeoutMillis);
 *       int CommOS_WaitUninterruptible(CommOSWaitQueue *wq,
 *                                      CommOSWaitConditionFunc func,
 *                                      void *funcArg1,
 *                                      void *funcArg2,
 *                                      unsigned long long *timeoutMillis);
 *       void CommOS_WakeUp(CommOSWaitQueue *wq);
 *       void *CommOS_KmallocNoSleep(unsigned int size);
 *       void *CommOS_Kmalloc(unsigned int size);
 *       void CommOS_Kfree(void *arg);
 *       void CommOS_Yield(void);
 *       unsigned long long CommOS_GetCurrentMillis(void);
 *       void CommOS_ListInit(CommOSList *list);
 *       int CommOS_ListEmpty(CommOSList *list);
 *       void CommOS_ListAdd(CommOSList *list, CommOSList *listElem);
 *       void CommOS_ListAddTail(CommOSList *list, CommOSList *listElem);
 *       void CommOS_ListDel(CommOSList *listElem);
 *    Macros:
 *       CommOS_ListForEach(*list, *item, itemListFieldName);
 *       CommOS_ListForEachSafe(*list, *item, *tmp, itemListFieldName);
 *       void CommOS_ListSplice(CommOSList *list, CommOSList *listToAdd);
 *       void CommOS_ListSpliceTail(CommOSList *list, CommOSList *listToAdd);
 *       CommOSModule CommOS_ModuleSelf(void);
 *       int CommOS_ModuleGet(CommOSModule module);
 *       void CommOS_ModulePut(CommOSModule module);
 *       void CommOS_MemBarrier(void);
 *
 * These cannot be defined here: a) non-pointer type definitions need size
 * information, and b) functions may or may not be inlined, or macros may
 * be used instead.
 */


#ifdef __linux__
#include "comm_os_linux.h"
#else
#error "Unsupported OS"
#endif

/* Functions to start and stop the dispatch and aio kernel threads. */
void CommOS_StopIO(void);
void CommOS_ScheduleDisp(void);
void CommOS_InitWork(CommOSWork *work, CommOSWorkFunc func);
int CommOS_ScheduleAIOWork(CommOSWork *work);
void CommOS_FlushAIOWork(CommOSWork *work);

int
CommOS_StartIO(const char *dispatchTaskName,
               CommOSDispatchFunc dispatchHandler,
               unsigned int interval,
               unsigned int maxCycles,
               const char *aioTaskName);


#endif /* _COMM_OS_H_ */
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Contains linux-specific type definitions and function declarations + */ + +#ifndef _COMM_OS_LINUX_H_ +#define _COMM_OS_LINUX_H_ + +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) +#error "Kernel versions lower than 2.6.20 are not supported" +#endif + +#include +#include +#include +#include +#include +#include + + +/* + * Type definitions. + */ + +typedef atomic_t CommOSAtomic; +typedef spinlock_t CommOSSpinlock; +typedef struct mutex CommOSMutex; +typedef wait_queue_head_t CommOSWaitQueue; +typedef struct delayed_work CommOSWork; +typedef void (*CommOSWorkFunc)(CommOSWork *work); +typedef struct list_head CommOSList; +typedef struct module *CommOSModule; + + +/* + * Initializers. + */ + +#define CommOSSpinlock_Define DEFINE_SPINLOCK + + +#define COMM_OS_DOLOG(...) printk(KERN_INFO __VA_ARGS__) + + +/** + * @brief Logs given arguments in debug builds. + */ + +#if defined(COMM_OS_DEBUG) + #define CommOS_Debug(args) COMM_OS_DOLOG args +#else + #define CommOS_Debug(args) +#endif + + +/** + * @brief Logs given arguments. + */ + +#define CommOS_Log(args) COMM_OS_DOLOG args + + +/** + * @brief Logs function name and location. 
/**
 * @brief Logs current task, function name and source location, plus one
 *        pointer argument. Compiled out unless COMM_OS_TRACE is defined.
 */

#if defined(COMM_OS_TRACE)
#define TRACE(ptr) \
   do { \
      CommOS_Debug(("%p:%s: at [%s:%d] with arg ptr [0x%p].\n", current, \
                    __FUNCTION__, __FILE__, __LINE__, (ptr))); \
   } while (0)
#else
#define TRACE(ptr)
#endif


/**
 * @brief Write atomic variable
 * @param[in,out] atomic variable to write
 * @param val new value
 */

static inline void
CommOS_WriteAtomic(CommOSAtomic *atomic,
                   int val)
{
   atomic_set(atomic, val);
}


/**
 * @brief Reads atomic variable
 * @param atomic variable to read
 * @return value
 */

static inline int
CommOS_ReadAtomic(CommOSAtomic *atomic)
{
   return atomic_read(atomic);
}


/**
 * @brief Atomically add value to atomic variable, return new value.
 * @param[in,out] atomic variable
 * @param val value to add
 * @return new value
 */

static inline int
CommOS_AddReturnAtomic(CommOSAtomic *atomic,
                       int val)
{
   return atomic_add_return(val, atomic);
}


/**
 * @brief Atomically subtract value from atomic variable, return new value.
 * @param[in,out] atomic variable
 * @param val value to subtract
 * @return new value
 */

static inline int
CommOS_SubReturnAtomic(CommOSAtomic *atomic,
                       int val)
{
   return atomic_sub_return(val, atomic);
}


/**
 * @brief Initializes a given lock.
 * @param[in,out] lock lock to initialize
 */

static inline void
CommOS_SpinlockInit(CommOSSpinlock *lock)
{
   spin_lock_init(lock);
}


/**
 * @brief Locks given lock and disables bottom half processing.
 * @param[in,out] lock lock to lock
 */

static inline void
CommOS_SpinLockBH(CommOSSpinlock *lock)
{
   spin_lock_bh(lock);
}


/**
 * @brief Attempts to lock the given lock and disable BH processing.
 * @param[in,out] lock lock to lock
 * @return zero if successful, non-zero otherwise
 *         (note: deliberately the inverse of spin_trylock_bh()'s
 *         1-on-success convention)
 */

static inline int
CommOS_SpinTrylockBH(CommOSSpinlock *lock)
{
   return !spin_trylock_bh(lock);
}
/**
 * @brief Unlocks given lock and re-enables BH processing.
 * @param[in,out] lock lock to unlock
 */

static inline void
CommOS_SpinUnlockBH(CommOSSpinlock *lock)
{
   spin_unlock_bh(lock);
}


/**
 * @brief Locks the given lock.
 * @param[in,out] lock lock to lock
 */

static inline void
CommOS_SpinLock(CommOSSpinlock *lock)
{
   spin_lock(lock);
}


/**
 * @brief Attempts to lock the given lock.
 * @param[in,out] lock lock to try-lock
 * @return zero if successful, non-zero otherwise
 *         (inverse of spin_trylock()'s convention)
 */

static inline int
CommOS_SpinTrylock(CommOSSpinlock *lock)
{
   return !spin_trylock(lock);
}


/**
 * @brief Unlocks given lock.
 * @param[in,out] lock lock to unlock
 */

static inline void
CommOS_SpinUnlock(CommOSSpinlock *lock)
{
   spin_unlock(lock);
}


/**
 * @brief Initializes given mutex.
 * @param[in,out] mutex mutex to initialize
 */

static inline void
CommOS_MutexInit(CommOSMutex *mutex)
{
   mutex_init(mutex);
}


/**
 * @brief Acquires mutex; may be interrupted by a signal.
 * @param[in,out] mutex mutex to lock
 * @return zero if acquired, non-zero (-EINTR) if interrupted
 */

static inline int
CommOS_MutexLock(CommOSMutex *mutex)
{
   return mutex_lock_interruptible(mutex);
}


/**
 * @brief Acquires mutex in uninterruptible mode; always succeeds.
 *        NOTE(review): comm_os.h's capability comment lists this as
 *        returning int — the implementation is void; verify callers.
 * @param[in,out] mutex mutex to lock
 */

static inline void
CommOS_MutexLockUninterruptible(CommOSMutex *mutex)
{
   mutex_lock(mutex);
}


/**
 * @brief Attempts to acquire given mutex without blocking.
 * @param[in,out] mutex mutex to try-lock
 * @return zero if successful, non-zero otherwise
 *         (inverse of mutex_trylock()'s convention)
 */

static inline int
CommOS_MutexTrylock(CommOSMutex *mutex)
{
   return !mutex_trylock(mutex);
}


/**
 * @brief Releases a given mutex.
 * @param[in,out] mutex mutex to unlock
 */

static inline void
CommOS_MutexUnlock(CommOSMutex *mutex)
{
   mutex_unlock(mutex);
}
/**
 * @brief Initializes a wait queue.
 * @param[in,out] wq wait queue to initialize
 */

static inline void
CommOS_WaitQueueInit(CommOSWaitQueue *wq)
{
   init_waitqueue_head(wq);
}


/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *        - the condition function (predicate) evaluates to TRUE
 *        - the specified timeout interval elapsed
 *        - a signal is pending (only if 'interruptible' is set)
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @param interruptible enable/disable signal pending check
 * @return 1 if condition was met
 *         0 if the timeout interval elapsed
 *         <0, if a signal is pending or other error set by condition
 * @sideeffect timeoutMillis is updated to time remaining
 */

static inline int
CommOS_DoWait(CommOSWaitQueue *wq,
              CommOSWaitConditionFunc cond,
              void *condArg1,
              void *condArg2,
              unsigned long long *timeoutMillis,
              int interruptible)
{
   int rc;
   DEFINE_WAIT(wait);
   long timeout;
#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   long tmpTimeout;
   long retTimeout;
   /* In workaround mode, re-poll the predicate at most every 'interval' ms. */
   const unsigned int interval = 50;
#endif

   /* A timeout pointer is mandatory; callers use COMM_OS_4EVER_TO to block. */
   if (!timeoutMillis) {
      return -1;
   }
   /* Fast path: predicate already true (or in error) -- no wait needed. */
   if ((rc = cond(condArg1, condArg2)) != 0) {
      return rc;
   }

#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   /* Sleep in short 'interval'-sized slices, re-testing the predicate. */
   timeout = msecs_to_jiffies(interval < *timeoutMillis ?
                              interval : (unsigned int)*timeoutMillis);
   retTimeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));

   /*
    * NOTE(review): retTimeout is clamped to 0 below, so this loop condition
    * (retTimeout >= 0) can never become false -- the wait only ends when the
    * predicate fires or a signal arrives, i.e. the timeout never expires in
    * workaround mode. Confirm whether this is intentional.
    */
   for (; retTimeout >= 0; ) {
      prepare_to_wait(wq, &wait,
                      (interruptible?TASK_INTERRUPTIBLE:TASK_UNINTERRUPTIBLE));
      if ((rc = cond(condArg1, condArg2))) {
         break;
      }
      if (interruptible && signal_pending(current)) {
         rc = -EINTR;
         break;
      }
      /* Deduct only the portion of 'timeout' actually slept. */
      if ((tmpTimeout = schedule_timeout(timeout))) {
         retTimeout -= (timeout - tmpTimeout);
      } else {
         retTimeout -= timeout;
      }
      if (retTimeout < 0) {
         retTimeout = 0;
      }
   }
   finish_wait(wq, &wait);
   if (rc == 0) {
      /* Re-test once after leaving the queue; avoid reporting a timeout
         (remaining == 0) when the predicate did in fact become true. */
      rc = cond(condArg1, condArg2);
      if (rc && (retTimeout == 0)) {
         retTimeout = 1;
      }
   }
   *timeoutMillis = (unsigned long long)jiffies_to_msecs(retTimeout);
#else // !defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   timeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));

   for (;;) {
      prepare_to_wait(wq, &wait,
                      (interruptible?TASK_INTERRUPTIBLE:TASK_UNINTERRUPTIBLE));
      if ((rc = cond(condArg1, condArg2)) != 0) {
         break;
      }
      if (interruptible && signal_pending(current)) {
         rc = -EINTR;
         break;
      }
      /* schedule_timeout() returns remaining jiffies; 0 means timed out. */
      if ((timeout = schedule_timeout(timeout)) == 0) {
         rc = 0;
         break;
      }
   }
   finish_wait(wq, &wait);
   if (rc == 0) {
      /* Re-test once; don't report remaining == 0 if the predicate is true. */
      rc = cond(condArg1, condArg2);
      if (rc && (timeout == 0)) {
         timeout = 1;
      }
   }
   *timeoutMillis = (unsigned long long)jiffies_to_msecs(timeout);
#endif

   return rc;
}
/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *        - the condition function (predicate) evaluates to TRUE
 *        - the specified timeout interval elapsed
 *        - a signal is pending
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @return 1 if condition was met
 *         0 if the timeout interval elapsed
 *         <0, if a signal is pending or other error set by condition
 * @sideeffect timeoutMillis is updated to time remaining
 */

static inline int
CommOS_Wait(CommOSWaitQueue *wq,
            CommOSWaitConditionFunc cond,
            void *condArg1,
            void *condArg2,
            unsigned long long *timeoutMillis)
{
   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 1);
}


/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *        - the condition function (predicate) evaluates to TRUE
 *        - the specified timeout interval elapsed
 *        Unlike CommOS_Wait(), pending signals are ignored.
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @return 1 if condition was met
 *         0 if the timeout interval elapsed
 *         <0, error set by condition
 * @sideeffect timeoutMillis is updated to time remaining
 */

static inline int
CommOS_WaitUninterruptible(CommOSWaitQueue *wq,
                           CommOSWaitConditionFunc cond,
                           void *condArg1,
                           void *condArg2,
                           unsigned long long *timeoutMillis)
{
   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 0);
}


/**
 * @brief Wakes up task(s) waiting on the given wait queue.
 * @param[in,out] wq wait queue.
 */

static inline void
CommOS_WakeUp(CommOSWaitQueue *wq)
{
   wake_up(wq);
}


/**
 * @brief Allocates kernel memory of specified size; does not sleep
 *        (GFP_ATOMIC), so it is safe in atomic/BH context.
 * @param size size to allocate.
 * @return Address of allocated memory or NULL if the allocation fails.
 */

static inline void *
CommOS_KmallocNoSleep(unsigned int size)
{
   return kmalloc(size, GFP_ATOMIC);
}


/**
 * @brief Allocates kernel memory of specified size; may sleep (GFP_KERNEL),
 *        so it must not be called from atomic context.
 * @param size size to allocate.
 * @return Address of allocated memory or NULL if the allocation fails.
 */

static inline void *
CommOS_Kmalloc(unsigned int size)
{
   return kmalloc(size, GFP_KERNEL);
}
+ */ + +static inline void +CommOS_Kfree(void *obj) +{ + if (obj) { + kfree(obj); + } +} + + +/** + * @brief Yields the current cpu to other runnable tasks. + */ + +static inline void +CommOS_Yield(void) +{ + cond_resched(); +} + + +/** + * @brief Gets the current time in milliseconds. + * @return Current time in milliseconds, with precision of at most one tick. + */ + +static inline unsigned long long +CommOS_GetCurrentMillis(void) +{ + return (unsigned long long)jiffies_to_msecs(jiffies); +} + + +/** + * @brief Initializes given list. + * @param list list to initialize. + */ + +static inline void +CommOS_ListInit(CommOSList *list) +{ + INIT_LIST_HEAD(list); +} + + +/** + * @brief Tests if list is empty. + * @param list list to test. + * @return non-zero if empty, zero otherwise. + */ + +#define CommOS_ListEmpty(list) list_empty((list)) + + +/** + * @brief Adds given element to beginning of list. + * @param list list to add to. + * @param elem element to add. + */ + +#define CommOS_ListAdd(list, elem) list_add((elem), (list)) + + +/** + * @brief Adds given element to end of list. + * @param list list to add to. + * @param elem element to add. + */ + +#define CommOS_ListAddTail(list, elem) list_add_tail((elem), (list)) + + +/** + * @brief Deletes given element from its list. + * @param elem element to delete. + */ + +#define CommOS_ListDel(elem) \ + do { \ + list_del((elem)); \ + INIT_LIST_HEAD((elem)); \ + } while (0) + + +/** + * @brief Iterates over a list. + * @param list list to iterate over. + * @param[out] item stores next element. + * @param itemListFieldName name in the item structure storing the list head. + */ + +#define CommOS_ListForEach(list, item, itemListFieldName) \ + list_for_each_entry((item), (list), itemListFieldName) + + +/** + * @brief Iterates safely over a list. + * @param list list to iterate over. + * @param[out] item stores next element. May be deleted in the loop. + * @param[out] tmpItem saves iteration element. 
/**
 * @brief Iterates safely over a list.
 * @param list list to iterate over.
 * @param[out] item stores next element. May be deleted in the loop.
 * @param[out] tmpItem saves iteration element.
 * @param itemListFieldName name in the item structure storing the list head.
 */

#define CommOS_ListForEachSafe(list, item, tmpItem, itemListFieldName) \
   list_for_each_entry_safe((item), (tmpItem), (list), itemListFieldName)


/**
 * @brief Combines two lists, adds second list to beginning of first one.
 * @param list list to add to.
 * @param list2 list to add.
 */

#define CommOS_ListSplice(list, list2) list_splice((list2), (list))


/**
 * @brief Combines two lists, adds second list to end of first one.
 * @param list list to add to.
 * @param list2 list to add.
 */

#define CommOS_ListSpliceTail(list, list2) list_splice_tail((list2), (list))


/**
 * @brief Gets current module handle.
 * @return module handle.
 */

static inline CommOSModule
CommOS_ModuleSelf(void)
{
   return THIS_MODULE;
}


/**
 * @brief Retains module, i.e. increments its reference count.
 *        A NULL module is silently accepted and counts as success.
 * @param[in,out] module to retain.
 * @return zero if successful, non-zero otherwise.
 */

static inline int
CommOS_ModuleGet(CommOSModule module)
{
   int rc = 0;

   if (!module) {
      goto out;
   }
   if (!try_module_get(module)) {
      /* Module is unloading; reference not taken. */
      rc = -1;
   }

out:
   return rc;
}


/**
 * @brief Releases module reference taken by CommOS_ModuleGet().
 *        A NULL module is a no-op.
 * @param[in,out] module to release.
 */

static inline void
CommOS_ModulePut(CommOSModule module)
{
   if (module) {
      module_put(module);
   }
}


/**
 * @brief Inserts r/w memory barrier (SMP-safe full barrier).
 */

#define CommOS_MemBarrier smp_mb

#endif /* _COMM_OS_LINUX_H_ */
/**
 * @file
 *
 * @brief Generic shared memory transport API.
 */

#ifndef _COMM_TRANSP_H_
#define _COMM_TRANSP_H_

#define INCLUDE_ALLOW_PV
#define INCLUDE_ALLOW_MODULE
#define INCLUDE_ALLOW_MONITOR
#define INCLUDE_ALLOW_GPL
#include "include_check.h"

/*
 * Common shared memory identifier.
 * External handle that makes sense to both hypervisor and guest.
 * The *_ANY constants are wildcards ("any id") at each access width.
 */

#define COMM_TRANSP_ID_8_ANY ((unsigned char)-1)
#define COMM_TRANSP_ID_32_ANY ((unsigned int)-1)
#define COMM_TRANSP_ID_64_ANY ((unsigned long long)-1)


/* 64-bit ID, accessible as bytes, 32-bit words or one 64-bit word. */
typedef struct CommTranspID {
   union {
      unsigned char d8[8];
      unsigned int d32[2];
      unsigned long long d64;
   };
} CommTranspID;


/* Basic initialization arguments. */

typedef enum CommTranspInitMode {
   COMM_TRANSP_INIT_CREATE = 0x0,   // Create a new shared memory area.
   COMM_TRANSP_INIT_ATTACH = 0x1    // Attach to an existing one.
} CommTranspInitMode;

typedef struct CommTranspInitArgs {
   unsigned int capacity;           // Shared memory capacity.
   unsigned int type;               // Type / implementation using this area
                                    //    (see CommTransp_GetType()).
   CommTranspID id;                 // ID (name) of shared memory area.
   CommTranspInitMode mode;         // Init mode (above).
} CommTranspInitArgs;
/**
 * @brief Generate a type id from a description (protocol) string, using
 *        djb2, the string hashing algorithm by Dan Bernstein
 *        (see http://www.cse.yorku.ca/~oz/hash.html).
 * @param str NUL-terminated string to hash
 * @return 32-bit hash value
 */

static inline unsigned int
CommTransp_GetType(const char *str)
{
   unsigned int typeHash = 5381;   /* djb2 seed */

   for (; *str != '\0'; str++) {
      typeHash = typeHash * 33 + *str;   /* same as (h << 5) + h + c */
   }
   return typeHash;
}
/* I/O event direction flags; IN and OUT may be OR-ed together (INOUT). */

typedef enum CommTranspIOEvent {
   COMM_TRANSP_IO_DETACH = 0x0,   // Peer detached from the channel.
   COMM_TRANSP_IO_IN = 0x1,       // Data available for reading.
   COMM_TRANSP_IO_OUT = 0x2,      // Space available for writing.
   COMM_TRANSP_IO_INOUT = 0x3     // Both of the above.
} CommTranspIOEvent;

/* Asynchronous I/O event callback plus its opaque cookie. */
typedef struct CommTranspEvent {
   void (*ioEvent)(CommTransp transp, CommTranspIOEvent event, void *data);
   void *ioEventData;
} CommTranspEvent;


/*
 * Mechanism to detect and optionally attach to, created shared memory regions.
 * 'probe' is called with the new region's init args and 'probeData'.
 */

typedef struct CommTranspListener {
   int (*probe)(CommTranspInitArgs *transpArgs, void *probeData);
   void *probeData;
} CommTranspListener;



/*
 * Function prototypes.
 */

/* Event subsystem lifecycle and signaling. */
int CommTranspEvent_Init(void);
void CommTranspEvent_Exit(void);
int CommTranspEvent_Process(CommTranspID *transpID, CommTranspIOEvent event);
int
CommTranspEvent_Raise(unsigned int peerEvID,
                      CommTranspID *transpID,
                      CommTranspIOEvent event);

/* Transport subsystem lifecycle. */
int CommTransp_Init(void);
void CommTransp_Exit(void);

/* Listener registration and attach notification. */
int CommTransp_Register(const CommTranspListener *listener);
void CommTransp_Unregister(const CommTranspListener *listener);
int
CommTransp_Notify(const CommTranspID *notificationCenterID,
                  CommTranspInitArgs *transpArgs);

/* Channel open/close. */
int
CommTransp_Open(CommTransp *transp,
                CommTranspInitArgs *transpArgs,
                CommTranspEvent *transpEvent);
void CommTransp_Close(CommTransp transp);

/* Segmented enqueue: reset, append segments, then commit atomically. */
int CommTransp_EnqueueSpace(CommTransp transp);
int CommTransp_EnqueueReset(CommTransp transp);
int CommTransp_EnqueueCommit(CommTransp transp);
int
CommTransp_EnqueueSegment(CommTransp transp,
                          const void *buf,
                          unsigned int bufLen);

/* Segmented dequeue: mirror image of the enqueue triplet above. */
int CommTransp_DequeueSpace(CommTransp transp);
int CommTransp_DequeueReset(CommTransp transp);
int CommTransp_DequeueCommit(CommTransp transp);
int
CommTransp_DequeueSegment(CommTransp transp,
                          void *buf,
                          unsigned int bufLen);

/* Enable/disable delivery of events inline with queue operations. */
unsigned int CommTransp_RequestInlineEvents(CommTransp transp);
unsigned int CommTransp_ReleaseInlineEvents(CommTransp transp);
/**
 * @brief Enqueues data into the transport object, data is available for
 *        reading immediately (single segment, committed atomically).
 * @param transp handle to the transport object.
 * @param buf bytes to enqueue.
 * @param bufLen number of bytes to enqueue.
 * @return number of bytes enqueued on success, < 0 otherwise.
 */

static inline int
CommTransp_EnqueueAtomic(CommTransp transp,
                         const void *buf,
                         unsigned int bufLen)
{
   int rc;

   /* NOTE(review): EnqueueReset's return value is ignored, and the commit
      is attempted even if EnqueueSegment failed (rc < 0); a commit failure
      then overwrites rc with -1. Presumably intentional -- verify against
      the CommTransp_Enqueue* contract. */
   CommTransp_EnqueueReset(transp);
   rc = CommTransp_EnqueueSegment(transp, buf, bufLen);
   if (CommTransp_EnqueueCommit(transp)) {
      rc = -1;
   }
   return rc;
}


/**
 * @brief Dequeues data from the transport object into a buffer
 *        (single segment, committed atomically).
 * @param transp handle to the transport object.
 * @param[out] buf buffer to copy to.
 * @param bufLen number of bytes to dequeue.
 * @return number of bytes dequeued on success, < 0 otherwise.
 */

static inline int
CommTransp_DequeueAtomic(CommTransp transp,
                         void *buf,
                         unsigned int bufLen)
{
   int rc;

   /* Same commit-even-on-failure pattern as CommTransp_EnqueueAtomic. */
   CommTransp_DequeueReset(transp);
   rc = CommTransp_DequeueSegment(transp, buf, bufLen);
   if (CommTransp_DequeueCommit(transp)) {
      rc = -1;
   }
   return rc;
}

#endif // _COMM_TRANSP_IMPL_H_
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Constant definitions for ARM CP15 coprocessor registers. + * + * Derived from tweety hypervisor/src/armv6/trango_macros.inc file + */ + +#ifndef _COPROC_DEFS_H_ +#define _COPROC_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @name CP10 registers. + * + * MCR/MRC format: @code #define , , , @endcode + * @{ + */ +#define VFP_FPSID 7, c0, c0, 0 +#define VFP_MVFR0 7, c7, c0, 0 +#define VFP_MVFR1 7, c6, c0, 0 +#define VFP_FPEXC 7, c8, c0, 0 +#define VFP_FPSCR 7, c1, c0, 0 +#define VFP_FPINST 7, c9, c0, 0 +#define VFP_FPINST2 7, c10, c0, 0 +/*@}*/ + + +/** + * @name CP15 registers. 
+ * + * MCR/MRC format: @code #define , , , @endcode + * MCRR format: @code #define , @endcode + * @{ + */ +#define ID_CODE 0, c0, c0, 0 +#define CACHE_TYPE 0, c0, c0, 1 +#define MPIDR 0, c0, c0, 5 +#define CACHE_SIZE_ID 1, c0, c0, 0 +#define CACHE_LEVEL_ID 1, c0, c0, 1 +#define CACHE_SIZE_SELECTION 2, c0, c0, 0 +#define MEM_MODEL_FEATURE_0 0, c0, c1, 4 +#define CONTROL_REGISTER 0, c1, c0, 0 +#define TTBASE0_POINTER 0, c2, c0, 0 +#define TTBASE1_POINTER 0, c2, c0, 1 +#define TTCONTROL 0, c2, c0, 2 +#define DOMAIN_CONTROL 0, c3, c0, 0 +#define DATA_FAULT_STATUS 0, c5, c0, 0 +#define INST_FAULT_STATUS 0, c5, c0, 1 +#define AUX_DATA_FAULT_STATUS 0, c5, c1, 0 +#define AUX_INST_FAULT_STATUS 0, c5, c1, 1 +#define DATA_FAULT_ADDRESS 0, c6, c0, 0 +#define INST_FAULT_ADDRESS 0, c6, c0, 2 +#define WAIT_FOR_INTERRUPT 0, c7, c0, 4 +#define PHYSICAL_ADDRESS 0, c7, c4, 0 +#define ICACHE_INVALIDATE_POU 0, c7, c5, 0 +#define ICACHE_INVALIDATE_MVA_POU 0, c7, c5, 1 +#define ICACHE_INVALIDATE_INDEX 0, c7, c5, 2 +#define BTAC_INVALIDATE 0, c7, c5, 6 +#define BTAC_INVALIDATE_MVA 0, c7, c5, 7 +#define DCACHE_INVALIDATE 0, c7, c6, 0 +#define DCACHE_INVALIDATE_MVA_POC 0, c7, c6, 1 +#define DCACHE_INVALIDATE_INDEX 0, c7, c6, 2 +#define UCACHE_INVALIDATE 0, c7, c7, 0 +#define V2P_CURRENT_PRIV_READ 0, c7, c8, 0 +#define V2P_CURRENT_PRIV_WRITE 0, c7, c8, 1 +#define V2P_CURRENT_USER_READ 0, c7, c8, 2 +#define V2P_CURRENT_USER_WRITE 0, c7, c8, 3 +#define V2P_OTHER_PRIV_READ 0, c7, c8, 4 +#define V2P_OTHER_PRIV_WRITE 0, c7, c8, 5 +#define V2P_OTHER_USER_READ 0, c7, c8, 6 +#define V2P_OTHER_USER_WRITE 0, c7, c8, 7 +#define DCACHE_CLEAN 0, c7, c10, 0 +#define DCACHE_CLEAN_MVA_POC 0, c7, c10, 1 +#define DCACHE_CLEAN_INDEX 0, c7, c10, 2 +#define DCACHE_CLEAN_MVA_POU 0, c7, c11, 1 +#define DCACHE_CLEAN_INVALIDATE 0, c7, c14, 0 +#define DCACHE_CLEAN_INVALIDATE_MVA_POC 0, c7, c14, 1 +#define DCACHE_CLEAN_INVALIDATE_INDEX 0, c7, c14, 2 +#define ITLB_INVALIDATE_ALL 0, c8, c5, 0 +#define 
ITLB_INVALIDATE_SINGLE 0, c8, c5, 1 +#define ITLB_INVALIDATE_ASID 0, c8, c5, 2 +#define DTLB_INVALIDATE_ALL 0, c8, c6, 0 +#define DTLB_INVALIDATE_SINGLE 0, c8, c6, 1 +#define DTLB_INVALIDATE_ASID 0, c8, c6, 2 +#define UTLB_INVALIDATE_ALL 0, c8, c7, 0 +#define UTLB_INVALIDATE_SINGLE 0, c8, c7, 1 +#define UTLB_INVALIDATE_ASID 0, c8, c7, 2 +#define TLB_LOCKDOWN 0, c10, c0, 0 +#define PRIMARY_REGION_REMAP 0, c10, c2, 0 +#define MAIR0 PRIMARY_REGION_REMAP +#define NORMAL_MEMORY_REMAP 0, c10, c2, 1 +#define MAIR1 NORMAL_MEMORY_REMAP +#define VECTOR_BASE 0, c12, c0, 0 +#define INTERRUPT_STATUS 0, c12, c1, 0 +#define CONTEXT_ID 0, c13, c0, 1 +#define TID_USER_RW 0, c13, c0, 2 +#define TID_USER_RO 0, c13, c0, 3 +#define TID_PRIV_RW 0, c13, c0, 4 +#define CLEAR_FAULT_IN_EFSR 7, c15, c0, 1 +#define VBAR 0, c12, c0, 0 + +/* + * ARMv7 performance counters' registers (MVP related) + * - ARM Architecture Reference Manual v7-A and v7-R: DDI0406B + * - Cortex-A8 TRM, rev.r1p1: DDI0344B + */ +#define PERF_MON_CONTROL_REGISTER 0, c9, c12, 0 +#define CYCLE_COUNT 0, c9, c13, 0 +#define PERF_MON_COUNT_SET 0, c9, c12, 1 +#define PERF_MON_COUNT_CLR 0, c9, c12, 2 +#define PERF_MON_FLAG_RDCLR 0, c9, c12, 3 +#define PERF_MON_EVENT_SELECT 0, c9, c12, 5 +#define PERF_MON_EVENT_TYPE 0, c9, c13, 1 +#define PERF_MON_EVENT_COUNT 0, c9, c13, 2 +#define PERF_MON_INTEN_SET 0, c9, c14, 1 +#define PERF_MON_INTEN_CLR 0, c9, c14, 2 + +#define COPROC_ACCESS_CONTROL 0, c1, c0, 2 +#define NON_SECURE_ACCESS_CONTROL 0, c1, c1, 2 + +#define HYP_CFG 4, c1, c1, 0 +#define HYP_DEBUG_CONTROL 4, c1, c1, 1 +#define HYP_COPROC_TRAP 4, c1, c1, 2 +#define HYP_SYS_TRAP 4, c1, c1, 3 +#define VIRT_TCR 4, c2, c1, 2 +#define HYP_SYNDROME 4, c5, c2, 0 +#define HYP_DATA_FAULT_ADDRESS 4, c6, c0, 0 +#define HYP_INST_FAULT_ADDRESS 4, c6, c0, 2 +#define HYP_IPA_FAULT_ADDRESS 4, c6, c0, 4 +#define UTLB_INVALIDATE_ALL_HYP 4, c8, c7, 0 +#define UTLB_INVALIDATE_SINGLE_HYP 4, c8, c7, 1 +#define UTLB_INVALIDATE_ALL_NS_NON_HYP 4, c8, 
c7, 4 + +#define EXT_TTBR0 0, c2 +#define EXT_TTBR1 1, c2 +#define HYP_TTBR 4, c2 +#define VIRT_TTBR 6, c2 +#define EXT_PHYSICAL_ADDRESS 0, c7 +/*@}*/ + +/** + * @name CP15 configuration control register bits. + * @{ + */ +#define ARM_CP15_CNTL_M (1 << 0) +#define ARM_CP15_CNTL_A (1 << 1) +#define ARM_CP15_CNTL_C (1 << 2) +#define ARM_CP15_CNTL_B (1 << 7) +#define ARM_CP15_CNTL_Z (1 << 11) +#define ARM_CP15_CNTL_I (1 << 12) +#define ARM_CP15_CNTL_V (1 << 13) +#define ARM_CP15_CNTL_U (1 << 22) +#define ARM_CP15_CNTL_VE (1 << 24) +#define ARM_CP15_CNTL_EE (1 << 25) +#define ARM_CP15_CNTL_TRE (1 << 28) +#define ARM_CP15_CNTL_AFE (1 << 29) +#define ARM_CP15_CNTL_TE (1 << 30) + +/*@}*/ + +/** + * @brief Initial System Control Register (SCTLR) value. + * + * Magic described on B3-97 ARM DDI 0406B, it's the power-on + * value, e.g. caches/MMU/alignment checking/TEX remap etc. disabled. + */ +#define ARM_CP15_CNTL_INIT 0x00c50078 + +/** + * @name System control coprocessor primary registers. + * Each primary register is backed by potentially multiple + * physical registers in the vCPU CP15 register file. 
+ * @{ + */ +#define ARM_CP15_CRN_ID 0 ///< Processor ID, cache, TCM and TLB type +#define ARM_CP15_CRN_CNTL 1 ///< System configuration bits +#define ARM_CP15_CRN_PT 2 ///< Page table control +#define ARM_CP15_CRN_DACR 3 ///< Domain access control +#define ARM_CP15_CRN_F_STATUS 5 ///< Fault status +#define ARM_CP15_CRN_F_ADDR 6 ///< Fault address +#define ARM_CP15_CRN_CACHE 7 ///< Cache/write buffer control +#define ARM_CP15_CRN_TLB 8 ///< TLB control +#define ARM_CP15_CRN_REMAP 10 ///< Memory Remap registers +#define ARM_CP15_CRN_SER 12 ///< Security Extension registers +#define ARM_CP15_CRN_PID 13 ///< Process ID +#define ARM_CP15_CRN_TIMER 14 ///< Architecture timers + +#define ARM_CP15_CRM_INVALIDATE_D_CACHE_RANGE 6 +#define ARM_CP15_CRM_CLEAN_AND_INVALIDATE_D_CACHE_RANGE 14 +/*@}*/ + +/** + * @name ARMv7 performance counter control/status register bits (MVP related) + * INTEN: counters overflow interrupt enable + * CNTEN: counters enable + * @{ + */ +#define ARMV7_PMNC_E (1 << 0) +#define ARMV7_PMNC_INTEN_P0 (1 << 0) +#define ARMV7_PMNC_INTEN_P1 (1 << 1) +#define ARMV7_PMNC_INTEN_P2 (1 << 2) +#define ARMV7_PMNC_INTEN_P3 (1 << 3) +#define ARMV7_PMNC_INTEN_C (1 << 31) +#define ARMV7_PMNC_INTEN_MASK 0x8000000f +#define ARMV7_PMNC_CNTEN_P0 (1 << 0) +#define ARMV7_PMNC_CNTEN_P1 (1 << 1) +#define ARMV7_PMNC_CNTEN_P2 (1 << 2) +#define ARMV7_PMNC_CNTEN_P3 (1 << 3) +#define ARMV7_PMNC_CNTEN_C (1 << 31) +#define ARMV7_PMNC_FLAG_P0 (1 << 0) +#define ARMV7_PMNC_FLAG_P1 (1 << 1) +#define ARMV7_PMNC_FLAG_P2 (1 << 2) +#define ARMV7_PMNC_FLAG_P3 (1 << 3) +#define ARMV7_PMNC_FLAG_C (1 << 31) +/*@}*/ + +/** + * @name TTBR masks. + * See B4.9.2 ARM DDI 0100I and B3.12.24 ARM DDI 0406A. + * @{ + */ +#define ARM_CP15_TTBASE_MASK MVP_MASK(14, 18) +#define ARM_CP15_TTBASE_SPLIT_MASK(ttbcrn) MVP_MASK(14-ttbcrn, 18+ttbcrn) +#define ARM_CP15_TTATTRIB_MASK MVP_MASK(0, 6) +/*@}*/ + +/** + * @name ARM fault status register encoding/decoding. + * See B4.6 and B4.9.6 in ARM DDI 0100I. 
+ * @{ + */ +#define ARM_CP15_FSR_STATUS_POS 0 +#define ARM_CP15_FSR_STATUS_POS2 10 +#define ARM_CP15_FSR_DOMAIN_POS 4 +#define ARM_CP15_FSR_WR_POS 11 + +#define ARM_CP15_FSR_STATUS_LEN 4 +#define ARM_CP15_FSR_DOMAIN_LEN 4 + +#define ARM_CP15_FSR_STATUS_DEBUG_EVENT 0x2 +#define ARM_CP15_FSR_STATUS_ALIGNMENT 0x1 +#define ARM_CP15_FSR_STATUS_ICACHE_MAINT 0x4 +#define ARM_CP15_FSR_STATUS_TRANSLATION_SECT 0x5 +#define ARM_CP15_FSR_STATUS_TRANSLATION_PAGE 0x7 +#define ARM_CP15_FSR_STATUS_DOMAIN_SECT 0x9 +#define ARM_CP15_FSR_STATUS_DOMAIN_PAGE 0xb +#define ARM_CP15_FSR_STATUS_PERMISSION_SECT 0xd +#define ARM_CP15_FSR_STATUS_PERMISSION_PAGE 0xf +#define ARM_CP15_FSR_STATUS_ACCESS_FLAG_SECT 0x3 +#define ARM_CP15_FSR_STATUS_ACCESS_FLAG_PAGE 0x6 +#define ARM_CP15_FSR_STATUS_SYNC_EXT_ABORT 0x8 +#define ARM_CP15_FSR_STATUS_ASYNC_EXT_ABORT 0x16 +/*@}*/ + +/** + * @brief Generate ARM fault status register value. + * + * @param fs status from Table B4-1. Only implemented for fs <= 0xf. + * @param domain domain accessed when abort occurred. + * @param write write access caused abort. + */ +#define ARM_CP15_FSR(fs,domain,write) \ + (((fs) << ARM_CP15_FSR_STATUS_POS) | \ + ((domain) << ARM_CP15_FSR_DOMAIN_POS) | \ + ((write) ? (1 << ARM_CP15_FSR_WR_POS) : 0)) + +#define ARM_CP15_FSR_STATUS(r) \ + (MVP_EXTRACT_FIELD((r), ARM_CP15_FSR_STATUS_POS, ARM_CP15_FSR_STATUS_LEN) | \ + (MVP_BIT((r), ARM_CP15_FSR_STATUS_POS2) << ARM_CP15_FSR_STATUS_LEN)) +#define ARM_CP15_FSR_DOMAIN(r) \ + MVP_EXTRACT_FIELD((r), ARM_CP15_FSR_DOMAIN_POS, ARM_CP15_FSR_DOMAIN_LEN) +#define ARM_CP15_FSR_WR(r) \ + MVP_BIT((r), ARM_CP15_FSR_WR_POS) +/*@}*/ + +/* + * This should mask out the major and minor revision numbers. 
+ * As per http://infocenter.arm.com/help/topic/com.arm.doc.ddi0211k/I65012.html + */ +#define ARM_CP15_MAIN_ID_NOREVISION_MASK 0xFF0FFFF0 + +// 2-8 ARM DDI 0151C +#define ARM_CP15_MAIN_ID_920_T 0x41129200 +// 3-18 ARM DDI 0211H +#define ARM_CP15_MAIN_ID_1136J_S 0x4107B362 + +/* Coprocessor Access Control Register */ +#define CPACR_ASEDIS (1 << 31) +#define CPACR_D32DIS (1 << 30) +#define CPACR_CP10_MASK (0x3 << (10*2)) +#define CPACR_CP10_CP11_MASK ( (0x3 << (10*2)) | (0x3 << (11*2)) ) +#define CPACR_CP10_CP11_PRIV_ONLY ( (0x1 << (10*2)) | (0x1 << (11*2)) ) + /* 2-bit access permission per Co-Proc */ + +/** + * @name ARM VFP/Adv. SIMD Extension System Registers + * @{ + */ +#define ARM_VFP_SYSTEM_REG_FPSID 0x0 +#define ARM_VFP_SYSTEM_REG_FPSCR 0x1 +#define ARM_VFP_SYSTEM_REG_MVFR1 0x6 +#define ARM_VFP_SYSTEM_REG_MVFR0 0x7 +#define ARM_VFP_SYSTEM_REG_FPEXC 0x8 +#define ARM_VFP_SYSTEM_REG_FPINST 0x9 +#define ARM_VFP_SYSTEM_REG_FPINST2 0xa + +#define ARM_VFP_SYSTEM_REG_FPEXC_EX (1 << 31) +#define ARM_VFP_SYSTEM_REG_FPEXC_EN (1 << 30) +#define ARM_VFP_SYSTEM_REG_FPEXC_FP2V (1 << 28) + +#define ARM_VFP_SYSTEM_REG_MVFR0_A_SIMD_BIT (0) +#define ARM_VFP_SYSTEM_REG_MVFR0_A_SIMD_MASK (0xf << ARM_VFP_SYSTEM_REG_MVFR0_A_SIMD_BIT) +/*@}*/ + +/** + * @name ARM Multi Processor ID Register (MPIDR) decoding + * @{ + */ +#define ARM_CP15_MPIDR_MP (0x1 << 31) +#define ARM_CP15_MPIDR_U (0x1 << 30) +/*@}*/ + +#endif /// ifndef _COPROC_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/cpufreq_kernel.c b/arch/arm/mvp/mvpkm/cpufreq_kernel.c new file mode 100644 index 0000000..4ba71f2 --- /dev/null +++ b/arch/arm/mvp/mvpkm/cpufreq_kernel.c @@ -0,0 +1,308 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MVP host kernel cpufreq related + * + * Track CPU frequency changes. + */ + +#include +#include +#include +#include +#include +#include + +#include "mvp.h" +#include "cpufreq_kernel.h" +#include "mvp_timer.h" + +DEFINE_PER_CPU(struct TscToRate64Cb, tscToRate64); + + +/** + * @brief Return current CPU frequency + * @param cpu CPU number + * @return CPU frequency in Hz + * + * When CPU_FREQ is not available, it uses hardcoded frequencies. + */ +static uint32 +GetCpuFrequency(unsigned int cpu) +{ + unsigned int counterKHZ; + +#ifdef CONFIG_CPU_FREQ + counterKHZ = cpufreq_quick_get(cpu); + if (counterKHZ == 0) { + counterKHZ = cpufreq_get(cpu); + FATAL_IF(counterKHZ == 0); + } +#elif defined(MVP_HOST_BOARD_ve) + /** + * @knownjira{MVP-143} + * We're only using this under the simulator, and it's almost non perceptible to + * provide a fixed TSC frequency as the instructions / second executed widely + * varies depending over time. While we resolve this issue we can use the + * BogoMIPS reported at boot for now. + */ + KNOWN_BUG(MVP-143); + counterKHZ = 125e3; + printk(KERN_INFO "mvpkm: CPU_FREQ not available, forcing TSC to %d KHz\n", counterKHZ); +#elif defined(MVP_HOST_BOARD_panda) + counterKHZ = 1e6; +#else + /* + * If the kernel can't tell us and we have no further host knowledge, + * time to die. + */ +#error "host TSC frequency unknown." 
+#endif + + return counterKHZ * 1000; +} + +/** + * @brief Compute TSC to RATE64 ratio + * @param cpuFreq TSC frequency in Hz + * @param[out] ttr tscToRate64 pointer + */ +static void +TscToRate64(uint32 cpuFreq, struct TscToRate64Cb *ttr) +{ + uint32 shift; + uint64 mult; + + /* + * A little bit of math ! + * + * We need here to convert the TSC value to our RATE64 timebase. + * + * In other words: + * + * tsc * MVP_TIMER_RATE64 + * rate64 = ---------------------- + * cpuFreq + * + * But we are limited by CPU performance (does not divide easily), CPU + * instruction set, and CPU register file width. To fit performance + * requirement, the math becomes: + * + * rate64 = (cpuFreq * mult) >> shift + * + * To respect instruction set, both cpuFreq and mult must be 32-bit + * numbers. Thus (cpuFreq * mult) will be a 64-bit number. + * + * + * Log2 rate64 = Log2 cpuFreq + Log2 mult - shift + * + * shift = Log2 mult + Log2 cpuFreq - Log2 rate64 + * + * && Log2 mult < 32 + * + * => shift < 32 + Log2 cpuFreq - Log2 rate64 + * + * rate64 << shift + * => mult = --------------- + * cpuFreq + * + * (rate64 << shift) must be a 64-bit number: + * + * Log2 rate64 + shift < 64 + * + * => shift < 64 - Log2 rate64 + * + * While cpuFreq is lower than 2^32 Hz, we have: + * + * shift < 32 + Log2 cpuFreq - Log2 rate64 < 64 - Log2 rate64 + * + * As (31 - CLZ32 x) <= Log2 x < (32 - CLZ32 x): + * + * 31 - CLZ32 cpuFreq <= Log2 cpuFreq && + * + * CLZ32 rate64 - 32 < - Log2 rate64 + * + * 31 + CLZ32 rate64 - CLZ32 cpuFreq < 32 + Log2 cpuFreq - Log2 rate64 + * + * As we want shift to be as great as possible: + * + * => shift = 31 + CLZ32 rate64 - CLZ32 cpuFreq + * + * rate64 << shift + * && mult = --------------- + * cpuFreq + * + * + */ + + /* CLZ(MVP_TIMER_RATE64) is optimized by compiler in a constant */ + shift = 31 + CLZ(MVP_TIMER_RATE64) - CLZ(cpuFreq); + mult = MVP_TIMER_RATE64; + mult <<= shift; + do_div(mult, cpuFreq); + + /* verify Log2 mult < 32 */ + ASSERT(mult < (1ULL<<32)); + + /* 
update global variables */ + ttr->mult = mult; + ttr->shift = shift; +} + +/** + * @brief Compute TSC to RATE64 ratio for the current cpu + * @param info TSC frequency in Hz + * @sideeffect Update local cpu tscToRate64 + */ +static void +TscToRate64IPI(void *info) +{ + uint32 cpuFreq = (uint32)info; + + TscToRate64(cpuFreq, &__get_cpu_var(tscToRate64)); +} + +/** + * @brief Handle cpufreq transition notifications. + * @param nb Notifier block + * @param val Notified event + * @param data Linux cpufreq_freqs info + * @return NOTIFY_OK + * + * @note A frequency change can fail in which case PRECHANGE and POSTCHANGE + * will not be paired and you get any number of PRECHANGE and maybe never a + * POSTCHANGE (i.e. there is not enough battery voltage available to support a + * high frequency). + * @note This is called once per cpu core that is changing but not always on + * the core that is changing. + */ +static int +CpuFreqNotifier(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + bool updateRequired; + + /* ASSUMPTION: Only freq. increases can fail and that it is ok to tell the + * guest a higher frequency than it really is but not the other way around + * as that just leads to time "jumping" forward in the guest not backwards. + */ + updateRequired = (val == CPUFREQ_PRECHANGE && freq->new > freq->old) || + (val == CPUFREQ_POSTCHANGE && freq->new < freq->old); + + /* Call TscToRate64() on the correct CPU core so that locking is not + * required. This also has the side-effect of forcing any currently running + * vCPU's to worldswitch back to the host and correctly update the world + * switch page. 
+ */ + if (updateRequired) { + uint32 hz = freq->new * 1000; + smp_call_function_single(freq->cpu, TscToRate64IPI, (void *)hz, false); + } + + return NOTIFY_OK; +} + +/** + * @brief Notifier block for cpufreq transitions + */ +static struct notifier_block cpuFreqNotifierBlock = { + .notifier_call = CpuFreqNotifier +}; + +/** + * @brief Handle cpuUp notifications. + * @param nb Notifier block + * @param action Notified action, e.g., CPU_ONLINE + * @param hcpu cpu no + * @return NOTIFY_OK + */ +static int +CpuUpNotifier(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + /* The new CPU core is not yet executing normal tasks, so it is safe + * to update it's scaling factors from a different core. */ + TscToRate64(GetCpuFrequency(cpu), &per_cpu(tscToRate64, cpu)); + break; + default: + /* + * Ignore all other action notifications, + * such as CPU_UP_PREPARE, CPU_UP_CANCELED, + * CPU_DOWN_PREPARE, CPU_DOWN_FAILED, + * CPU_DYING, CPU_DEAD, CPU_POST_DEAD, etc. + * as they are irrelevant here. + */ + break; + } + + return NOTIFY_OK; +} + +/** + * @brief Notifier block for cpus going online + */ +static struct notifier_block cpuUpNotifierBlock = { + .notifier_call = CpuUpNotifier +}; + +/** + * @brief Initialize TSC ratio and register cpufreq transitions. + */ +void +CpuFreq_Init(void) +{ + int ret; + int cpu; + + /* register callback on frequency change */ + ret = cpufreq_register_notifier(&cpuFreqNotifierBlock, + CPUFREQ_TRANSITION_NOTIFIER); + FATAL_IF(ret < 0); + + /* register callback on cpu core online */ + ret = register_cpu_notifier(&cpuUpNotifierBlock); + FATAL_IF(ret < 0); + + /* Make sure that things are correctly initialized. 
*/ + for_each_online_cpu(cpu) { + TscToRate64(GetCpuFrequency(cpu), &per_cpu(tscToRate64, cpu)); + } +} + +/** + * @brief Exit cpufreq, unregister cpufreq transitions + */ +void +CpuFreq_Exit(void) +{ + cpufreq_unregister_notifier(&cpuFreqNotifierBlock, + CPUFREQ_TRANSITION_NOTIFIER); + unregister_cpu_notifier(&cpuUpNotifierBlock); +} diff --git a/arch/arm/mvp/mvpkm/cpufreq_kernel.h b/arch/arm/mvp/mvpkm/cpufreq_kernel.h new file mode 100644 index 0000000..a84b6dd --- /dev/null +++ b/arch/arm/mvp/mvpkm/cpufreq_kernel.h @@ -0,0 +1,47 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The monitor-kernel socket interface kernel-only definitions. 
+ */ + +#ifndef _CPUFREQ_KERNEL_H +#define _CPUFREQ_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/* Scaling factors to convert CPU clock cycles to Rate64 value */ +struct TscToRate64Cb { + uint32 mult; + uint32 shift; +}; + +/* It is assumed that this is only accessed from the current CPU core and not + * "across cores" */ +DECLARE_PER_CPU(struct TscToRate64Cb, tscToRate64); + +void CpuFreq_Init(void); +void CpuFreq_Exit(void); + +#endif diff --git a/arch/arm/mvp/mvpkm/exc_defs.h b/arch/arm/mvp/mvpkm/exc_defs.h new file mode 100644 index 0000000..1d5d063 --- /dev/null +++ b/arch/arm/mvp/mvpkm/exc_defs.h @@ -0,0 +1,67 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Exception-related definitions. See A2.6 ARM DDI 0100I. 
+ */ + +#ifndef _EXC_DEFS_H_ +#define _EXC_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define EXC_VECTOR_SIZE 0x20 + +#define EXC_RESET_VECTOR_OFFSET 0x00 +#define EXC_UNDEFINED_VECTOR_OFFSET 0x04 +#define EXC_SWI_VECTOR_OFFSET 0x08 +#define EXC_PREFETCH_ABORT_VECTOR_OFFSET 0x0c +#define EXC_DATA_ABORT_VECTOR_OFFSET 0x10 +#define EXC_HYP_VECTOR_OFFSET 0x14 +#define EXC_IRQ_VECTOR_OFFSET 0x18 +#define EXC_FIQ_VECTOR_OFFSET 0x1c + +#define EXC_ARM_UNDEFINED_SAVED_PC_OFFSET 4 +#define EXC_ARM_SWI_SAVED_PC_OFFSET 4 +#define EXC_ARM_PREFETCH_ABORT_SAVED_PC_OFFSET 4 +#define EXC_ARM_DATA_ABORT_SAVED_PC_OFFSET 8 +#define EXC_ARM_IRQ_SAVED_PC_OFFSET 4 +#define EXC_ARM_FIQ_SAVED_PC_OFFSET 4 + +#define EXC_THUMB_UNDEFINED_SAVED_PC_OFFSET 2 +#define EXC_THUMB_SWI_SAVED_PC_OFFSET 2 +#define EXC_THUMB_PREFETCH_ABORT_SAVED_PC_OFFSET 4 +#define EXC_THUMB_DATA_ABORT_SAVED_PC_OFFSET 8 +#define EXC_THUMB_IRQ_SAVED_PC_OFFSET 4 +#define EXC_THUMB_FIQ_SAVED_PC_OFFSET 4 + +#define EXC_SAVED_PC_OFFSET(exc, cpsr) \ + (((cpsr) & ARM_PSR_T) ? EXC_THUMB_##exc##_SAVED_PC_OFFSET : \ + EXC_ARM_##exc##_SAVED_PC_OFFSET) + +#endif /// _EXC_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/exc_types.h b/arch/arm/mvp/mvpkm/exc_types.h new file mode 100644 index 0000000..ba835e5 --- /dev/null +++ b/arch/arm/mvp/mvpkm/exc_types.h @@ -0,0 +1,53 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Exception-related types. See A2.6 ARM DDI 0100I. + */ + +#ifndef _EXC_TYPES_H_ +#define _EXC_TYPES_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @brief ARM hardware exception enumeration. EXC_NONE is added to provide + * a distinguished value to flag non-exception states. + */ +typedef enum { + EXC_NONE, + EXC_RESET, + EXC_UNDEFINED, + EXC_SWI, + EXC_PREFETCH_ABORT, + EXC_DATA_ABORT, + EXC_IRQ, + EXC_FIQ +} ARM_Exception; + +#endif /// _EXC_TYPES_H_ diff --git a/arch/arm/mvp/mvpkm/exitstatus.h b/arch/arm/mvp/mvpkm/exitstatus.h new file mode 100644 index 0000000..c53827a --- /dev/null +++ b/arch/arm/mvp/mvpkm/exitstatus.h @@ -0,0 +1,67 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Exit Status values + */ + +#ifndef _EXITSTATUS_H +#define _EXITSTATUS_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_HOSTUSER +#include "include_check.h" + + +#define _EXIT_STATUS_DEF \ + _EXIT_STATUS_ITEM(Success, 0) \ + _EXIT_STATUS_ITEM(ReturnToHost, 1) \ + _EXIT_STATUS_ITEM(GuestExit, 2) \ + _EXIT_STATUS_ITEM(HostRequest, 3) \ + _EXIT_STATUS_ITEM(VMXFatalError, 4) \ + _EXIT_STATUS_ITEM(VMMFatalError, 5) \ + _EXIT_STATUS_ITEM(MVPDFatalError, 6) \ + _EXIT_STATUS_ITEM(VPNFatalError, 7) \ + _EXIT_STATUS_ITEM(VMXFindCause, 8) + + +enum ExitStatus { +#define _EXIT_STATUS_ITEM(name,num) ExitStatus##name = num, +_EXIT_STATUS_DEF +#undef _EXIT_STATUS_ITEM +}; + +typedef enum ExitStatus ExitStatus; + +#ifndef __cplusplus +static const char * ExitStatusName[] UNUSED = { +#define _EXIT_STATUS_ITEM(name,num) [ExitStatus##name] = #name, +_EXIT_STATUS_DEF +#undef _EXIT_STATUS_ITEM +}; +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/fatalerror.h b/arch/arm/mvp/mvpkm/fatalerror.h new file mode 100644 index 0000000..58e1f98 --- /dev/null +++ b/arch/arm/mvp/mvpkm/fatalerror.h @@ -0,0 +1,126 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief fatal error handlers. They all post fatal errors regardless of build + * type. + */ + +#ifndef _FATALERROR_H +#define _FATALERROR_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mvp_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum FECode { + FECodeMisc, ///< generic FATAL() call of sorts + FECodeOOM, ///< FATAL_OOM() call of sorts + FECodeAssert, ///< ASSERT() call of sorts + FECodeNR, ///< NOT_REACHED() call of sorts + FECodeNI, ///< NOT_IMPLEMENTED() call of sorts + FECodeNT, ///< NOT_TESTED() call of sorts + FECodeCF ///< COMPILE_FAIL() call of sorts +}; +typedef enum FECode FECode; + +#define FATAL() FatalError(__FILE__, __LINE__, FECodeMisc, 0, NULL) +#define FATAL_IF(x) do { if (UNLIKELY(x)) FATAL(); } while (0) +#define FATAL_OOM() FatalError(__FILE__, __LINE__, FECodeOOM, 0, NULL) +#define FATAL_OOM_IF(x) do { if (UNLIKELY(x)) FATAL_OOM(); } while (0) + +extern _Bool FatalError_hit; + +void NORETURN FatalError(char const *file, + int line, + FECode feCode, + int bugno, + char const *fmt, + ...) 
FORMAT(printf,5,6); + +#define FATALERROR_COMMON(printFunc, \ + printFuncV, \ + file, \ + line, \ + feCode, \ + bugno, \ + fmt) { \ + va_list ap; \ + \ + printFunc("FatalError: %s:%d, code %d, bugno %d\n", \ + file, line, feCode, bugno); \ + if (fmt != NULL) { \ + va_start(ap, fmt); \ + printFuncV(fmt, ap); \ + va_end(ap); \ + } \ + } + +#if defined IN_HOSTUSER || defined IN_GUESTUSER || defined IN_WORKSTATION + +#define FATALERROR_POSIX_USER \ +void \ +FatalError_VErrPrintf(const char *fmt, va_list ap) \ +{ \ + vfprintf(stderr, fmt, ap); \ +} \ +\ +void \ +FatalError_ErrPrintf(const char *fmt, ...) \ +{ \ + va_list ap; \ + va_start(ap, fmt); \ + FatalError_VErrPrintf(fmt, ap); \ + va_end(ap); \ +} \ +\ +void NORETURN \ +FatalError(char const *file, \ + int line, \ + FECode feCode, \ + int bugno, \ + const char *fmt, \ + ...) \ +{ \ + FATALERROR_COMMON(FatalError_ErrPrintf, FatalError_VErrPrintf, file, line, feCode, bugno, fmt); \ + exit(EXIT_FAILURE); \ +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/include_check.h b/arch/arm/mvp/mvpkm/include_check.h new file mode 100644 index 0000000..2eeafe7 --- /dev/null +++ b/arch/arm/mvp/mvpkm/include_check.h @@ -0,0 +1,18 @@ +/* + * Linux 2.6.32 and later Kernel module for Empty File Placeholder + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. 
If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ diff --git a/arch/arm/mvp/mvpkm/instr_defs.h b/arch/arm/mvp/mvpkm/instr_defs.h new file mode 100644 index 0000000..ce6b1c9 --- /dev/null +++ b/arch/arm/mvp/mvpkm/instr_defs.h @@ -0,0 +1,426 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief ARM instruction encoding/decoding macros. + */ + +#ifndef _INSTR_DEFS_H_ +#define _INSTR_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "utils.h" + +/** + * @name ARM register synonyms. 
+ * @{ + */ +#define ARM_REG_R0 0 +#define ARM_REG_R1 1 +#define ARM_REG_R2 2 +#define ARM_REG_R3 3 +#define ARM_REG_R4 4 +#define ARM_REG_R5 5 +#define ARM_REG_R6 6 +#define ARM_REG_R7 7 +#define ARM_REG_R8 8 +#define ARM_REG_R9 9 +#define ARM_REG_R10 10 +#define ARM_REG_FP 11 +#define ARM_REG_IP 12 +#define ARM_REG_SP 13 +#define ARM_REG_LR 14 +#define ARM_REG_PC 15 +/*@}*/ + +/** + * @name Data-processing + load/store instruction register field decoding. + * + * @note The following constants and masks used to fetch register operands + * are meant to be used in strictly the following sets of instructions: + * data processing instructions and load/store instructions. + * If you want to fetch the RN, RD, RS and RM fields for some other + * type of instructions, please verify before using -- you may have + * to introduce special constants just for those sets of instructions. + * For instance, all the multiply and signed multiply instructions have + * RN and RD reversed. So, @b BEWARE ! + * + * @{ + */ +#define ARM_INSTR_RN_BIT_POS 16 +#define ARM_INSTR_RD_BIT_POS 12 +#define ARM_INSTR_RS_BIT_POS 8 +#define ARM_INSTR_RM_BIT_POS 0 + +#define ARM_INSTR_RN_LENGTH 4 +#define ARM_INSTR_RD_LENGTH 4 +#define ARM_INSTR_RS_LENGTH 4 +#define ARM_INSTR_RM_LENGTH 4 + +#define ARM_INSTR_RN(r) \ + MVP_EXTRACT_FIELD((r), ARM_INSTR_RN_BIT_POS, ARM_INSTR_RN_LENGTH) +#define ARM_INSTR_RD(r) \ + MVP_EXTRACT_FIELD((r), ARM_INSTR_RD_BIT_POS, ARM_INSTR_RD_LENGTH) +#define ARM_INSTR_RS(r) \ + MVP_EXTRACT_FIELD((r), ARM_INSTR_RS_BIT_POS, ARM_INSTR_RS_LENGTH) +#define ARM_INSTR_RM(r) \ + MVP_EXTRACT_FIELD((r), ARM_INSTR_RM_BIT_POS, ARM_INSTR_RM_LENGTH) + +#define ARM_INSTR_RN_SHIFT(word) ((word) << ARM_INSTR_RN_BIT_POS) +#define ARM_INSTR_RD_SHIFT(word) ((word) << ARM_INSTR_RD_BIT_POS) +#define ARM_INSTR_RS_SHIFT(word) ((word) << ARM_INSTR_RS_BIT_POS) +#define ARM_INSTR_RM_SHIFT(word) ((word) << ARM_INSTR_RM_BIT_POS) + +#define ARM_INSTR_RN_MASK (~(ARM_INSTR_RN_SHIFT(0xf))) +#define 
ARM_INSTR_RD_MASK (~(ARM_INSTR_RD_SHIFT(0xf))) +#define ARM_INSTR_RS_MASK (~(ARM_INSTR_RS_SHIFT(0xf))) +#define ARM_INSTR_RM_MASK (~(ARM_INSTR_RM_SHIFT(0xf))) +/*@}*/ + +/** + * @name Condition field -- common bit layout across all ARM instructions. + * @{ + */ +#define ARM_INSTR_COND(instr) (MVP_EXTRACT_FIELD(instr, 28, 4)) + +#define ARM_INSTR_COND_EQ 0x0 ///< Equal +#define ARM_INSTR_COND_NE 0x1 ///< Not equal +#define ARM_INSTR_COND_CS 0x2 ///< Carry set/unsigned higher or same +#define ARM_INSTR_COND_CC 0x3 ///< Carry clear/unsigned lower +#define ARM_INSTR_COND_MI 0x4 ///< Minus/negative +#define ARM_INSTR_COND_PL 0x5 ///< Plus/positive or zero +#define ARM_INSTR_COND_VS 0x6 ///< Overflow +#define ARM_INSTR_COND_VC 0x7 ///< No overflow +#define ARM_INSTR_COND_HI 0x8 ///< Unsigned higher +#define ARM_INSTR_COND_LS 0x9 ///< Unsigned lower or same +#define ARM_INSTR_COND_GE 0xa ///< Signed greater than or equal +#define ARM_INSTR_COND_LT 0xb ///< Signed less than +#define ARM_INSTR_COND_GT 0xc ///< Signed greater than +#define ARM_INSTR_COND_LE 0xd ///< Signed less than or equal +#define ARM_INSTR_COND_AL 0xe ///< Always (unconditional) +#define ARM_INSTR_COND_NV 0xf ///< Invalid +/*@}*/ + +/** + * @name Load/store instruction decoding. + * @{ + */ + +/* + * I bit indicating Register(1)/Immediate(0) addressing modes. + */ +#define ARM_INSTR_LDST_IBIT(instr) (MVP_BIT(instr, 25)) + +/* + * U bit indicating whether the offset is added to the base (U == 1) + * or is subtracted from the base (U == 0). + */ +#define ARM_INSTR_LDST_UBIT(instr) (MVP_BIT(instr, 23)) + +/* + * B bit indicating byte(1)/word(0). + */ +#define ARM_INSTR_LDST_BBIT(instr) (MVP_BIT(instr, 22)) + +/* + * L bit indicating ld(1)/st(0). + */ +#define ARM_INSTR_LDST_LBIT(instr) (MVP_BIT(instr, 20)) + +/* + * Shifter operand. + */ +#define ARM_INSTR_LDST_SHIFTER_OPERAND(instr) (MVP_EXTRACT_FIELD(instr, 0, 12)) + +/* + * Immediate offset (12-bits wide) for load/store. 
+ */ +#define ARM_INSTR_LDST_IMMEDIATE(instr) (MVP_EXTRACT_FIELD(instr, 0, 12)) + +/* + * Register List for multiple ld/st. + */ +#define ARM_INSTR_LDMSTM_REGLIST(instr) (MVP_EXTRACT_FIELD(instr, 0, 16)) + +/* + * Immediate Offset for Miscellaneous ld/st instructions. + */ +#define ARM_INSTR_MISC_LDST_IMM_OFFSET(instr) \ + ((MVP_EXTRACT_FIELD(instr, 8, 4) << 4) | MVP_EXTRACT_FIELD(instr, 0, 4)) + +/*@}*/ + +/** + * @name Thumb ldrt/strt instruction decoding. + * @{ + */ + +/* + * L bit indicating ld(1)/st(0). + */ +#define ARM_THUMB_INSTR_LDST_LBIT(instr) (MVP_BIT(instr, 20)) + +/* + * W bit indicating word(1)/byte(0). + */ +#define ARM_THUMB_INSTR_LDST_WBIT(instr) (MVP_BIT(instr, 22)) + +/* + * Immediate offset (8-bits wide) for load/store. + */ +#define ARM_THUMB_INSTR_LDST_IMMEDIATE(instr) (MVP_EXTRACT_FIELD(instr, 0, 8)) + +/*@}*/ + +/** + * @name ARM instruction opcodes. + * @{ + */ +#define ARM_OP_BR_A1 0x0a000000 +#define ARM_OP_BX_A1 0x012fff10 +#define ARM_OP_LDR_LIT_A1 0x051f0000 +#define ARM_OP_MOV_A1 0x01a00000 +#define ARM_OP_MOVW_A2 0x03000000 +#define ARM_OP_MOVT_A1 0x03400000 +#define ARM_OP_MRS_A1 0x01000000 +#define ARM_OP_MSR_T1 0x8000f380 +#define ARM_OP_MSR_A1 0x0120f000 +#define ARM_OP_HVC_A1 0x01400070 +#define ARM_OP_ERET_T1 0x8f00f3de +#define ARM_OP_ERET_A1 0x0160006e + +/* + * Set SYSm[5] = 1 for VE MSR/MRS, see p77-78 ARM PRD03-GENC-008353 10.0. + */ +#define ARM_OP_MRS_EXT_A1 (ARM_OP_MRS_A1 | (1 << 9)) +#define ARM_OP_MSR_EXT_T1 (ARM_OP_MSR_T1 | (1 << 21)) +#define ARM_OP_MSR_EXT_A1 (ARM_OP_MSR_A1 | (1 << 9)) + +#define ARM_OP_I 0x02000000 +#define ARM_OP_S 0x00100000 +#define ARM_OP_W 0x00200000 +/*@}*/ + +/** + * @name ARM instruction class - see Figure A3-1 ARM DDI 0100I. + * @{ + */ +#define ARM_INSTR_CLASS(instr) MVP_BITS(instr, 25, 27) + +#define ARM_INSTR_CLASS_BRANCH 0x5 +/*@}*/ + +/** + * @name ARM instruction opcode - see Figure A3-1 ARM DDI 0100I. Does not + * include extension bits 4-7. 
+ * @{ + */ +#define ARM_INSTR_OPCODE(instr) MVP_EXTRACT_FIELD(instr, 20, 8) + +#define ARM_INSTR_OPCODE_EQ(instr1, instr2) \ + (ARM_INSTR_OPCODE(instr1) == ARM_INSTR_OPCODE(instr2)) +/*@}*/ + +/** + * @brief Extract the offset in a branch instruction - i.e., the least + * significant 24 bits sign extended. + */ +#define ARM_INSTR_BRANCH_TARGET(inst) (((int32)(inst) << 8) >> 6) + +/** + * @brief Check if a potential branch target is outside the encodable distance. + */ +#define ARM_INSTR_BRANCH_TARGET_OVERFLOWS(v) ((v) + (1 << 25) >= (1<< 26)) + +/** + * @brief Modify branch instruction encoding 'ins' with 'offset' as the + * new target. + */ +#define ARM_INSTR_BRANCH_UPDATE_OFFSET(ins, offset) \ + (((ins) & MVP_MASK(24, 8)) | (((offset) >> 2) & MVP_MASK(0, 24))) + +/** + * @brief B instruction encoding - see A8.6.16 ARM DDI 0406A. + */ +#define ARM_INSTR_BR_ENC(cond, offset) \ + (((cond) << 28) | ARM_OP_BR_A1 | MVP_BITS(((uint32)offset) >> 2, 0, 23)) + +/** + * @brief BX instruction encoding + */ +#define ARM_INSTR_BX_ENC(cond, rm) \ + (((cond) << 28) | ARM_OP_BX_A1 | (rm)) + +/** + * @brief LDR +literal instruction encoding - see A8.6.59 ARM DDI 0406A. + */ +#define ARM_INSTR_LDR_LIT_ADD_ENC(cond, reg, offset) \ + (((cond) << 28) | ARM_OP_LDR_LIT_A1 | (1 << 23) | ((reg) << 12) | (offset)) + +/** + * @brief Generate encoding of the instruction mov rd, rn. + */ +#define ARM_INSTR_MOV_A1_ENC(cond, rd, rn) \ + ((((cond) << 28) | ARM_OP_MOV_A1 | ((rd) << 12) | (rn))) + +/** + * @name Encoding/decoding of MOVT/W instructions.
+ * @{ + */ +#define ARM_INSTR_MOVTW_IMMED(instr) \ + (MVP_BITS(instr, 0, 11) | (MVP_BITS(instr, 16, 19) << 12)) + +#define ARM_INSTR_MOVW_A2_ENC(cond,rd,immed) \ + (((cond) << 28) | ARM_OP_MOVW_A2 | (MVP_BITS(immed, 12, 15) << 16) | \ + ((rd) << 12) | MVP_BITS(immed, 0, 11)) + +#define ARM_INSTR_MOVT_A1_ENC(cond,rd,immed) \ + (((cond) << 28) | ARM_OP_MOVT_A1 | \ + (MVP_BITS(((immed) >> 16), 12, 15) << 16) | \ + ((rd) << 12) | MVP_BITS(((immed) >> 16), 0, 11)) +/*@}*/ + +/** + * @brief BKPT instruction encoding - see A4.1.7 ARM DDI 0100I. + */ +#define ARM_INSTR_BKPT_ENC(immed) \ + (0xe1200070 | \ + MVP_EXTRACT_FIELD(immed, 0, 4) | \ + (MVP_EXTRACT_FIELD(immed, 4, 12) << 8)) + +/** + * @name VE instruction encodings - see section 13 ARM PRD03-GENC-008353 10.0. + * @{ + */ +#define ARM_INSTR_HVC_A1_ENC(immed) \ + ((ARM_INSTR_COND_AL << 28) | ARM_OP_HVC_A1 | \ + MVP_EXTRACT_FIELD(immed, 0, 4) | \ + (MVP_EXTRACT_FIELD(immed, 4, 12) << 8)) + +#define ARM_INSTR_ERET_A1_ENC(cond) \ + (((cond) << 28) | ARM_OP_ERET_A1) + +/* + * R=0 + */ +#define ARM_REG_R8_USR 0 +#define ARM_REG_R9_USR 1 +#define ARM_REG_R10_USR 2 +#define ARM_REG_R11_USR 3 +#define ARM_REG_R12_USR 4 +#define ARM_REG_SP_USR 5 +#define ARM_REG_LR_USR 6 +#define ARM_REG_R8_FIQ 8 +#define ARM_REG_R9_FIQ 9 +#define ARM_REG_R10_FIQ 10 +#define ARM_REG_FP_FIQ 11 +#define ARM_REG_IP_FIQ 12 +#define ARM_REG_SP_FIQ 13 +#define ARM_REG_LR_FIQ 14 +#define ARM_REG_LR_IRQ 16 +#define ARM_REG_SP_IRQ 17 +#define ARM_REG_LR_SVC 18 +#define ARM_REG_SP_SVC 19 +#define ARM_REG_LR_ABT 20 +#define ARM_REG_SP_ABT 21 +#define ARM_REG_LR_UND 22 +#define ARM_REG_SP_UND 23 +#define ARM_REG_LR_MON 28 +#define ARM_REG_SP_MON 29 +#define ARM_REG_ELR_HYP 30 +#define ARM_REG_SP_HYP 31 + +/* + * R=1 + */ +#define R_EXTEND(x) ((1 << 5) | (x)) +#define ARM_REG_SPSR_FIQ R_EXTEND(ARM_REG_LR_FIQ) +#define ARM_REG_SPSR_IRQ R_EXTEND(ARM_REG_LR_IRQ) +#define ARM_REG_SPSR_SVC R_EXTEND(ARM_REG_LR_SVC) +#define ARM_REG_SPSR_ABT 
R_EXTEND(ARM_REG_LR_ABT) +#define ARM_REG_SPSR_UND R_EXTEND(ARM_REG_LR_UND) +#define ARM_REG_SPSR_MON R_EXTEND(ARM_REG_LR_MON) +#define ARM_REG_SPSR_HYP R_EXTEND(ARM_REG_ELR_HYP) + +#define ARM_INSTR_MSR_EXT_T1_ENC(rm,rn) \ + (ARM_OP_MSR_EXT_T1 | (MVP_BIT(rm, 5) << 4) | \ + (MVP_BIT(rm, 4) << 20) | (MVP_EXTRACT_FIELD(rm, 0, 4) << 24) | ((rn) << 0)) + +#define ARM_INSTR_MSR_EXT_A1_ENC(cond,rm,rn) \ + (((cond) << 28) | ARM_OP_MSR_EXT_A1 | (MVP_BIT(rm, 5) << 22) | \ + (MVP_BIT(rm, 4) << 8) | (MVP_EXTRACT_FIELD(rm, 0, 4) << 16) | ((rn) << 0)) + +#define ARM_INSTR_MRS_EXT_A1_ENC(cond,rd,rm) \ + (((cond) << 28) | ARM_OP_MRS_EXT_A1 | (MVP_BIT(rm, 5) << 22) | \ + (MVP_BIT(rm, 4) << 8) | (MVP_EXTRACT_FIELD(rm, 0, 4) << 16) | ((rd) << 12)) +/*@}*/ + +/** + * @name ARM MCR/MRC/MCRR instruction decoding. + * @{ + */ +#define ARM_INSTR_COPROC_CR_LEN 4 +#define ARM_INSTR_COPROC_CR_MAX (1 << ARM_INSTR_COPROC_CR_LEN) +#define ARM_INSTR_COPROC_OPCODE_LEN 3 +#define ARM_INSTR_COPROC_OPCODE_MAX (1 << ARM_INSTR_COPROC_OPCODE_LEN) + +#define ARM_INSTR_COPROC_CRM(instr) MVP_EXTRACT_FIELD(instr, 0, 4) +#define ARM_INSTR_COPROC_CRN(instr) MVP_EXTRACT_FIELD(instr, 16, 4) +#define ARM_INSTR_COPROC_OPCODE1(instr) MVP_EXTRACT_FIELD(instr, 21, 3) +#define ARM_INSTR_COPROC_OPCODE2(instr) MVP_EXTRACT_FIELD(instr, 5, 3) +#define ARM_INSTR_COPROC_OPCODE(instr) MVP_EXTRACT_FIELD(instr, 4, 4) +#define ARM_INSTR_COPROC_CPNUM(instr) MVP_EXTRACT_FIELD(instr, 8, 4) +/*@}*/ + +/** + * @name ARM VMRS/VMSR instruction decoding -- See VMRS (B6.1.14) + * and VMSR (B6.1.15) in ARM DDI 0406B. 
+ * @{ + */ +#define ARM_INSTR_IS_VMRS(instr) ((MVP_EXTRACT_FIELD(instr, 0, 12) == 0xa10) && \ + (ARM_INSTR_OPCODE(instr) == 0xef)) + +#define ARM_INSTR_IS_VMSR(instr) ((MVP_EXTRACT_FIELD(instr, 0, 12) == 0xa10) && \ + (ARM_INSTR_OPCODE(instr) == 0xee)) + +#define ARM_INSTR_VMRS_SPECREG(instr) MVP_EXTRACT_FIELD(instr, 16, 4) +#define ARM_INSTR_VMRS_RT(instr) MVP_EXTRACT_FIELD(instr, 12, 4) + +#define ARM_INSTR_VMSR_SPECREG(instr) MVP_EXTRACT_FIELD(instr, 16, 4) +#define ARM_INSTR_VMSR_RT(instr) MVP_EXTRACT_FIELD(instr, 12, 4) +/*@}*/ + +/** + * @name ARM SWP{B} instruction checking. + * @{ + */ +#define ARM_INSTR_IS_SWP(instr) ((instr & 0x0fb00ff0) == 0x01000090) +/*@}*/ + +#endif /// _INSTR_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/lowmemkiller_variant.sh b/arch/arm/mvp/mvpkm/lowmemkiller_variant.sh new file mode 100644 index 0000000..2c9ab50 --- /dev/null +++ b/arch/arm/mvp/mvpkm/lowmemkiller_variant.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# +# Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support +# +# Copyright (C) 2010-2012 VMware, Inc. All rights reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 as published by +# the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; see the file COPYING. If not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# @brief Script providing the variant of the low memory killer implementation +# to assist in mvpkm's export of the other_file calculation. 
+ +if [ -z "$1" ] +then + echo "Usage: $0 " + exit 1 +fi + +# We look at the relevant section of the lowmem_shrink function here. This +# pattern is sufficient to distinguish between the known variants without +# introducing too many false positives for new variants. I.e. we can spot the +# lines that matter for the other_file calculation. In some cases the +# lowmemorykiller uses only the other_file calculation instead of max(free, +# file) - in the cases we've seen this is OK with the balloon policy, since the +# free term isn't really significant when we get into low memory states anyway. + +tmp_file="lmk_md5sum_$RANDOM" + +cat $1 | tr -d '\ \t\n\r' > $tmp_file +sed -i -e 's/.*\(intother_file.*other_file<\).*/;\1/' \ + -e 's/[;][^;]*other_file[^;]*/#<#&#>#/g' \ + -e 's/#>#[^#]*//g' $tmp_file + +MD5=`md5sum $tmp_file | cut -f1 -d\ ` + +rm $tmp_file + +case $MD5 in +4af66fafb5e4cbd7b4092e29e071f152|\ +a0f18472eb53e52b38d6f85d4ec66842|\ +590b89af56f57146edffceba60845ad8|\ +fddbb73a58e82ba1966fd862a561c2bd) + #/* + # * This is the same as the non-exported global_reclaimable_pages() when there + # * is no swap. + # */ + #other_file = global_page_state(NR_ACTIVE_FILE) + + # global_page_state(NR_INACTIVE_FILE); + V=1 +;; +943372c447dd868845d71781292eae17|\ +14d0cc4189c1f4fd7818c3393cc8c311) + # other_file = global_page_state(NR_FILE_PAGES); + V=2 +;; +59f3bb678a855acfea2365b7a904bc5b|\ +df96cbb1784869ac7d017dd343e4e8f2) + # other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM); + V=3 +;; +ed03b69361c2881ed1a031c9b9a24d8a|\ +8639aca416d3014d68548d6cb538405b) + # other_file = global_page_state(NR_FREE_PAGES) + global_page_state(NR_FILE_PAGES); + # (other_free not used, but max(other_free, other_file) = other_file in this + # case. 
+ V=4 +;; +*) + V=0 +;; +esac + +echo "$MD5 $V" diff --git a/arch/arm/mvp/mvpkm/lpae_defs.h b/arch/arm/mvp/mvpkm/lpae_defs.h new file mode 100644 index 0000000..7729268 --- /dev/null +++ b/arch/arm/mvp/mvpkm/lpae_defs.h @@ -0,0 +1,92 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Large physical address extension definitions. + * + * See ARM PRD03-GENC-008469 11.0. 
+ */ +#ifndef _LPAE_DEFS_H_ +#define _LPAE_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define ARM_LPAE_PT_ORDER 12 + +#define ARM_LPAE_PT_SIZE (1 << ARM_LPAE_PT_ORDER) +#define ARM_LPAE_ENTRY_ORDER 3 +#define ARM_LPAE_PT_ENTRIES_ORDER (ARM_LPAE_PT_ORDER - ARM_LPAE_ENTRY_ORDER) +#define ARM_LPAE_PT_ENTRIES (1 << ARM_LPAE_PT_ENTRIES_ORDER) + +#define ARM_LPAE_L1D_BLOCK_ORDER 30 +#define ARM_LPAE_L2D_BLOCK_ORDER 21 +#define ARM_LPAE_L3D_BLOCK_ORDER 12 + +#define ARM_LPAE_L1D_BLOCK_BITS (40 - ARM_LPAE_L1D_BLOCK_ORDER) +#define ARM_LPAE_L2D_BLOCK_BITS (40 - ARM_LPAE_L2D_BLOCK_ORDER) +#define ARM_LPAE_L3D_BLOCK_BITS (40 - ARM_LPAE_L3D_BLOCK_ORDER) + +/* + * Currently supporting up to 16GB PA spaces. + */ +#define ARM_LPAE_L1PT_INDX(addr) \ + MVP_EXTRACT_FIELD64(addr, ARM_LPAE_L1D_BLOCK_ORDER, 4) +#define ARM_LPAE_L2PT_INDX(addr) \ + MVP_EXTRACT_FIELD64(addr, ARM_LPAE_L2D_BLOCK_ORDER, ARM_LPAE_PT_ENTRIES_ORDER) +#define ARM_LPAE_L3PT_INDX(addr) \ + MVP_EXTRACT_FIELD64(addr, ARM_LPAE_L3D_BLOCK_ORDER, ARM_LPAE_PT_ENTRIES_ORDER) + +#define ARM_LPAE_L1D_BLOCK_BASE_ADDR(base) ((base) << ARM_LPAE_L1D_BLOCK_ORDER) +#define ARM_LPAE_L1D_BLOCK_ADDR_BASE(addr) ((addr) >> ARM_LPAE_L1D_BLOCK_ORDER) +#define ARM_LPAE_L2D_BLOCK_BASE_ADDR(base) ((base) << ARM_LPAE_L2D_BLOCK_ORDER) +#define ARM_LPAE_L2D_BLOCK_ADDR_BASE(addr) ((addr) >> ARM_LPAE_L2D_BLOCK_ORDER) +#define ARM_LPAE_L3D_BLOCK_BASE_ADDR(base) ((base) << ARM_LPAE_L3D_BLOCK_ORDER) +#define ARM_LPAE_L3D_BLOCK_ADDR_BASE(addr) ((addr) >> ARM_LPAE_L3D_BLOCK_ORDER) + +#define ARM_LPAE_TABLE_BASE_ADDR(base) ((base) << ARM_LPAE_PT_ORDER) +#define ARM_LPAE_TABLE_ADDR_BASE(addr) ((addr) >> ARM_LPAE_PT_ORDER) + +#define ARM_LPAE_TYPE_INVALID 0 +#define ARM_LPAE_TYPE_TABLE 3 +#define ARM_LPAE_L1D_TYPE_BLOCK 1 +#define ARM_LPAE_L2D_TYPE_BLOCK 1 +#define 
ARM_LPAE_L3D_TYPE_BLOCK 3 + +/** + * @name Second stage permission model. + * + * @{ + */ +#define ARM_LPAE_S2_PERM_NONE 0 +#define ARM_LPAE_S2_PERM_RO 1 +#define ARM_LPAE_S2_PERM_WO 2 +#define ARM_LPAE_S2_PERM_RW 3 +/*@}*/ + + +#endif /// ifndef _LPAE_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/lpae_types.h b/arch/arm/mvp/mvpkm/lpae_types.h new file mode 100644 index 0000000..292f0d9 --- /dev/null +++ b/arch/arm/mvp/mvpkm/lpae_types.h @@ -0,0 +1,124 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Large physical address extension types. + * + * See ARM PRD03-GENC-008469 11.0. + */ +#ifndef _LPAE_TYPES_H_ +#define _LPAE_TYPES_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "lpae_defs.h" + +/** + * @name ARM LPAE page table descriptors. See p7-8 ARM PRD03-GENC-008469 11.0. 
+ * @{ + */ + +#define LOWER_PAGE_ATTRIBUTES_STAGE1 \ + uint64 attrIndx : 3; \ + uint64 ns : 1; \ + uint64 ap : 2; \ + uint64 sh : 2; \ + uint64 af : 1; \ + uint64 ng : 1; + +#define LOWER_PAGE_ATTRIBUTES_STAGE2 \ + uint64 memAttr : 4; \ + uint64 hap : 2; \ + uint64 sh : 2; \ + uint64 af : 1; \ + uint64 sbzL : 1; + +#define UPPER_PAGE_ATTRIBUTES_STAGE1 \ + uint64 contig : 1; \ + uint64 pxn : 1; \ + uint64 xn : 1; \ + uint64 sw : 4; \ + uint64 ignU : 5; + +#define UPPER_PAGE_ATTRIBUTES_STAGE2 \ + uint64 contig : 1; \ + uint64 sbzU : 1; \ + uint64 xn : 1; \ + uint64 sw : 4; \ + uint64 ignU : 5; + + +#define ARM_LPAE_DESC_TYPE(lvl,blen,sbzpad) \ + typedef union { \ + uint64 u; \ + \ + struct { \ + uint64 type : 2;\ + uint64 ign : 62; \ + } x; \ + \ + struct { \ + uint64 type : 2;\ + LOWER_PAGE_ATTRIBUTES_STAGE1 \ + sbzpad \ + uint64 base : blen; \ + uint64 sbz : 12; \ + UPPER_PAGE_ATTRIBUTES_STAGE1 \ + } blockS1; \ + \ + struct { \ + uint64 type : 2;\ + LOWER_PAGE_ATTRIBUTES_STAGE2 \ + sbzpad \ + uint64 base : blen; \ + uint64 sbz : 12; \ + UPPER_PAGE_ATTRIBUTES_STAGE2 \ + } blockS2; \ + \ + struct { \ + uint64 type : 2;\ + uint64 ign0 : 10; \ + uint64 base : 28; \ + uint64 sbz : 12; \ + uint64 ign1 : 7; \ + uint64 pxn : 1; \ + uint64 xn : 1; \ + uint64 ap : 2; \ + uint64 ns : 1; \ + } table; \ + \ + } ARM_LPAE_L##lvl##D; + + +ARM_LPAE_DESC_TYPE(1, ARM_LPAE_L1D_BLOCK_BITS, uint64 sbzP : 18;) +ARM_LPAE_DESC_TYPE(2, ARM_LPAE_L2D_BLOCK_BITS, uint64 sbzP : 9;) +ARM_LPAE_DESC_TYPE(3, ARM_LPAE_L3D_BLOCK_BITS, ) + +/*@}*/ + +#endif /// ifndef _LPAE_TYPES_H_ diff --git a/arch/arm/mvp/mvpkm/mksck.h b/arch/arm/mvp/mvpkm/mksck.h new file mode 100644 index 0000000..aac00f7 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck.h @@ -0,0 +1,153 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +#ifndef _MKSCK_H +#define _MKSCK_H + +/** + * @file + * + * @brief The monitor-kernel socket interface definitions. + * + * The monitor kernel socket interface was created for (what the name + * says) communications between the monitor and host processes. On the + * monitor side a special API is introduced, see mksck_vmm.h. On the + * host side the API is the standard Berkeley socket interface. Host + * process to host process or monitor to monitor communication is not + * supported. + * + * A generic address consists of two 16 bit fields: the vm id and the + * port id. Both hosts (vmx) and monitors (vmm) get their vm id + * automatically. The host vm id is assigned at the time the host + * process opens the mvpkm file descriptor, while the monitor vm id is + * assigned when the vmx.c:SetupWorldSwitchPage() calls + * Mvpkm_SetupIds(). As a vmx may create multiple monitors to service + * an MP guest, a vmx vm id may be associated with multiple monitor vm + * ids. A monitor id, however, has a single associated vmx host id, + * the id of its canonical vmx. + * + * Sockets on the host get their addresses either by explicit user + * call (the bind command) or implicitly by (issuing a send command + * first). 
At an explicit bind the user may omit one or both fields by + * providing MKSCK_VMID_UNDEF/MKSCK_PORT_UNDEF respectively. An + * implicit bind behaves as if both fields were omitted in an explicit + * bind. The default value of the vmid field is the vmid computed from + * the thread group id while that of a port is a new number. It is not + * invalid to bind a host process socket with a vm id different from + * the vmid computed from the tgid. + * + * Sockets of the monitor are automatically assigned a vmid, that of their + * monitor, at the time of their creation. The port id can be assigned by the + * user or left to the implementation to assign an unused one (by specifying + * MKSCK_PORT_UNDEF at @ref Mksck_Open). + * + * Host unconnected sockets may receive from any monitor sender, may send to any + * monitor socket. A socket can be connected to a peer address, that enables the + * use of the send command. + * + * One of many special predefined port (both host and monitor) is + * MKSCK_PORT_MASTER. It is used for initialization. + * + * Monitor sockets have to send their peer address explicitly (by + * Mksck_SetPeer()) or implicitly by receiving first. After the peer + * is set, monitor sockets may send or receive only to/from their + * peer. + */ + + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "vmid.h" + +/* + * The interface limits the size of transferable packets. 
+ */ +#define MKSCK_XFER_MAX 1024 + +#define MKSCK_ADDR_UNDEF (uint32)0xffffffff + +#define MKSCK_PORT_UNDEF (uint16)0xffff +#define MKSCK_PORT_MASTER (MKSCK_PORT_UNDEF-1) +#define MKSCK_PORT_HOST_FB (MKSCK_PORT_UNDEF-2) +#define MKSCK_PORT_BALLOON (MKSCK_PORT_UNDEF-3) +#define MKSCK_PORT_HOST_HID (MKSCK_PORT_UNDEF-4) +#define MKSCK_PORT_CHECKPOINT (MKSCK_PORT_UNDEF-5) +#define MKSCK_PORT_COMM_EV (MKSCK_PORT_UNDEF-6) +#define MKSCK_PORT_HIGH (MKSCK_PORT_UNDEF-7) + +#define MKSCK_VMID_UNDEF VMID_UNDEF +#define MKSCK_VMID_HIGH (MKSCK_VMID_UNDEF-1) + +#define MKSCK_DETACH 3 + +typedef uint16 Mksck_Port; +typedef VmId Mksck_VmId; + +/** + * @brief Page descriptor for typed messages. Each page describes a region of + * the machine address space with base mpn and size 2^(12 + order) bytes. + */ +typedef struct { + uint32 mpn : 20; ///< Base MPN of region described by page + uint32 order : 12; ///< Region is 2^(12 + order) bytes. +} Mksck_PageDesc; + +/** + * @brief Typed message template macro. Allows us to avoid having two message + * types, one with page descriptor vector (for VMM), one without (for + * VMX). + * + * @param type C type of uninterpreted component of the message (following the + * page descriptor vector). + * @param pages number of page descriptors in vector. 
+ */ +#define MKSCK_DESC_TYPE(type,pages) \ + struct { \ + type umsg; \ + Mksck_PageDesc page[pages]; \ + } + +/** + * @brief The monitor kernel socket interface address format + */ +typedef union { + uint32 addr; ///< the address + struct { /* The address is decomposed to two shorts */ + Mksck_Port port; ///< port unique within a vmid + Mksck_VmId vmId; ///< unique vmid + }; +} Mksck_Address; + +static inline uint32 +Mksck_AddrInit(Mksck_VmId vmId, Mksck_Port port) +{ + Mksck_Address aa; + aa.vmId = vmId; + aa.port = port; + return aa.addr; +} +#endif diff --git a/arch/arm/mvp/mvpkm/mksck_kernel.c b/arch/arm/mvp/mvpkm/mksck_kernel.c new file mode 100644 index 0000000..6811a68 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck_kernel.c @@ -0,0 +1,2589 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The monitor/kernel socket interface kernel extension. 
+ */ + +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include "mvp.h" +#include "actions.h" +#include "mvpkm_kernel.h" +#include "mksck_kernel.h" +#include "mksck_sockaddr.h" +#include "mutex_kernel.h" + +void NORETURN FatalError(char const *file, + int line, + FECode feCode, + int bugno, + char const *fmt, + ...) +{ + /* Lock around printing the error details so that the messages from multiple + * threads are not interleaved. */ + static DEFINE_MUTEX(fatalErrorMutex); + mutex_lock(&fatalErrorMutex); + + FATALERROR_COMMON(printk, vprintk, file, line, feCode, bugno, fmt); + + dump_stack(); + + /* done printing */ + mutex_unlock(&fatalErrorMutex); + + /* do_exit below exits the current thread but does not crash the kernel. + * Hence the stack dump will actually be readable from other user threads. + */ + do_exit(1); +} + + +/* + * The project uses a new address family: AF_MKSCK. Optimally this address + * family were accepted with the Linux community and a permanent number + * were assigned. This, however, is a dream only, not even the x86 team + * has been able to pull it off. + * + * Instead we ASSUME that DECnet is dead and re-use it's address family number. + * This is what the x86 world is moving too in the latest versions. 
+ */ + +static struct proto mksckProto = { + .name = "AF_MKSCK", + .owner = THIS_MODULE, + .obj_size = sizeof (struct sock), +}; + +static int MksckCreate(struct net *net, + struct socket *sock, + int protocol, + int kern); + +static struct net_proto_family mksckFamilyOps = { + .family = AF_MKSCK, + .owner = THIS_MODULE, + .create = MksckCreate, +}; + +static int MksckFault(struct vm_area_struct *vma, struct vm_fault *vmf); + + +/** + * @brief Linux vma operations for receive windows established via Mksck + * mmap. + */ +static struct vm_operations_struct mksckVMOps = { + .fault = MksckFault +}; + +/* + * List of hosts and guests we know about. + */ +static spinlock_t mksckPageListLock; +static MksckPage *mksckPages[MKSCK_MAX_SHARES]; + +/* + * The following functions form the AF_MKSCK DGRAM operations. + */ +static int MksckRelease(struct socket *sock); +static int MksckBacklogRcv(struct sock *sk, struct sk_buff *skb); +static void MksckSkDestruct(struct sock *sk); +static int MksckBind(struct socket *sock, + struct sockaddr *addr, + int addrLen); +static int MksckBindGeneric(struct sock *sk, + Mksck_Address addr); +static int MksckDgramRecvMsg(struct kiocb *kiocb, + struct socket *sock, + struct msghdr *msg, + size_t len, + int flags); +static int MksckDgramSendMsg(struct kiocb *kiocb, + struct socket *sock, + struct msghdr *msg, + size_t len); +static int MksckGetName(struct socket *sock, + struct sockaddr *addr, + int *addrLen, + int peer); +static unsigned int MksckPoll(struct file *filp, + struct socket *sock, + poll_table *wait); +static int MksckDgramConnect(struct socket *sock, + struct sockaddr *addr, + int addrLen, + int flags); +static int MksckMMap(struct file *file, + struct socket *sock, + struct vm_area_struct *vma); + +static void MksckPageRelease(MksckPage *mksckPage); + +static struct proto_ops mksckDgramOps = { + .family = AF_MKSCK, + .owner = THIS_MODULE, + .release = MksckRelease, + .bind = MksckBind, + .connect = MksckDgramConnect, + 
.socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = MksckGetName, + .poll = MksckPoll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, /* MksckShutdown, */ + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = MksckDgramSendMsg, + .recvmsg = MksckDgramRecvMsg, + .mmap = MksckMMap, + .sendpage = sock_no_sendpage, +}; + + +/** + * @brief Initialize the MKSCK protocol + * + * @return 0 on success, -errno on failure + */ +int +Mksck_Init(void) +{ + int err; + + spin_lock_init(&mksckPageListLock); + + /* + * Create a slab to allocate socket structs from. + */ + err = proto_register(&mksckProto, 1); + if (err != 0) { + printk(KERN_INFO + "Mksck_Init: Cannot register MKSCK protocol, errno = %d.\n", err); + return err; + } + + /* + * Register the socket family + */ + err = sock_register(&mksckFamilyOps); + if (err < 0) { + printk(KERN_INFO + "Mksck_Init: Could not register address family AF_MKSCK" + " (errno = %d).\n", err); + return err; + } + + return 0; +} + + +/** + * @brief De-register the MKSCK protocol + */ +void +Mksck_Exit(void) +{ + sock_unregister(mksckFamilyOps.family); + proto_unregister(&mksckProto); +} + + +/** + * @brief Create a new MKSCK socket + * + * @param net network namespace (2.6.24 or above) + * @param sock user socket structure + * @param protocol protocol to be used + * @param kern called from kernel mode + * + * @return 0 on success, -errno on failure + */ +static int +MksckCreate(struct net *net, + struct socket *sock, + int protocol, + int kern) +{ + struct sock *sk; + uid_t currentUid = current_euid(); + + if (!(currentUid == 0 || + currentUid == Mvpkm_vmwareUid)) { + printk(KERN_WARNING + "MksckCreate: rejected from process %s tgid=%d, pid=%d euid:%d.\n", + current->comm, + task_tgid_vnr(current), + task_pid_vnr(current), + currentUid); + return -EPERM; + } + + if (!sock) { + return -EINVAL; + } + + if (protocol) { + return -EPROTONOSUPPORT; + } + + 
switch (sock->type) { + case SOCK_DGRAM: { + sock->ops = &mksckDgramOps; + break; + } + default: { + return -ESOCKTNOSUPPORT; + } + } + + sock->state = SS_UNCONNECTED; + + /* + * Most recently (in 2.6.24), sk_alloc() was changed to expect the + * network namespace, and the option to zero the sock was dropped. + */ + sk = sk_alloc(net, mksckFamilyOps.family, GFP_KERNEL, &mksckProto); + + if (!sk) { + return -ENOMEM; + } + + sock_init_data(sock, sk); + + sk->sk_type = SOCK_DGRAM; + sk->sk_destruct = MksckSkDestruct; + sk->sk_backlog_rcv = MksckBacklogRcv; + + /* + * On socket lock... + * + * A bound socket will have an associated private area, the Mksck + * structure part of MksckPage. That area is pointed to by + * sk->sk_protinfo. In addition, a connected socket will have the + * peer field in its associated area set to point to the associated + * private area of the peer socket. A mechanism is needed to ensure + * that these private areas area not freed while they are being + * accessed within the scope of a function. A simple lock would not + * suffice as the interface functions (like MksckDgramRecvMsg()) + * may block. Hence a reference count mechanism is employed. When + * the mentioned references (sk->sk_protinfo and mksck->peer) to + * the respective private areas are set a refcount is incremented, + * and decremented when the references are deleted. + * + * The refcounts of areas pointed to by sk->sk_protinfo and + * mksck->peer will be decremented under the lock of the socket. + * Hence these private areas cannot disappear as long as the socket + * lock is held. 
+ * + * The interface functions will have one of the following + * structures: + * + * simpleFn(sk) + * { + * lock_sock(sk); + * if ((mksck = sk->sk_protinfo)) { + * + * } + * release_sock(sk); + * } + * + * complexFn(sk) + * { + * lock_sock(sk); + * if ((mksck = sk->sk_protinfo)) { + * IncRefc(mksck); + * } + * release_sock(sk); + * + * if (mksck) { + * + * DecRefc(mksck); + * } + * } + */ + sk->sk_protinfo = NULL; + sock_reset_flag(sk, SOCK_DONE); + + return 0; +} + + +/** + * @brief Delete a MKSCK socket + * + * @param sock user socket structure + * + * @return 0 on success, -errno on failure + */ +static int +MksckRelease(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + lock_sock(sk); + sock_orphan(sk); + release_sock(sk); + sock_put(sk); + } + + sock->sk = NULL; + sock->state = SS_FREE; + + return 0; +} + + +static int +MksckBacklogRcv(struct sock *sk, struct sk_buff *skb) +{ + /* + * We should never get these as we never queue an skb. + */ + printk("MksckBacklogRcv: should never get here\n"); + return -EIO; +} + + +/** + * @brief Callback at socket destruction + * + * @param sk pointer to kernel socket structure + */ +static void +MksckSkDestruct(struct sock *sk) +{ + Mksck *mksck; + + lock_sock(sk); + mksck = sk->sk_protinfo; + + if (mksck != NULL) { + sk->sk_protinfo = NULL; + Mksck_CloseCommon(mksck); + } + + if (sk->sk_user_data != NULL) { + sock_kfree_s(sk, sk->sk_user_data, sizeof(int)); + sk->sk_user_data = NULL; + } + + release_sock(sk); +} + + +/** + * @brief Set the local address of a MKSCK socket + * + * @param sk kernel socket structure + * @param addr the new address of the socket + * + * @return 0 on success, -errno on failure + * + * If addr.port is undefined a new random port is assigned. + * If addr.vmId is undefined then the vmId computed from the tgid is used. + * Hence the vmId of a socket does not determine the host all the time. + * + * Assumed that the socket is locked. 
 * This function is called by explicit set (MksckBind) and implicit (Send).
 */
static int
MksckBindGeneric(struct sock *sk,
                 Mksck_Address addr)
{
   int err;
   Mksck *mksck;
   MksckPage *mksckPage;

   /* Already bound: sk_protinfo holds the shared-page Mksck area. */
   if (sk->sk_protinfo != NULL) {
      return -EISCONN;
   }

   /*
    * Locate the page for the given host and increment its reference
    * count so it can't get freed off while we are working on it.
    */
   if (addr.vmId == MKSCK_VMID_UNDEF) {
      mksckPage = MksckPage_GetFromTgidIncRefc();
   } else {
      printk(KERN_WARNING "MksckBind: host bind called on vmid 0x%X\n", addr.vmId);
      mksckPage = MksckPage_GetFromVmIdIncRefc(addr.vmId);
   }

   if (mksckPage == NULL) {
      printk(KERN_INFO "MksckBind: no mksckPage for vm 0x%X\n", addr.vmId);
      return -ENETUNREACH;
   }
   addr.vmId = mksckPage->vmId;

   /*
    * Before we can find an unused socket port on the page we have to
    * lock the page for exclusive access so another thread can't
    * allocate the same port.
    */
   err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
   if (err < 0) {
      goto outDec;
   }

   /* MKSCK_PORT_UNDEF requests any free port; a concrete port asks for it. */
   addr.port = MksckPage_GetFreePort(mksckPage, addr.port);
   if (addr.port == MKSCK_PORT_UNDEF) {
      err = -EINVAL;
      goto outUnlockDec;
   }

   /*
    * At this point we have the mksckPage locked for exclusive access
    * and its reference count incremented. Also, addr is completely
    * filled in with vmId and port that we want to bind.
    *
    * Find an available mksck struct on the shared page and initialize
    * it.
    */
   mksck = MksckPage_AllocSocket(mksckPage, addr);
   if (mksck == NULL) {
      err = -EMFILE;
      goto outUnlockDec;
   }

   /*
    * Stable, release mutex. Leave mksckPage->refCount incremented so
    * mksckPage can't be freed until socket is closed.
    */
   Mutex_Unlock(&mksckPage->mutex, MutexModeEX);

   /*
    * This is why we start mksck->refCount at 1. When sk_protinfo gets
    * cleared, we decrement mksck->refCount.
    */
   sk->sk_protinfo = mksck;

   PRINTK(KERN_DEBUG "MksckBind: socket bound to %08X\n", mksck->addr.addr);

   return 0;

outUnlockDec:
   Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
outDec:
   MksckPage_DecRefc(mksckPage);
   return err;
}


/**
 * @brief Test if the socket is already bound to a local address and,
 *        if not, bind it to an unused address.
 *
 * @param sk kernel socket structure
 * @return 0 on success, -errno on failure
 *
 * Assumed that the socket is locked.
 */
static inline int
MksckTryBind(struct sock *sk)
{
   int err = 0;

   if (!sk->sk_protinfo) {
      /* Fully-undefined address: vmId and port are both picked for us. */
      static const Mksck_Address addr = { .addr = MKSCK_ADDR_UNDEF };
      err = MksckBindGeneric(sk, addr);
   }
   return err;
}



/**
 * @brief Set the address of a MKSCK socket (user call)
 *
 * @param sock user socket structure
 * @param addr the new address of the socket
 * @param addrLen length of the address
 *
 * @return 0 on success, -errno on failure
 */
static int
MksckBind(struct socket *sock,
          struct sockaddr *addr,
          int addrLen)
{
   int err;
   struct sock *sk = sock->sk;
   struct sockaddr_mk *addrMk = (struct sockaddr_mk *)addr;

   if (addrLen != sizeof *addrMk) {
      return -EINVAL;
   }
   if (addrMk->mk_family != AF_MKSCK) {
      return -EAFNOSUPPORT;
   }

   /*
    * Obtain the socket lock and call the generic Bind function.
    */
   lock_sock(sk);
   err = MksckBindGeneric(sk, addrMk->mk_addr);
   release_sock(sk);

   return err;
}

/**
 * @brief Lock the peer socket by locating it, incrementing its refc
 * @param addr the address of the peer socket
 * @param[out] peerMksckR set to the locked peer socket pointer
 *             upon successful lookup
 * @return 0 on success, -errno on failure
 *
 * On success the peer's refCount has been incremented; the caller owns
 * one reference and must eventually Mksck_DecRefc() it.
 */
static int
LockPeer(Mksck_Address addr, Mksck **peerMksckR)
{
   int err = 0;
   MksckPage *peerMksckPage = MksckPage_GetFromVmIdIncRefc(addr.vmId);
   Mksck *peerMksck;

   /*
    * Find corresponding destination shared page and increment its
    * reference count so it can't be freed while we are sending to the
    * socket. Make sure that the address is indeed an address of a
    * monitor/guest socket.
    */
   if (peerMksckPage == NULL) {
      printk(KERN_INFO "LockPeer: vmId %x is not in use!\n", addr.vmId);
      return -ENETUNREACH;
   }
   if (!peerMksckPage->isGuest) {
      MksckPage_DecRefc(peerMksckPage);
      printk(KERN_INFO "LockPeer: vmId %x does not belong to a guest!\n",
             addr.vmId);
      return -ENETUNREACH;
   }


   err = Mutex_Lock(&peerMksckPage->mutex, MutexModeSH);
   if (err < 0) {
      MksckPage_DecRefc(peerMksckPage);
      return err;
   }

   /*
    * Find corresponding destination socket on that shared page and
    * increment its reference count so it can't be freed while we are
    * trying to send to it.
    */
   peerMksck = MksckPage_GetFromAddr(peerMksckPage, addr);

   if (peerMksck) {
      ATOMIC_ADDV(peerMksck->refCount, 1);
      *peerMksckR = peerMksck;
   } else {
      printk(KERN_INFO "LockPeer: addr %x is not a defined socket!\n",
             addr.addr);
      err = -ENETUNREACH;
   }

   /* Page reference is only needed for the lookup; the socket now holds
    * its own refCount (if found). */
   Mutex_Unlock(&peerMksckPage->mutex, MutexModeSH);
   MksckPage_DecRefc(peerMksckPage);
   return err;
}

/**
 * @brief Set the peer address of a MKSCK socket
 *
 * @param sock user socket structure
 * @param addr the new address of the socket
 * @param addrLen length of the address
 * @param flags flags
 *
 * @return 0 on success, -errno on failure
 */
static int
MksckDgramConnect(struct socket *sock,
                  struct sockaddr *addr,
                  int addrLen,
                  int flags)
{
   struct sock *sk = sock->sk;
   Mksck *mksck;
   struct sockaddr_mk *peerAddrMk = (struct sockaddr_mk *)addr;
   int err = 0;

   if (addrLen != sizeof *peerAddrMk) {
      printk(KERN_INFO "MksckConnect: wrong address length!\n");
      return -EINVAL;
   }
   if (peerAddrMk->mk_family != AF_MKSCK) {
      printk(KERN_INFO "MksckConnect: wrong address family!\n");
      return -EAFNOSUPPORT;
   }

   lock_sock(sk);

   /* connect() implies bind() if not already bound. */
   if ((err = MksckTryBind(sk))) {
      goto releaseSock;
   }
   mksck = sk->sk_protinfo;

   /*
    * First sever any past peer connections
    */
   Mksck_DisconnectPeer(mksck);
   sock->state = SS_UNCONNECTED;

   /*
    * Then build new connections ...
    *
    * NOTE(review): state is set to SS_CONNECTED before LockPeer() is
    * known to succeed; on LockPeer failure the socket is left marked
    * connected with mksck->peer unset — confirm intended.
    */
   if (peerAddrMk->mk_addr.addr != MKSCK_ADDR_UNDEF) {
      sock->state = SS_CONNECTED;
      mksck->peerAddr = peerAddrMk->mk_addr;
      err = LockPeer(mksck->peerAddr, &mksck->peer);
      PRINTK(KERN_DEBUG "MksckConnect: socket %x is connected to %x!\n",
             mksck->addr.addr, mksck->peerAddr.addr);
   }

releaseSock:
   release_sock(sk);

   return err;
}


/**
 * @brief returns the address of a MKSCK socket/peer address
 *
 * @param sock user socket structure
 * @param addr the new address of the socket
 * @param addrLen length of the address
 * @param peer 1 if the peer address is sought
 *
 * @return 0 on success, -errno on failure
 */
static int
MksckGetName(struct socket *sock,
             struct sockaddr *addr,
             int *addrLen,
             int peer)
{
   int err;
   Mksck *mksck;
   struct sock *sk = sock->sk;

   // MAX_SOCK_ADDR is size of *addr, Linux doesn't export it!
   // ASSERT_ON_COMPILE(sizeof (struct sockaddr_mk) <= MAX_SOCK_ADDR);

   lock_sock(sk);
   mksck = sk->sk_protinfo;

   if (mksck == NULL) {
      /* Unbound socket: own name is "undefined", peer name is an error. */
      if (peer) {
         err = -ENOTCONN;
      } else {
         ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
         ((struct sockaddr_mk *)addr)->mk_addr.addr = MKSCK_ADDR_UNDEF;
         *addrLen = sizeof (struct sockaddr_mk);
         err = 0;
      }
   } else if (!peer) {
      ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
      ((struct sockaddr_mk *)addr)->mk_addr = mksck->addr;
      *addrLen = sizeof (struct sockaddr_mk);
      err = 0;
   } else if (mksck->peerAddr.addr == MKSCK_ADDR_UNDEF) {
      err = -ENOTCONN;
   } else {
      ((struct sockaddr_mk *)addr)->mk_family = AF_MKSCK;
      ((struct sockaddr_mk *)addr)->mk_addr = mksck->peerAddr;
      *addrLen = sizeof (struct sockaddr_mk);
      err = 0;
   }

   release_sock(sk);

   return err;
}


/**
 * @brief VMX polling a receipted packet from VMM.
 *
 * @param filp kernel file pointer to poll for
 * @param sock user socket structure
 * @param wait kernel polling table where to poll if not null
 *
 * @return poll mask state given from socket state.
+ */ +static unsigned int MksckPoll(struct file *filp, + struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + Mksck *mksck = NULL; + uint32 read; + int err; + + lock_sock(sk); + if ((err = MksckTryBind(sk))) { + release_sock(sk); + return err; + } + mksck = sk->sk_protinfo; + + /* + * To avoid mksck disappearing right after the release_sock the + * refcount needs to be incremented. For more details read the + * block comment on locking in MksckCreate. + */ + ATOMIC_ADDV(mksck->refCount, 1); + release_sock(sk); + + /* + * Wait to make sure this is the only thread trying to access socket. + */ + if ((err = Mutex_Lock(&mksck->mutex, MutexModeEX)) < 0) { + /* we might get in this situation if we are signaled + (select() may handle this, so leave) */ + PRINTK(KERN_INFO "MksckPoll: try to abort\n"); + return mask; + } + + /* + * See if packet in ring. + */ + read = mksck->read; + if (read != mksck->write) { + mask |= POLLIN | POLLRDNORM; /* readable, socket is unlocked */ + /* Note that if we are implementing support for POLLOUT, we SHOULD + change this Mutex_Unlock by Mutex_UnlPoll, because there is no + obvious knowledge about the sleepy reason that is intended by user */ + Mutex_Unlock(&mksck->mutex, MutexModeEX); + } else { + Mutex_UnlPoll(&mksck->mutex, MutexModeEX, MKSCK_CVAR_FILL, filp, wait); + } + + /* + * Note that locking rules differ a little inside MksckPoll, since we are + * not only given a pointer to the struct socket but also a pointer to a + * struct file. This means that during the whole operation of this function + * and during any pending wait (registered with poll_wait()), the file itself + * is reference counted up, and we should rely on that 'upper' reference + * counting to prevent from tearing the Mksck down. That holds true since one + * never re-bind sockets ! + */ + Mksck_DecRefc(mksck); + return mask; +} + +/** + * @brief Manage a set of Mksck_PageDesc from a message or a stored array. 
 *
 * @param pd set of Mksck_PageDesc
 * @param pages Mksck_PageDesc pages count for this management operation
 * @param incr ternary used to indicate if we want to reference (+1), or
 *             dereference (-1), or count (0) 4k pages
 *
 * @return length of bytes processed.
 */
static size_t
MksckPageDescManage(Mksck_PageDesc *pd,
                    uint32 pages,
                    int incr)
{
   size_t payloadLen = 0;
   uint32 i;

   /* An INVALID_MPN entry terminates the descriptor array early. */
   for (i = 0; i < pages && pd[i].mpn != INVALID_MPN; ++i) {
      uint32 j;

      /* Each descriptor covers 2^order contiguous machine pages. */
      for (j = 0; j < 1 << pd[i].order; ++j) {
         struct page *page;
         MPN currMPN = pd[i].mpn + j;

         /*
          * The monitor tried to send an invalid MPN, bad.
          * NOTE(review): an invalid MPN is only logged; payloadLen is
          * still advanced for it — confirm that is intended.
          */
         if (!pfn_valid(currMPN)) {
            printk("MksckPageDescManage: Invalid MPN %x\n", currMPN);
         } else {
            page = pfn_to_page(currMPN);

            if (incr == +1) {
               get_page(page);
            }
            if (incr == -1) {
               put_page(page);
            }
            /* incr == 0 (MANAGE_COUNT): only accumulate payloadLen. */
         }

         payloadLen += PAGE_SIZE;
      }
   }

   return payloadLen;
}

/**
 * @brief Management values to be used as third parameter of MksckPageDescManage
 */
#define MANAGE_INCREMENT +1
#define MANAGE_DECREMENT -1
#define MANAGE_COUNT 0


/**
 * @brief Map a set of Mksck_PageDesc from a message or a stored array.
 *
 * @param pd set of Mksck_PageDesc
 * @param pages pages count for this mapping
 * @param iov vectored user virtual addresses of the recv commands
 * @param iovCount size for iov parameter
 * @param vma virtual memory area used for the mapping, note that
 *        this is mandatorily required MksckPageDescMap is used
 *        on an indirect PageDesc context (i.e whenever iov is
 *        not computed by the kernel but by ourselves).
 *
 * Since find_vma() and vm_insert_page() are used, this function must
 * be called with current's mmap_sem locked, or inside an MMap operation.
 *
 * @return length of bytes mapped.
 */
static size_t
MksckPageDescMap(Mksck_PageDesc *pd,
                 uint32 pages,
                 struct iovec *iov,
                 int iovCount,
                 struct vm_area_struct *vma)
{
   size_t payloadLen = 0;
   uint32 i;

   /* An INVALID_MPN entry terminates the descriptor array early. */
   for (i = 0; i < pages && pd[i].mpn != INVALID_MPN; ++i) {
      uint32 j;

      for (j = 0; j < 1 << pd[i].order; ++j) {
         HUVA huva = 0;
         struct page *page;
         MPN currMPN = pd[i].mpn + j;

         /* Skip exhausted iovec entries. */
         while (iovCount > 0 && iov->iov_len == 0) {
            iovCount--;
            iov++;
         }

         if (iovCount == 0) {
            printk("MksckPageDescMap: Invalid iov length\n");
            goto map_done;
         }

         huva = (HUVA)iov->iov_base;

         /*
          * iovecs for receiving the typed component of the message should
          * have page aligned base and size sufficient for page descriptor's
          * mappings.
          */
         if (huva & (PAGE_SIZE - 1) || iov->iov_len < PAGE_SIZE) {
            printk("MksckPageDescMap: Invalid huva %x or iov_len %d\n",
                   huva,
                   iov->iov_len);
            goto map_done;
         }

         /*
          * Might be in a new vma...
          */
         if (vma == NULL || huva < vma->vm_start || huva >= vma->vm_end) {
            vma = find_vma(current->mm, huva);

            /*
             * Couldn't find a matching vma for huva, or the vma is not
             * one of ours (vm_ops identifies Mksck receive windows).
             */
            if (vma == NULL ||
                huva < vma->vm_start ||
                vma->vm_ops != &mksckVMOps) {
               printk("MksckPageDescMap: Invalid vma\n");
               goto map_done;
            }
         }

         /*
          * The monitor tried to send an invalid MPN, bad.
          * NOTE(review): the iovec is NOT advanced for an invalid MPN but
          * payloadLen still is — confirm intended.
          */
         if (!pfn_valid(currMPN)) {
            printk("MksckPageDescMap: Invalid MPN %x\n", currMPN);
         } else {
            int rc;

            page = pfn_to_page(currMPN);

            /*
             * Map into the receive window.
             */
            rc = vm_insert_page(vma, huva, page);
            if (rc) {
               printk("MksckPageDescMap: Failed to insert %x at %x, error %d\n",
                      currMPN,
                      huva,
                      rc);
               goto map_done;
            }

            ASSERT(iov->iov_len >= PAGE_SIZE);
            iov->iov_base += PAGE_SIZE;
            iov->iov_len -= PAGE_SIZE;
         }

         payloadLen += PAGE_SIZE;
      }
   }

map_done:
   return payloadLen;
}


/**
 * @brief Check if the provided MsgHdr has still room for a receive operation.
 *
 * @param msg user buffer
 * @return 1 if MsgHdr has IO space room in order to receive a mapping, 0 otherwise.
 */
static int
MsgHdrHasAvailableRoom(struct msghdr *msg)
{
   struct iovec *vec = msg->msg_iov;
   uint32 count = msg->msg_iovlen;

   /* Skip fully-consumed iovec entries; any remaining non-empty entry
    * means there is still room. */
   while (count > 0 && vec->iov_len == 0) {
      count--;
      vec++;
   }

   return (count != 0);
}


/**
 * Whenever a typed message is received from the monitor, we may choose to store
 * all the page descriptor content in a linked state of descriptors, through the
 * following information context
 */
typedef struct MksckPageDescInfo {
   struct MksckPageDescInfo *next; // singly-linked list of accumulated batches
   uint32 flags;                   // vm prot bits allowed at mmap time
   uint32 pages;                   // number of entries in descs[]
   uint32 mapCounts;               // remaining allowed mappings (0 = unlimited)
   Mksck_PageDesc descs[0];        // trailing variable-length descriptor array
} MksckPageDescInfo;

static void MksckPageDescSkDestruct(struct sock *sk);
static int MksckPageDescMMap(struct file *file,
                             struct socket *sock,
                             struct vm_area_struct *vma);
static int MksckPageDescIoctl(struct socket *sock,
                              unsigned int cmd,
                              unsigned long arg);

/**
 * @brief Delete a page descriptor container socket
 *
 * @param sock user socket structure
 * @return 0 on success, -errno on failure
 */
static int
MksckPageDescRelease(struct socket *sock)
{
   /* This is generic socket release */
   struct sock *sk = sock->sk;

   if (sk) {
      lock_sock(sk);
      sock_orphan(sk);
      release_sock(sk);
      sock_put(sk);
   }

   sock->sk = NULL;
   sock->state = SS_FREE;

   return 0;
}


/**
 * Whenever a typed message is received from the monitor, we may choose to store
 * all the page descriptor content for a future mapping. One shall put a context
 * usable by host userland, that means trough a file descriptor, and as a secure
 * implementation we choose to define a strict set of operations that are used
 * only for that purpose. This set of operation is reduced to leaving the
 * default "PageDesc(s) accumulating" mode (inside ioctl), mapping the context,
 * and generic socket destruction.
 */
static struct proto_ops mksckPageDescOps = {
   .family = AF_MKSCK,
   .owner = THIS_MODULE,
   .release = MksckPageDescRelease,
   .bind = sock_no_bind,
   .connect = sock_no_connect,
   .socketpair = sock_no_socketpair,
   .accept = sock_no_accept,
   .getname = sock_no_getname,
   .poll = sock_no_poll,
   .ioctl = MksckPageDescIoctl,
   .listen = sock_no_listen,
   .shutdown = sock_no_shutdown,
   .setsockopt = sock_no_setsockopt,
   .getsockopt = sock_no_getsockopt,
   .sendmsg = sock_no_sendmsg,
   .recvmsg = sock_no_recvmsg,
   .mmap = MksckPageDescMMap,
   .sendpage = sock_no_sendpage,
};


/**
 * @brief Create or accumulate to a PageDesc context, backed as a descriptor.
 *
 * @param sock user socket structure
 * @param msg user buffer to receive the file descriptor as ancillary data
 * @param pd source descriptor part of a message
 * @param pages pages count for this mapping
 *
 * @return error if negative, 0 otherwise
 *
 */
static int
MksckPageDescToFd(struct socket *sock,
                  struct msghdr *msg,
                  Mksck_PageDesc *pd,
                  uint32 pages)
{
   int retval;
   int newfd;
   struct socket *newsock;
   struct sock *newsk;
   struct sock *sk = sock->sk;
   MksckPageDescInfo **pmpdi, *mpdi;
   lock_sock(sk);

   /*
    * Relation between any mk socket and the PageDesc context is as follow:
    *
    * From the mk socket to the PageDesc context:
    * - sk->sk_user_data is a WEAK LINK, containing only a file descriptor
    *   numerical value such that accumulating is keyed on it.
    *
    * From the PageDesc context to the mk socket:
    * - sk->sk_protinfo contains a MksckPageDescInfo struct.
    * - sk->sk_user_data is a pointer REF-COUNTED sock_hold() LINK, also it is
    *   rarely dereferenced but usually used to check that the
    *   right socket pair is used. Full dereferencing is used
    *   only to break the described links.
    */
   if (sk->sk_user_data) {
      MksckPageDescInfo *mpdi2;

      /* continue any previous on-going mapping, i.e accumulate */
      newfd = *((int *)sk->sk_user_data);
      newsock = sockfd_lookup(newfd, &retval); // promote the weak link
      if (!newsock) {
         retval = -EINVAL;
         goto endProcessingReleaseSock;
      }

      newsk = newsock->sk;
      lock_sock(newsk);
      sockfd_put(newsock);

      if (((struct sock *)newsk->sk_user_data) != sk) {
         /* One way of going into this situation would be for userland to dup
            the file descriptor just received, close the original number, and
            open a new mk socket in the very same spot. The userland code have
            a lot of way of interacting with the kernel without this driver
            code to be notified. */
         retval = -EINVAL;
         release_sock(newsk);
         goto endProcessingReleaseSock;
      }

      /*
       * NOTE(review): sock_kmalloc() returns NULL on failure, not an
       * ERR_PTR, so the IS_ERR() checks below likely never fire —
       * verify against the target kernel version.
       */
      mpdi = sock_kmalloc(newsk, sizeof(MksckPageDescInfo) +
                          pages*sizeof(Mksck_PageDesc), GFP_KERNEL);
      if (IS_ERR(mpdi)) {
         retval = PTR_ERR(mpdi);
         release_sock(newsk);
         goto endProcessingReleaseSock;
      }

      /* There is no mandatory needs for us to notify userland from
         the progress in "appending" to the file descriptor, but it
         would feel strange if the userland would have no mean to
         tell if the received message was just not thrown away. So, in
         order to be consistent one fill the ancillary message while
         "creating" and "appending to" file descriptors. */
      retval = put_cmsg(msg, SOL_DECNET, 0, sizeof(int), &newfd);
      if (retval < 0) {
         goto endProcessingKFreeReleaseSock;
      }

      release_sock(sk);

      /* Append to the tail of the accumulated batch list. */
      mpdi2 = (MksckPageDescInfo *)newsk->sk_protinfo;
      while (mpdi2->next) {
         mpdi2 = mpdi2->next;
      }
      pmpdi = &(mpdi2->next);

   } else {
      /* Create a new socket, new context and a new file descriptor.
       */
      retval = sock_create(sk->sk_family, sock->type, 0, &newsock);
      if (retval < 0) {
         goto endProcessingReleaseSock;
      }

      newsk = newsock->sk;
      lock_sock(newsk);
      newsk->sk_destruct = &MksckPageDescSkDestruct;
      newsk->sk_user_data = sk;
      sock_hold(sk); // keeps a reference to parent mk socket
      newsock->ops = &mksckPageDescOps;

      mpdi = sock_kmalloc(newsk, sizeof(MksckPageDescInfo) +
                          pages*sizeof(Mksck_PageDesc), GFP_KERNEL);
      if (IS_ERR(mpdi)) {
         retval = PTR_ERR(mpdi);
         goto endProcessingFreeNewSock;
      }

      sk->sk_user_data = sock_kmalloc(sk, sizeof(int), GFP_KERNEL);
      if (IS_ERR(sk->sk_user_data)) {
         retval = PTR_ERR(sk->sk_user_data);
         sk->sk_user_data = NULL;
         goto endProcessingKFreeAndNewSock;
      }

      /* mapping to a file descriptor may fail if a thread is closing
         in parallel of sock_map_fd/sock_alloc_fd, or kernel memory is full */
      newfd = sock_map_fd(newsock, O_CLOEXEC);
      if (newfd < 0) {
         retval = newfd;
         sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
         sk->sk_user_data = NULL;
         goto endProcessingKFreeAndNewSock;
      }

      /* notify userland from a new file descriptor, alike AF_UNIX ancillary */
      retval = put_cmsg(msg, SOL_DECNET, 0, sizeof(int), &newfd);
      if (retval < 0) {
         sock_kfree_s(sk, sk->sk_user_data, sizeof(int));
         sk->sk_user_data = NULL;
         sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
                      mpdi->pages*sizeof(Mksck_PageDesc));
         release_sock(newsk);
         sockfd_put(newsock);
         sock_release(newsock);
         put_unused_fd(newfd);
         goto endProcessingReleaseSock;
      }

      *(int*)sk->sk_user_data = newfd;
      release_sock(sk);
      pmpdi = (MksckPageDescInfo **)(&(newsk->sk_protinfo));
   }

   /*
    * NOTE(review): mpdi->pages is only initialized here, AFTER the error
    * labels below may already have used it as the sock_kfree_s size —
    * the error paths free with an uninitialized size. Verify.
    */
   mpdi->next = NULL;
   mpdi->flags = 0;
   mpdi->mapCounts = 0;
   mpdi->pages = pages;
   memcpy(mpdi->descs, pd, pages*sizeof(Mksck_PageDesc));

   *pmpdi = mpdi; // link
   release_sock(newsk);

   /* increment all reference counters for the pages */
   MksckPageDescManage(pd, pages, MANAGE_INCREMENT);
   return 0;


endProcessingKFreeAndNewSock:
   sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
                mpdi->pages*sizeof(Mksck_PageDesc));
endProcessingFreeNewSock:
   release_sock(newsk);
   sock_release(newsock);
   release_sock(sk);
   return retval;

endProcessingKFreeReleaseSock:
   sock_kfree_s(newsk, mpdi, sizeof(MksckPageDescInfo) +
                mpdi->pages*sizeof(Mksck_PageDesc));
   release_sock(newsk);
endProcessingReleaseSock:
   release_sock(sk);
   return retval;
}

/**
 * @brief Callback at socket destruction
 *
 * @param sk pointer to kernel socket structure
 */
static void
MksckPageDescSkDestruct(struct sock *sk)
{
   struct sock *mkSk = NULL;
   MksckPageDescInfo *mpdi;
   lock_sock(sk);
   mpdi = sk->sk_protinfo;
   /* Drop page references and free every accumulated batch. */
   while (mpdi) {
      MksckPageDescInfo *next = mpdi->next;
      MksckPageDescManage(mpdi->descs, mpdi->pages,
                          MANAGE_DECREMENT);
      sock_kfree_s(sk, mpdi, sizeof(MksckPageDescInfo) +
                   mpdi->pages*sizeof(Mksck_PageDesc));
      mpdi = next;
   }
   if (sk->sk_user_data) {
      mkSk = (struct sock *)sk->sk_user_data;
      sk->sk_user_data = NULL;
   }
   sk->sk_protinfo = NULL;
   release_sock(sk);
   /* clean the monki socket that we are holding */
   if (mkSk) {
      lock_sock(mkSk);
      sock_kfree_s(mkSk, mkSk->sk_user_data, sizeof(int));
      mkSk->sk_user_data = NULL;
      release_sock(mkSk);
      sock_put(mkSk); // revert of sock_hold()
   }
}

/**
 * @brief The mmap operation of the PageDesc context file descriptor.
 *
 * The mmap command is used to mmap any detached (i.e. no more accumulating)
 * PageDesc context, full of the content from its parent communication mk
 * socket. Mapping may be done a specified number of times, so that the
 * PageDesc context could become useless (as a security restriction).
 *
 * Also note that mapping from an offset different from zero is considered
 * as a userland invalid operation.
+ * + * @param file user file structure + * @param sock user socket structure + * @param vma virtual memory area structure + * + * @return error code, 0 on success + */ +static int +MksckPageDescMMap(struct file *file, + struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + MksckPageDescInfo *mpdi; + struct iovec iov; + unsigned long vm_flags; + int freed = 0; + + iov.iov_base = (void*)vma->vm_start; + iov.iov_len = vma->vm_end - vma->vm_start; + + lock_sock(sk); + mpdi = sk->sk_protinfo; + + // vma->vm_pgoff is checked, since offsetting the map is not supported + if (!mpdi || sk->sk_user_data || vma->vm_pgoff) { + release_sock(sk); + printk(KERN_INFO "MMAP failed for virt %lx size %lx\n", + vma->vm_start, vma->vm_end - vma->vm_start); + return -EINVAL; + } + + vm_flags = mpdi->flags; + if ((vma->vm_flags & ~vm_flags) & (VM_READ|VM_WRITE)) { + release_sock(sk); + return -EACCES; + } + + while (mpdi) { + MksckPageDescInfo *next = mpdi->next; + MksckPageDescMap(mpdi->descs, mpdi->pages, &iov, 1, vma); + if (mpdi->mapCounts && !--mpdi->mapCounts) { + MksckPageDescManage(mpdi->descs, mpdi->pages, + MANAGE_DECREMENT); + sock_kfree_s(sk, mpdi, sizeof(MksckPageDescInfo) + + mpdi->pages*sizeof(Mksck_PageDesc)); + freed = 1; + } + mpdi = next; + } + + if (freed) { + sk->sk_protinfo = NULL; + } + vma->vm_ops = &mksckVMOps; + release_sock(sk); + return 0; +} + +/** + * @brief The ioctl operation of the PageDesc context file descriptor. + * + * The ioctl MKSCK_DETACH command is used to detach the PageDesc context + * from its parent communication mk socket. Once done, the context + * is able to remap the transferred PageDesc(s) of typed messages accumulated + * into the context. 
+ * + * @param sock user socket structure + * @param cmd select which cmd function needs to be performed + * @param arg argument for command + * + * @return error code, 0 on success + */ +static int +MksckPageDescIoctl(struct socket *sock, + unsigned int cmd, + unsigned long arg) +{ + struct sock *monkiSk = NULL; + struct sock *sk = sock->sk; + MksckPageDescInfo *mpdi; + int retval = 0; + + switch (cmd) { + /** + * ioctl MKSCK_DETACH (in and out): + * Detach, compute size and define allowed protection access rights + * + * [in]: unsigned long flags, similar to prot argument of mmap() + * unsigned long number of available further mappings + * with 0 meaning unlimited number of mappings + * [out]: unsigned long size of the available mappable area + */ + case MKSCK_DETACH: { + unsigned long ul[2]; + lock_sock(sk); + mpdi = sk->sk_protinfo; + // read unsigned long argument that contains the mmap alike flags + if (copy_from_user(ul, (void *)arg, sizeof ul)) { + retval = -EFAULT; + // check that the file descriptor has a parent and some context there + } else if (!mpdi || !sk->sk_user_data) { + retval = -EINVAL; + } else { + /* compute mapping protection bits from argument and size of the + * mapping, that is also given back to userland as unsigned long. 
+ */ + uint32 flags = calc_vm_prot_bits(ul[0]); + ul[0] = 0; + while (mpdi) { + MksckPageDescInfo *next = mpdi->next; + ul[0] += MksckPageDescManage(mpdi->descs, mpdi->pages, + MANAGE_COUNT); + mpdi->mapCounts = ul[1]; + mpdi = next; + } + if (copy_to_user((void *)arg, ul, sizeof(ul[0]))) { + retval = -EFAULT; + } else { + mpdi = sk->sk_protinfo; + mpdi->flags = flags; + monkiSk = (struct sock *)sk->sk_user_data; + sk->sk_user_data = NULL; + } + } + release_sock(sk); + // clean the monki socket that we are holding + if ((sk = monkiSk)) { + lock_sock(sk); + sock_kfree_s(sk, sk->sk_user_data, sizeof(int)); + sk->sk_user_data = NULL; + release_sock(sk); + sock_put(sk); + } + break; + } + default: { + retval = -EINVAL; + break; + } + } + return retval; +} + + +/** + * @brief VMX receiving a packet from VMM. + * + * @param kiocb kernel io control block (unused) + * @param sock user socket structure + * @param msg user buffer to receive the packet + * @param len size of the user buffer + * @param flags flags + * + * @return -errno on failure, else length of untyped portion + total number + * of bytes mapped for typed portion. + */ +static int +MksckDgramRecvMsg(struct kiocb *kiocb, + struct socket *sock, + struct msghdr *msg, + size_t len, + int flags) +{ + int err = 0; + struct sock *sk = sock->sk; + Mksck *mksck; + Mksck_Datagram *dg; + struct sockaddr_mk *fromAddr; + uint32 read; + struct iovec *iov; + size_t payloadLen, untypedLen; + uint32 iovCount; + + if (flags & MSG_OOB || flags & MSG_ERRQUEUE) { + return -EOPNOTSUPP; + } + + if ((msg->msg_name != NULL) && (msg->msg_namelen < sizeof *fromAddr)) { + return -EINVAL; + } + + lock_sock(sk); + if ((err = MksckTryBind(sk))) { + release_sock(sk); + return err; + } + mksck = sk->sk_protinfo; + + /* + * To avoid mksck disappearing right after the release_sock the + * refcount needs to be incremented. For more details read the + * block comment on locking in MksckCreate. 
+ */ + ATOMIC_ADDV(mksck->refCount, 1); + release_sock(sk); + + /* + * Get pointer to next packet in ring to be dequeued. + */ + while (1) { + + /* + * Wait to make sure this is the only thread trying to access socket. + */ + if ((err = Mutex_Lock(&mksck->mutex, MutexModeEX)) < 0) { + goto decRefc; + } + + /* + * See if packet in ring. + */ + read = mksck->read; + if (read != mksck->write) { + break; + } + + /* + * Nothing there, if user wants us not to block then just return EAGAIN. + */ + if (flags & MSG_DONTWAIT) { + Mutex_Unlock(&mksck->mutex, MutexModeEX); + err = -EAGAIN; + goto decRefc; + } + + /* + * Nothing there, unlock socket and wait for data. + */ + mksck->foundEmpty ++; + err = Mutex_UnlSleep(&mksck->mutex, MutexModeEX, MKSCK_CVAR_FILL); + if (err < 0) { + PRINTK(KERN_INFO "MksckDgramRecvMsg: aborted\n"); + goto decRefc; + } + } + + /* + * Point to packet in ring. + */ + dg = (void *)&mksck->buff[read]; + + /* + * Provide the address of the sender. + */ + if (msg->msg_name != NULL) { + fromAddr = (void *)msg->msg_name; + fromAddr->mk_addr = dg->fromAddr; + fromAddr->mk_family = AF_MKSCK; + msg->msg_namelen = sizeof *fromAddr; + } else { + msg->msg_namelen = 0; + } + + /* + * Copy data from ring buffer to caller's buffer and remove packet from + * ring buffer. + */ + iov = msg->msg_iov; + iovCount = msg->msg_iovlen; + payloadLen = untypedLen = + dg->len - dg->pages * sizeof(Mksck_PageDesc) - dg->pad; + + /* + * Handle the untyped portion of the message. + */ + if (untypedLen <= len) { + err = memcpy_toiovec(iov, + dg->data, + untypedLen); + if (err < 0) { + printk("MksckDgramRecvMsg: Failed to memcpy_to_iovec untyped message component " + "(buf len %d datagram len %d (untyped %d))\n", + len, + dg->len, + untypedLen); + } + } else { + err = -EINVAL; + } + + /* + * Map in the typed descriptor. 
+ */ + if (err >= 0 && dg->pages > 0) { + Mksck_PageDesc *pd = (Mksck_PageDesc *)(dg->data + untypedLen + dg->pad); + + /* + * There are 3 ways of receiving typed messages from the monitor. + * - The typed message is mapped directly into a VMA. To indicate this the + * userland sets msg_controllen == 0. + * - The typed message is mapped directly into a VMA and a file descriptor + * created for further mappings on the host (in same userland address + * space or an alternate userland address space). In this case + * msg_controllen should be set to sizeof(fd). + * - The typed message is not mapped directly into a VMA, but a file + * descriptor is created for later mapping on the host. In this case + * msg_controllen should be set to sizeof(fd) and the supplied iovec + * shall not specify a receive window. + * + * The conjuncts below decide on which of these 3 cases we've encountered. + */ + + if ((msg->msg_controllen <= 0) || + ((err = MksckPageDescToFd(sock, msg, pd, dg->pages)) != 0) || + (MsgHdrHasAvailableRoom(msg) != 0)) { + + down_write(¤t->mm->mmap_sem); // lock for a change of mapping + payloadLen += MksckPageDescMap(pd, dg->pages, iov, iovCount, NULL); + up_write(¤t->mm->mmap_sem); + } + } + + /* + * Now that packet is removed, it is safe to unlock socket so another thread + * can do a recv(). We also want to wake someone waiting for room to insert + * a new packet. + */ + if ((err >= 0) && Mksck_IncReadIndex(mksck, read, dg)) { + Mutex_UnlWake(&mksck->mutex, MutexModeEX, MKSCK_CVAR_ROOM, true); + } else { + Mutex_Unlock(&mksck->mutex, MutexModeEX); + } + + /* + * If memcpy error, return error status. + * Otherwise, return number of bytes copied. + */ + if (err >= 0) { + err = payloadLen; + } + +decRefc: + Mksck_DecRefc(mksck); + return err; +} + + +/** + * @brief VMX sending a packet to VMM. 
 *
 * @param kiocb kernel io control block
 * @param sock user socket structure
 * @param msg packet to be transmitted
 * @param len length of the packet
 *
 * @return length of the sent msg on success, -errno on failure
 */
static int
MksckDgramSendMsg(struct kiocb *kiocb,
                  struct socket *sock,
                  struct msghdr *msg,
                  size_t len)
{
   int err = 0;
   struct sock *sk = sock->sk;
   Mksck *peerMksck;
   Mksck_Datagram *dg;
   uint32 needed;
   uint32 write;
   Mksck_Address fromAddr;

   if (msg->msg_flags & MSG_OOB) {
      return -EOPNOTSUPP;
   }

   if (len > MKSCK_XFER_MAX) {
      return -EMSGSIZE;
   }

   /*
    * In the next locked section peerMksck pointer needs to be set and
    * its refcount needs to be incremented.
    */
   lock_sock(sk);
   do {
      Mksck *mksck;
      Mksck_Address peerAddr =
         { .addr = (msg->msg_name ?
                    ((struct sockaddr_mk *)msg->msg_name)->mk_addr.addr :
                    MKSCK_ADDR_UNDEF) };

      /* send() implies bind() if not already bound. */
      if ((err = MksckTryBind(sk))) {
         break;
      }
      mksck = sk->sk_protinfo;
      fromAddr = mksck->addr;

      /*
       * If the socket is connected, use that address (no sendto for
       * connected sockets). Otherwise, use the provided address if any.
       */
      if ((peerMksck = mksck->peer)) {
         if (peerAddr.addr != MKSCK_ADDR_UNDEF &&
             peerAddr.addr != mksck->peerAddr.addr) {
            err = -EISCONN;
            break;
         }
         /*
          * To avoid mksckPeer disappearing right after the
          * release_sock the refcount needs to be incremented. For
          * more details read the block comment on locking in
          * MksckCreate.
          */
         ATOMIC_ADDV(peerMksck->refCount, 1);
      } else if (peerAddr.addr == MKSCK_ADDR_UNDEF) {
         err = -ENOTCONN;
      } else {
         /*
          * LockPeer also increments the refc on the peer.
          */
         err = LockPeer(peerAddr, &peerMksck);
      }
   } while(0);
   release_sock(sk);

   if (err) {
      return err;
   }

   /*
    * Get pointer to sufficient empty space in ring buffer.
    */
   needed = MKSCK_DGSIZE(len);
   while (1) {
      /*
       * Wait to make sure this is the only thread trying to write to ring.
       */
      if ((err = Mutex_Lock(&peerMksck->mutex, MutexModeEX)) < 0) {
         goto decRefc;
      }

      /*
       * Check if socket can receive data.
       */
      if (peerMksck->shutDown & MKSCK_SHUT_RD) {
         err = -ENOTCONN;
         goto unlockDecRefc;
      }

      /*
       * See if there is room for the packet.
       */
      write = Mksck_FindSendRoom(peerMksck, needed);
      if (write != MKSCK_FINDSENDROOM_FULL) {
         break;
      }

      /*
       * No room, unlock socket and maybe wait for room.
       */
      if (msg->msg_flags & MSG_DONTWAIT) {
         err = -EAGAIN;
         goto unlockDecRefc;
      }

      peerMksck->foundFull ++;
      err = Mutex_UnlSleep(&peerMksck->mutex,
                           MutexModeEX,
                           MKSCK_CVAR_ROOM);
      if (err < 0) {
         PRINTK(KERN_INFO "MksckDgramSendMsg: aborted\n");
         goto decRefc;
      }
   }

   /*
    * Point to room in ring and fill in message.
    */
   dg = (void *)&peerMksck->buff[write];

   dg->fromAddr = fromAddr;
   dg->len = len;

   if ((err = memcpy_fromiovec(dg->data, msg->msg_iov, len)) != 0) {
      goto unlockDecRefc;
   }

   /*
    * Increment past message.
    */
   Mksck_IncWriteIndex(peerMksck, write, needed);

   /*
    * Unlock socket and wake someone trying to receive, ie, we filled
    * in a message.
    */
   Mutex_UnlWake(&peerMksck->mutex, MutexModeEX, MKSCK_CVAR_FILL, false);

   /*
    * Maybe guest is in a general 'wait for interrupt' wait or
    * grinding away executing guest instructions.
    *
    * If it has a receive callback armed for the socket and is
    * waiting a message, just wake it up. Else send an IPI to the CPU
    * running the guest so it will interrupt whatever it is doing and
    * read the message.
    *
    * Holding the mksckPage->mutex prevents mksckPage->vmHKVA from
    * clearing on us.
    */
   if (peerMksck->rcvCBEntryMVA != 0) {
      MksckPage *peerMksckPage = Mksck_ToSharedPage(peerMksck);

      if ((err = Mutex_Lock(&peerMksckPage->mutex, MutexModeSH)) == 0) {
         uint32 sockIdx = peerMksck->index;
         MvpkmVM *vm = (MvpkmVM *) peerMksckPage->vmHKVA;

         /*
          * The destruction of vm and wsp is blocked by the
          * mksckPage->mutex.
          */
         if (vm) {
            WorldSwitchPage *wsp = vm->wsp;

            ASSERT(sockIdx < 8 * sizeof peerMksckPage->wakeVMMRecv);
            ATOMIC_ORV(peerMksckPage->wakeVMMRecv, 1U << sockIdx);

            if (wsp) {
               Mvpkm_WakeGuest(vm, ACTION_MKSCK);
            }
         }
         Mutex_Unlock(&peerMksckPage->mutex, MutexModeSH);
      }
   }

   /*
    * If all are happy tell the caller the number of transferred bytes.
    */
   if (!err) {
      err = len;
   }

   /*
    * Now that we are done with target socket, allow it to be freed.
    */
decRefc:
   Mksck_DecRefc(peerMksck);
   return err;

unlockDecRefc:
   Mutex_Unlock(&peerMksck->mutex, MutexModeEX);
   goto decRefc;
}


/**
 * @brief Page fault handler for receive windows. Since the host process
 *        should not be faulting in this region and only be accessing
 *        memory that has been established via a typed message transfer,
 *        we always signal the fault back to the process.
 */
static int
MksckFault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
   return VM_FAULT_SIGBUS;
}

/**
 * @brief Establish a region in the host process suitable for use as a
 *        receive window.
 *
 * @param file file reference (ignored).
 * @param sock user socket structure.
 * @param vma Linux virtual memory area defining the region.
 *
 * @return 0 on success, otherwise error code.
 */
static int
MksckMMap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
   /*
    * All the hard work is done in MksckDgramRecvMsg. Here we simply mark the
    * vma as belonging to Mksck.
    */
   vma->vm_ops = &mksckVMOps;

   return 0;
}

/**
 * @brief This gets called after returning from the monitor.
+ * Since the monitor doesn't directly wake VMX threads when it sends + * something to VMX (for efficiency), this routine checks for the + * omitted wakes and does them. + * @param mksckPage some shared page that the monitor writes packets to, ie + * an host shared page + */ +void +Mksck_WakeBlockedSockets(MksckPage *mksckPage) +{ + Mksck *mksck; + uint32 i, wakeHostRecv; + + wakeHostRecv = mksckPage->wakeHostRecv; + if (wakeHostRecv != 0) { + mksckPage->wakeHostRecv = 0; + for (i = 0; wakeHostRecv != 0; i ++) { + if (wakeHostRecv & 1) { + mksck = &mksckPage->sockets[i]; + Mutex_CondSig(&mksck->mutex, MKSCK_CVAR_FILL, true); + } + wakeHostRecv >>= 1; + } + } +} + +/** + * @brief allocate and initialize a shared page. + * @return pointer to shared page.
+ *         NULL on error
+ */
+MksckPage *
+MksckPageAlloc(void)
+{
+   uint32 jj;
+   /*
+    * Ask for pages in the virtual kernel space. There is no
+    * requirement to be physically contiguous.
+    */
+   MksckPage *mksckPage = vmalloc(MKSCKPAGE_SIZE);
+
+   if (mksckPage) {
+
+      /*
+       * Initialize its contents. Start refCount at 1 and decrement it
+       * when the worldswitch or VM page gets freed.
+       */
+      memset(mksckPage, 0, MKSCKPAGE_SIZE);
+      ATOMIC_SETV(mksckPage->refCount, 1);
+      mksckPage->portStore = MKSCK_PORT_HIGH;
+
+      Mutex_Init(&mksckPage->mutex);
+      for (jj = 0; jj < MKSCK_SOCKETS_PER_PAGE; jj++) {
+         Mutex_Init(&mksckPage->sockets[jj].mutex);
+      }
+   }
+
+   return mksckPage;
+}
+
+/**
+ * @brief Release the allocated pages.
+ * @param mksckPage the address of the mksckPage to be released
+ */
+static void
+MksckPageRelease(MksckPage *mksckPage)
+{
+   int ii;
+
+   for (ii = 0; ii < MKSCK_SOCKETS_PER_PAGE; ii++) {
+      Mutex_Destroy(&mksckPage->sockets[ii].mutex);
+   }
+   Mutex_Destroy(&mksckPage->mutex);
+
+   vfree(mksckPage);
+}
+
+/**
+ * @brief Using the tgid locate the vmid of this process.
+ *        Assumed that mksckPageListLock is held
+ * @return the vmId if page is already allocated,
+ *         the first vacant vmid if not yet allocated.
+ * MKSCK_PORT_UNDEF if no slot is vacant + */ +static inline Mksck_VmId +GetHostVmId(void) +{ + uint32 jj; + Mksck_VmId vmId, vmIdFirstVacant = MKSCK_VMID_UNDEF; + MksckPage *mksckPage; + uint32 tgid = task_tgid_vnr(current); + /* + * Assign an unique vmId to the shared page. Start the search from + * the vmId that is the result of hashing tgid to 15 bits. As a + * used page with a given vmId can occupy only a given slot in the + * mksckPages array, it is enough to search through the + * MKSCK_MAX_SHARES slots for a vacancy. + */ + for (jj = 0, vmId = MKSCK_TGID2VMID(tgid); + jj < MKSCK_MAX_SHARES; + jj++, vmId++) { + if (vmId > MKSCK_VMID_HIGH) { + vmId = 0; + } + mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)]; + + if (mksckPage) { + if (mksckPage->tgid == tgid && + !mksckPage->isGuest) { + return mksckPage->vmId; + } + + } else if (vmIdFirstVacant == MKSCK_VMID_UNDEF) { + vmIdFirstVacant = vmId; + } + } + return vmIdFirstVacant; +} + + +/** + * @brief Locate the first empty slot + * Assumed that mksckPageListLock is held + * @return the first vacant vmid.
+ *         MKSCK_PORT_UNDEF if no slot is vacant
+ */
+static inline Mksck_VmId
+GetNewGuestVmId(void)
+{
+   Mksck_VmId vmId;
+
+   for (vmId = 0; vmId < MKSCK_MAX_SHARES; vmId++) {
+      if (!mksckPages[MKSCK_VMID2IDX(vmId)]) {
+         return vmId;
+      }
+   }
+   return MKSCK_VMID_UNDEF;
+}
+
+
+/**
+ * @brief Find shared page for a given idx. The page referred to be the
+ *        idx should exist and be locked by the caller.
+ * @param idx index of the page in the array
+ * @return pointer to shared page
+ */
+MksckPage *
+MksckPage_GetFromIdx(uint32 idx)
+{
+   MksckPage *mksckPage = mksckPages[idx];
+   ASSERT(mksckPage);
+   ASSERT(idx < MKSCK_MAX_SHARES);
+   ASSERT(ATOMIC_GETO(mksckPage->refCount));
+   return mksckPage;
+}
+
+/**
+ * @brief find shared page for a given vmId
+ *        The vmid should exist and be locked by the caller.
+ * @param vmId vmId to look for, either an host vmId or a guest vmId
+ * @return pointer to shared page
+ */
+MksckPage *
+MksckPage_GetFromVmId(Mksck_VmId vmId)
+{
+   MksckPage *mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)];
+   ASSERT(mksckPage);
+   ASSERT(mksckPage->vmId == vmId);
+   ASSERT(ATOMIC_GETO(mksckPage->refCount));
+   return mksckPage;
+}
+
+
+/**
+ * @brief find shared page for a given vmId
+ * @param vmId vmId to look for, either an host vmId or a guest vmId
+ * @return NULL: no such shared page exists
+ * else: pointer to shared page. + * Call Mksck_DecRefc() when done with pointer + */ +MksckPage * +MksckPage_GetFromVmIdIncRefc(Mksck_VmId vmId) +{ + MksckPage *mksckPage; + + spin_lock(&mksckPageListLock); + mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)]; + + if (!mksckPage || (mksckPage->vmId != vmId)) { + printk(KERN_INFO "MksckPage_GetFromVmIdIncRefc: vmId %04X not found\n", + vmId); + mksckPage = NULL; + } else { + ATOMIC_ADDV(mksckPage->refCount, 1); + } + spin_unlock(&mksckPageListLock); + return mksckPage; +} + + +/** + * @brief find or allocate shared page using tgid + * @return NULL: no such shared page exists
+ * else: pointer to shared page. + * Call Mksck_DecRefc() when done with pointer + */ +MksckPage * +MksckPage_GetFromTgidIncRefc(void) +{ + MksckPage *mksckPage; + Mksck_VmId vmId; + + while (1) { + spin_lock(&mksckPageListLock); + vmId = GetHostVmId(); + + if (vmId == MKSCK_VMID_UNDEF) { + /* + * No vmId has been allocated yet and there is no free slot. + */ + spin_unlock(&mksckPageListLock); + return NULL; + } + + mksckPage = mksckPages[MKSCK_VMID2IDX(vmId)]; + if (mksckPage != NULL) { + /* + * There is a vmid already allocated, increment the refc on it. + */ + ATOMIC_ADDV(mksckPage->refCount, 1); + spin_unlock(&mksckPageListLock); + return mksckPage; + } + + /* + * Have to release spinlock to allocate a new page. + */ + spin_unlock(&mksckPageListLock); + mksckPage = MksckPageAlloc(); + if (mksckPage == NULL) { + return NULL; + } + + /* + * Re-lock and make sure no one else allocated while unlocked. + * If someone else did allocate, free ours off and use theirs. + */ + spin_lock(&mksckPageListLock); + vmId = GetHostVmId(); + if ((vmId != MKSCK_VMID_UNDEF) && + (mksckPages[MKSCK_VMID2IDX(vmId)] == NULL)) { + break; + } + spin_unlock(&mksckPageListLock); + MksckPageRelease(mksckPage); + } + + /* + * This is a successful new allocation. insert it into the table + * and initialize the fields. + */ + mksckPages[MKSCK_VMID2IDX(vmId)] = mksckPage; + mksckPage->vmId = vmId; + mksckPage->isGuest = false; + mksckPage->vmHKVA = 0; + mksckPage->tgid = task_tgid_vnr(current); + printk(KERN_DEBUG "New host mksck page is allocated: idx %x, vmId %x, tgid %d\n", + MKSCK_VMID2IDX(vmId), vmId, mksckPage->tgid); + + spin_unlock(&mksckPageListLock); + return mksckPage; +} + +/** + * @brief Initialize the VMX provided wsp. Allocate communication page. 
+ * @param vm which virtual machine we're running + * @return 0 if all OK, error value otherwise + */ +int +Mksck_WspInitialize(MvpkmVM *vm) +{ + WorldSwitchPage *wsp = vm->wsp; + int err; + Mksck_VmId vmId; + MksckPage *mksckPage; + + if (wsp->guestId) { + err = -EBUSY; + } else if (!(mksckPage = MksckPageAlloc())) { + err = -ENOMEM; + } else { + spin_lock(&mksckPageListLock); + + if ((vmId = GetNewGuestVmId()) == MKSCK_VMID_UNDEF) { + + err = -EMFILE; + MksckPageRelease(mksckPage); + + printk(KERN_INFO "Mksck_WspInitialize: Cannot allocate vmId\n"); + + } else { + /* + * Now that the mksckPage is all initialized, let others see it. + */ + mksckPages[MKSCK_VMID2IDX(vmId)] = mksckPage; + mksckPage->vmId = vmId; + mksckPage->isGuest = true; + mksckPage->vmHKVA = (HKVA)vm; + /* mksckPage->tgid is undefined when isGuest is true */ + + wsp->guestId = vmId; + + printk(KERN_DEBUG "New guest mksck page is allocated: idx %x, vmId %x\n", + MKSCK_VMID2IDX(vmId), vmId); + + err = 0; + } + + /* + * All stable, ie, mksckPages[] written, ok to unlock now. + */ + spin_unlock(&mksckPageListLock); + } + + return err; +} + +/** + * @brief Release the wsp. Clean up after the monitor. Free the + * associated communication page. + * @param wsp which worldswitch page (VCPU) + */ +void +Mksck_WspRelease(WorldSwitchPage *wsp) +{ + int ii; + int err; + MksckPage *mksckPage = MksckPage_GetFromVmId(wsp->guestId); + + /* + * The worldswitch page for a particular VCPU is about to be freed + * off, so we know the monitor will never execute again. But the + * monitor most likely left some sockets open. Those may have + * outbound connections to host sockets that we must close. + * + * Loop through all possibly open sockets. 
+ */ + uint32 isOpened = wsp->isOpened; + Mksck *mksck = mksckPage->sockets; + while (isOpened) { + if (isOpened & 1) { + ASSERT(ATOMIC_GETO(mksck->refCount) != 0); + /* + * The socket may be connected to a peer (host) socket, so we + * have to decrement that target socket's reference + * count. Unfortunately, Mksck_DisconnectPeer(mksck) cannot + * be called as mksck->peer is an mva not an hkva. Translate + * the address first. + */ + if (mksck->peer) { + MksckPage *mksckPagePeer = MksckPage_GetFromVmId(mksck->peerAddr.vmId); + ASSERT(mksckPagePeer); + mksck->peer = MksckPage_GetFromAddr(mksckPagePeer, mksck->peerAddr); + ASSERT(mksck->peer); + /* mksck->peer is now a hkva */ + } + + Mksck_CloseCommon(mksck); + } + isOpened >>= 1; + mksck++; + } + + /* + * A host socket may be in the process of sending to the guest. It + * will attempt to wake up the guest using mksckPage->vmHKVA and + * mksckPage->vmHKVA->wsp. To assure that the vm and wsp structures + * are not disappearing from under the sending thread we lock the + * page here. + */ + err = Mutex_Lock(&mksckPage->mutex, MutexModeEX); + ASSERT(!err); + mksckPage->vmHKVA = 0; + Mutex_Unlock(&mksckPage->mutex, MutexModeEX); + /* + * Decrement refcount set by MksckPageAlloc() call in + * Mksck_WspInitialize(). + */ + MksckPage_DecRefc(mksckPage); + + /* + * Decrement refcount set by VMM:Mksck_Init() referring to the local + * variable guestMksckPage. + */ + if (wsp->guestPageMapped) { + wsp->guestPageMapped = false; + MksckPage_DecRefc(mksckPage); + } + + /* + * Another task is to decrement the reference count on the mksck + * pages the monitor accessed. Those pages are listed in the + * wsp->isPageMapped list. 
They were locked by the monitor + * calling WSCALL_GET_PAGE_FROM_VMID + */ + for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) { + if (wsp->isPageMapped[ii]) { + MksckPage *mksckPageOther = MksckPage_GetFromIdx(ii); + + wsp->isPageMapped[ii] = false; + MksckPage_DecRefc(mksckPageOther); + } + } +} + +/** + * @brief disconnect from peer by decrementing + * peer socket's reference count and clearing the pointer. + * @param mksck local socket to check for connection + */ +void +Mksck_DisconnectPeer(Mksck *mksck) +{ + Mksck *peerMksck = mksck->peer; + if (peerMksck != NULL) { + mksck->peer = NULL; + mksck->peerAddr.addr = MKSCK_ADDR_UNDEF; + Mksck_DecRefc(peerMksck); + } +} + + +/** + * @brief decrement shared page reference count, free page if it goes zero. + * also do a dmb first to make sure all activity on the struct is + * finished before decrementing the ref count. + * @param mksckPage shared page + */ +void +MksckPage_DecRefc(MksckPage *mksckPage) +{ + uint32 oldRefc; + + DMB(); + do { + while ((oldRefc = ATOMIC_GETO(mksckPage->refCount)) == 1) { + + /* + * Find corresponding entry in list of known shared pages and + * clear it so we can't open any new sockets on this shared + * page, thus preventing its refCount from being incremented. + */ + spin_lock(&mksckPageListLock); + if (ATOMIC_SETIF(mksckPage->refCount, 0, 1)) { + uint32 ii = MKSCK_VMID2IDX(mksckPage->vmId); + ASSERT(ii < MKSCK_MAX_SHARES); + ASSERT(mksckPages[ii] == mksckPage); + mksckPages[ii] = NULL; + spin_unlock(&mksckPageListLock); + printk(KERN_DEBUG "%s mksck page is released: idx %x, vmId %x, tgid %d\n", + mksckPage->isGuest?"Guest":"Host", + ii, mksckPage->vmId, mksckPage->tgid); + MksckPageRelease(mksckPage); + return; + } + spin_unlock(&mksckPageListLock); + } + ASSERT(oldRefc != 0); + } while (!ATOMIC_SETIF(mksckPage->refCount, oldRefc - 1, oldRefc)); +} + +/** + * @brief Lookup if the provided mpn belongs to one of the Mksck pages. Map if found. 
+ * @return 0 if all OK, error value otherwise + */ +int +MksckPage_LookupAndInsertPage(struct vm_area_struct *vma, + unsigned long address, + MPN mpn) +{ + int ii, jj; + MksckPage **mksckPagePtr = mksckPages; + + spin_lock(&mksckPageListLock); + for (jj = MKSCK_MAX_SHARES; jj--; mksckPagePtr++) { + if (*mksckPagePtr) { + for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) { + if (vmalloc_to_pfn((void*)(((HKVA)*mksckPagePtr) + ii*PAGE_SIZE)) == mpn && + vm_insert_page(vma, address, pfn_to_page(mpn)) == 0) { + spin_unlock(&mksckPageListLock); + return 0; + } + } + } + } + spin_unlock(&mksckPageListLock); + return -1; +} + + +/** + * @brief Print information on the allocated shared pages + * + * This function reports (among many other things) on the use of locks + * on the mksck page (page lock and individual socket locks). To avoid + * the Hiesenberg effect it avoids using locks unless there is a + * danger of dereferencing freed memory. In particular, holding + * mksckPageListLock ensures that the mksck page is not freed while it + * is read. But under very rare conditions this function may report + * inconsistent or garbage data. + */ +static int +MksckPageInfoShow(struct seq_file *m, void *private) +{ + int ii, jj; + uint32 isPageMapped = 0; + int err; + MvpkmVM *vm; + + /* + * Lock is needed to atomize the test and dereference of + * mksckPages[ii] + */ + spin_lock(&mksckPageListLock); + for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) { + MksckPage *mksckPage = mksckPages[ii]; + if (mksckPage != NULL && mksckPage->isGuest) { + /* + * After the refcount is incremented mksckPage will not be + * freed and it can continued to be dereferenced after the + * unlock of mksckPageListLock. + */ + ATOMIC_ADDV(mksckPage->refCount, 1); + spin_unlock(&mksckPageListLock); + + /* + * To dereference mksckPage->vmHKVA, we need to have the page + * lock. 
+ */
+         err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+         vm = (MvpkmVM *) mksckPage->vmHKVA;
+
+         if (err == 0 && vm && vm->wsp) {
+            for (jj = 0; jj < MKSCK_MAX_SHARES; jj++) {
+               if (vm->wsp->isPageMapped[jj]) isPageMapped |= 1<<jj;
+            }
+         }
+         Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+         /*
+          * Decrement the page refcount and relock the
+          * mksckPageListLock for the next for loop.
+          */
+         MksckPage_DecRefc(mksckPage);
+         spin_lock(&mksckPageListLock);
+         break;
+      }
+   }
+
+   /* mksckPageListLock is still locked, mksckPages[ii] can be dereferenced */
+   for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) {
+      MksckPage *mksckPage = mksckPages[ii];
+      if (mksckPage != NULL) {
+         uint32 lState = ATOMIC_GETO(mksckPage->mutex.state);
+         uint32 isOpened = 0; /* guest has an implicit ref */
+
+         seq_printf(m, "MksckPage[%02d]: { vmId = %4x(%c), refC = %2d%s",
+                    ii, mksckPage->vmId,
+                    mksckPage->isGuest?'G':'H',
+                    ATOMIC_GETO(mksckPage->refCount),
+                    (isPageMapped&(1<<ii))?"*":"");
+
+         if (lState) {
+            seq_printf(m, ", lock=%x locked by line %d, unlocked by %d",
+                       lState, mksckPage->mutex.line, mksckPage->mutex.lineUnl);
+         }
+
+
+         if (!mksckPage->isGuest) {
+            struct task_struct *target;
+            seq_printf(m, ", tgid = %d", mksckPage->tgid);
+
+            rcu_read_lock();
+
+            target = pid_task(find_vpid(mksckPage->tgid), PIDTYPE_PID);
+            seq_printf(m, "(%s)", target ? target->comm : "no such process");
+
+            rcu_read_unlock();
+         } else {
+            ATOMIC_ADDV(mksckPage->refCount, 1);
+            spin_unlock(&mksckPageListLock);
+
+            err = Mutex_Lock(&mksckPage->mutex, MutexModeEX);
+            vm = (MvpkmVM *) mksckPage->vmHKVA;
+
+            if (err == 0 && vm && vm->wsp) {
+               isOpened = vm->wsp->isOpened;
+            }
+            Mutex_Unlock(&mksckPage->mutex, MutexModeEX);
+            MksckPage_DecRefc(mksckPage);
+            spin_lock(&mksckPageListLock);
+            /*
+             * As the mksckPageListLock was unlocked, nothing
+             * prevented the MksckPage_DecRefc from actually freeing
+             * the page. Lets verify that the page is still there.
+ */ + if (mksckPage != mksckPages[ii]) { + seq_printf(m, " released }\n"); + continue; + } + } + seq_printf(m, ", sockets[] = {"); + + for (jj = 0; jj < mksckPage->numAllocSocks; jj++, isOpened >>= 1) { + Mksck *mksck = mksckPage->sockets + jj; + + if (ATOMIC_GETO(mksck->refCount)) { + uint32 blocked; + lState = ATOMIC_GETO(mksck->mutex.state); + seq_printf(m, "\n { addr = %8x, refC = %2d%s%s%s", + mksck->addr.addr, + ATOMIC_GETO(mksck->refCount), + (isOpened & 1 ? "*" : ""), + (mksck->shutDown & MKSCK_SHUT_RD ? " SHUTD_RD":""), + (mksck->shutDown & MKSCK_SHUT_WR ? " SHUTD_WR":"")); + + if (mksck->peer) { + seq_printf(m, ", peerAddr = %8x", + mksck->peerAddr.addr); + } + + if (lState) { + seq_printf(m, ", lock=%x locked by line %d, unlocked by %d", + lState, mksck->mutex.line, mksck->mutex.lineUnl); + } + + if ((blocked = ATOMIC_GETO(mksck->mutex.blocked))) { + seq_printf(m, ", blocked=%d", blocked); + } + + seq_printf(m, " }"); + } + } + seq_printf(m, " } }\n"); + } + } + spin_unlock(&mksckPageListLock); + + return 0; +} + + +static int +MksckPageInfoOpen(struct inode *inode, struct file *file) +{ + return single_open(file, MksckPageInfoShow, inode->i_private); +} + +static const struct file_operations mksckPageInfoFops = { + .open = MksckPageInfoOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *mksckPageDentry = NULL; + +void +MksckPageInfo_Init(void) +{ + mksckPageDentry = debugfs_create_file("mksckPage", + S_IROTH, + NULL, + NULL, + &mksckPageInfoFops); +} + +void +MksckPageInfo_Exit(void) +{ + if (mksckPageDentry) { + debugfs_remove(mksckPageDentry); + } +} diff --git a/arch/arm/mvp/mvpkm/mksck_kernel.h b/arch/arm/mvp/mvpkm/mksck_kernel.h new file mode 100644 index 0000000..233b780 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck_kernel.h @@ -0,0 +1,68 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The monitor-kernel socket interface kernel-only definitions. + */ + +#ifndef _MKSCK_KERNEL_H +#define _MKSCK_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mksck_shared.h" + +/* + * prototypes + */ +int Mksck_Init(void); +void Mksck_Exit(void); +void Mksck_WakeBlockedSockets(MksckPage *mksckPage); +MksckPage *MksckPage_GetFromTgidIncRefc(void); +MksckPage *MksckPage_GetFromVmIdIncRefc(Mksck_VmId vmId); +MksckPage *MksckPage_GetFromIdx(uint32 idx); +void MksckPageInfo_Init(void); +void MksckPageInfo_Exit(void); +int Mksck_WspInitialize(MvpkmVM *vm); +void Mksck_WspRelease(WorldSwitchPage *wsp); +int MksckPage_LookupAndInsertPage(struct vm_area_struct *vma, + unsigned long address, + MPN mpn); + +/* + * Mksck open request must come from this uid. 
+ */ +extern uid_t Mvpkm_vmwareUid; + +#define MKSCK_DEVEL 0 + +#if MKSCK_DEVEL +#define PRINTK printk +#else +#define PRINTK if (0) printk +#endif + +#define HOST_CPUID_UNDEF (~0) + +#endif diff --git a/arch/arm/mvp/mvpkm/mksck_shared.c b/arch/arm/mvp/mvpkm/mksck_shared.c new file mode 100644 index 0000000..68c38fc6 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck_shared.c @@ -0,0 +1,343 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +#include "mvp.h" +#include "mksck_shared.h" + +/** + * @file + * + * @brief The mksck shared area functions used by the monitor and the + * kernel extension. + * + */ + +/** + * @brief try to locate a socket using an address. + * @param mksckPage which shared page to look on. + * ASSUMED: locked for shared access + * @param addr address to check + * @return pointer to mksck page with addr. 
+ * NULL if not found + */ +Mksck * +MksckPage_GetFromAddr(MksckPage *mksckPage, Mksck_Address addr) +{ + Mksck *mksck = mksckPage->sockets; + uint32 ii; + + ASSERT(addr.vmId == mksckPage->vmId); + + for (ii = mksckPage->numAllocSocks; ii--; mksck++) { + if ((ATOMIC_GETO(mksck->refCount) != 0) && + (mksck->addr.addr == addr.addr)) { + return mksck; + } + } + return NULL; +} + +/** + * @brief Close a monitor socket. + * + * @param mksck pointer to the socket control block + */ +void +Mksck_CloseCommon(Mksck *mksck) +{ + /* + * If a peer was connected, release the peer. + */ + Mksck_DisconnectPeer(mksck); + + /* + * Signal senders that this socket won't be read anymore. + */ + while (Mutex_Lock(&mksck->mutex, MutexModeEX) < 0); + mksck->shutDown = MKSCK_SHUT_WR | MKSCK_SHUT_RD; + Mutex_UnlWake(&mksck->mutex, MutexModeEX, MKSCK_CVAR_ROOM, true); + + /* + * Decrement reference count because it was set to 1 when opened. It could + * still be non-zero after this if some other thread is currently sending to + * this socket. + */ + Mksck_DecRefc(mksck); +} + + +/** + * @brief decrement socket reference count, free if it goes zero. Also do a + * dmb first to make sure all activity on the struct is finished before + * decrementing the ref count. + * @param mksck socket + */ +void +Mksck_DecRefc(Mksck *mksck) +{ + uint32 oldRefc; + + DMB(); + do { + while ((oldRefc = ATOMIC_GETO(mksck->refCount)) == 1) { + + MksckPage *mksckPage = Mksck_ToSharedPage(mksck); + + /* + * Socket refcount is going zero on a socket that locks mksckPage in. + * Lock shared page exclusive to make sure no one is trying to look + * for this socket, thus preventing socket's refcount from being + * incremented non-zero once we decrement it to zero. + */ + + /* + * Lock failed probably because of an interrupt. Keep trying + * to lock until we succeed. + */ + while (Mutex_Lock(&mksckPage->mutex, MutexModeEX) < 0); + + /* + * No one is doing any lookups, so set refcount zero. 
+ */ + if (ATOMIC_SETIF(mksck->refCount, 0, 1)) { +#if 0 + /** + * @knownjira{MVP-1349} + * The standard Log is not yet implemented in the kernel space. + */ + KNOWN_BUG(MVP-1349); + PRINTK(KERN_INFO "Mksck_DecRefc: %08X shutDown %u, foundEmpty %u," + " foundFull %u, blocked %u\n", + mksck->addr.addr, mksck->shutDown, + mksck->foundEmpty, mksck->foundFull, + ATOMIC_GETO(mksck->mutex.blocked)); +#endif + + /* + * Sockets can't have connected peers by the time their + * refc hits 0. The owner should have cleaned that up by + * now. + */ + ASSERT(mksck->peer == 0); + + /* + * Successfully set to zero, release mutex and decrement + * shared page ref count as it was incremented when the + * socket was opened. This may free the shared page. + */ + Mutex_Unlock(&mksckPage->mutex, MutexModeEX); + MksckPage_DecRefc(mksckPage); + return; + } + + /* + * Someone incremented refcount just before we locked the mutex, so + * try it all again. + */ + Mutex_Unlock(&mksckPage->mutex, MutexModeEX); + } + + /* + * Not going zero or doesn't lock mksckPage, simple decrement. + */ + ASSERT(oldRefc != 0); + } while (!ATOMIC_SETIF(mksck->refCount, oldRefc - 1, oldRefc)); +} + + +/** + * @brief Find an unused port. + * @param mksckPage which shared page to look in. 
+ *        Locked for exclusive access
+ * @param port if not MKSCK_PORT_UNDEF test only this port
+ * @return port allocated or MKSCK_PORT_UNDEF if none was found
+ */
+Mksck_Port
+MksckPage_GetFreePort(MksckPage *mksckPage, Mksck_Port port)
+{
+   Mksck_Address addr = { .addr = Mksck_AddrInit(mksckPage->vmId, port) };
+   uint32 ii;
+
+   if (port == MKSCK_PORT_UNDEF) {
+      /*
+       * NOTE(review): the loop bound was lost in a mangled extraction of this
+       * patch ("for (ii = 0; ii<...>portStore--;"); MKSCK_PORT_HIGH candidate
+       * ports is a reconstruction -- confirm against the pristine source.
+       */
+      for (ii = 0; ii < MKSCK_PORT_HIGH; ii++) {
+         addr.port = mksckPage->portStore--;
+         if (!addr.port) {
+
+            /*
+             * Wrapped around, reset portStore
+             */
+            mksckPage->portStore = MKSCK_PORT_HIGH;
+         }
+
+         if (!MksckPage_GetFromAddr(mksckPage, addr)) {
+            return addr.port;
+         }
+      }
+
+   } else if (!MksckPage_GetFromAddr(mksckPage, addr)) {
+      return addr.port;
+   }
+
+   return MKSCK_PORT_UNDEF;
+}
+
+/**
+ * @brief Find an unused slot in the sockets[] array and allocate it.
+ * @param mksckPage which shared page to look in.
+ *        Locked for exclusive access
+ * @param addr what local address to assign to the socket
+ * @return NULL: no slots available
+ * else: pointer to allocated socket + */ +Mksck * +MksckPage_AllocSocket(MksckPage *mksckPage, Mksck_Address addr) +{ + Mksck *mksck; + uint32 i; + + for (i = 0; (offsetof(MksckPage, sockets[i+1]) <= MKSCKPAGE_SIZE) && + (i < 8 * sizeof mksckPage->wakeHostRecv) && + (i < 8 * sizeof mksckPage->wakeVMMRecv); i ++) { + mksck = &mksckPage->sockets[i]; + if (ATOMIC_GETO(mksck->refCount) == 0) { + ATOMIC_SETV(mksck->refCount, 1); + mksck->addr = addr; + mksck->peerAddr.addr = MKSCK_ADDR_UNDEF; + mksck->peer = NULL; + mksck->index = i; + mksck->write = 0; + mksck->read = 0; + mksck->shutDown = 0; + mksck->foundEmpty = 0; + mksck->foundFull = 0; + ATOMIC_SETV(mksck->mutex.blocked, 0); + mksck->rcvCBEntryMVA = 0; + mksck->rcvCBParamMVA = 0; + + if (mksckPage->numAllocSocks < ++ i) { + mksckPage->numAllocSocks = i; + } + + return mksck; + } + } + return NULL; +} + + +/** + * @brief increment read index over the packet just read + * @param mksck socket packet was read from. + * Locked for exclusive access + * @param read current value of mksck->read + * @param dg datagram at current mksck->read + * @return with mksck->read updated to next packet
+ * false: buffer not empty
+ * true: buffer now empty + */ +_Bool +Mksck_IncReadIndex(Mksck *mksck, uint32 read, Mksck_Datagram *dg) +{ + ASSERT(read == mksck->read); + ASSERT((void *)dg == (void *)&mksck->buff[read]); + + read += MKSCK_DGSIZE(dg->len); + if ((read > mksck->write) && (read >= mksck->wrap)) { + ASSERT(read == mksck->wrap); + read = 0; + } + mksck->read = read; + + return read == mksck->write; +} + + +/** + * @brief find index in buffer that has enough room for a packet + * @param mksck socket message is being sent to. + * Locked for exclusive access + * @param needed room needed, including dg header and rounded up + * @return MKSCK_FINDSENDROOM_FULL: not enough room available
+ * else: index in mksck->buff for packet + */ +uint32 +Mksck_FindSendRoom(Mksck *mksck, uint32 needed) +{ + uint32 read, write; + + /* + * We must leave at least one byte unused so receiver can distinguish full + * from empty. + */ + read = mksck->read; + write = mksck->write; + if (write == read) { + if (needed < MKSCK_BUFSIZE) { + mksck->read = 0; + mksck->write = 0; + return 0; + } + } else if (write < read) { + if (write + needed < read) { + return write; + } + } else { + if (write + needed < MKSCK_BUFSIZE) { + return write; + } + if ((write + needed == MKSCK_BUFSIZE) && (read > 0)) { + return write; + } + if (needed < read) { + mksck->wrap = write; + mksck->write = 0; + return 0; + } + } + + return MKSCK_FINDSENDROOM_FULL; +} + + +/** + * @brief increment read index over the packet just written + * @param mksck socket packet was written to. + * Locked for exclusive access + * @param write as returned by @ref Mksck_FindSendRoom + * @param needed as passed to @ref Mksck_FindSendRoom + * @return with mksck->write updated to next packet + */ +void +Mksck_IncWriteIndex(Mksck *mksck, uint32 write, uint32 needed) +{ + ASSERT(write == mksck->write); + write += needed; + if (write >= MKSCK_BUFSIZE) { + ASSERT(write == MKSCK_BUFSIZE); + mksck->wrap = MKSCK_BUFSIZE; + write = 0; + } + ASSERT(write != mksck->read); + mksck->write = write; +} diff --git a/arch/arm/mvp/mvpkm/mksck_shared.h b/arch/arm/mvp/mvpkm/mksck_shared.h new file mode 100644 index 0000000..2677ec1 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck_shared.h @@ -0,0 +1,189 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The monitor-kernel socket interface shared area definitions. + */ + +#ifndef _MKSCK_SHARED_H +#define _MKSCK_SHARED_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/* + * Allocated MksckPages are stored in an array of size + * MKSCK_MAX_SHARES. The vmid and the slot index of a shared page is + * not unrelated: vmid = idx%MKSCK_MAX_SHARES. + */ +#define MKSCK_MAX_SHARES_LOG2 4 // 16: one per VM + one per VCPU +#define MKSCK_MAX_SHARES (1U << MKSCK_MAX_SHARES_LOG2) +#define MKSCK_VMID2IDX(idx) ((idx)%MKSCK_MAX_SHARES) +#define MKSCK_TGID2VMID(tgid) (((((tgid)<<1)^((tgid)>>15))&0xfffe)|1) +/* + * The size of a shared page determines how many sockets can be open + * concurrently. + */ +#define MKSCKPAGE_TOTAL 8 // number of shared pages +#define MKSCKPAGE_SIZE (PAGE_SIZE * MKSCKPAGE_TOTAL) +#define MKSCK_SOCKETS_PER_PAGE ((MKSCKPAGE_SIZE-offsetof(MksckPage, sockets[0])) / \ + sizeof(Mksck)) + +/* + * Individual datagrams are aligned on a MKSCK_ALIGNMENT byte boundary + * in the data receive area of a socket. + */ +#define MKSCK_ALIGNMENT 8 // data packet alignment +#define MKSCK_ALIGN(x) MVP_ALIGN(x, MKSCK_ALIGNMENT) +#define MKSCK_DGSIZE(len) offsetof(Mksck_Datagram, data[MKSCK_ALIGN(len)]) +#define MKSCK_BUFSIZE MKSCK_DGSIZE(MKSCK_XFER_MAX + 1) + +/* + * Conditional variables for sleeping on. 
+ */ +#define MKSCK_CVAR_ROOM 0 // senders waiting for room for message +#define MKSCK_CVAR_FILL 1 // receivers waiting for a message to fetch + +#define MKSCK_FINDSENDROOM_FULL 0xFFFFFFFFU + +/* + * Shutdown bits + */ +#define MKSCK_SHUT_WR (1 << 0) // socket can't send data anymore +#define MKSCK_SHUT_RD (1 << 1) // socket can't receive data anymore + +typedef struct Mksck Mksck; +typedef struct Mksck_Datagram Mksck_Datagram; +typedef struct MksckPage MksckPage; + +#include "atomic.h" +#include "mksck.h" +#include "mmu_defs.h" +#include "mutex.h" +#include "arm_inline.h" + +/** + * @brief Monitor-kernel socket datagram structure + */ +struct Mksck_Datagram { + Mksck_Address fromAddr; ///< source address + uint32 len : 16; ///< length of the data + uint32 pad : 3; ///< padding between untyped message and mpn + ///< array. + uint32 pages : 13; ///< number of pages in mpn array + uint8 data[1] ///< start of the data + __attribute__((aligned(MKSCK_ALIGNMENT))); +}; + +/** + * @brief one particular socket's shared page data. + */ +struct Mksck { + AtmUInt32 refCount; ///< when zero, struct is free + ///< ... increment only with mksckPage->mutex + ///< ... decrement at any time + Mksck_Address addr; ///< this socket's address if open + ///< ... MKSCK_ADDR_UNDEF if closed + ///< ... open only with mksckPage->mutex + Mksck_Address peerAddr; ///< peer's address if connected + ///< ... MKSCK_ADDR_UNDEF if not + struct Mksck *peer; ///< connected peer's ptr or NULL if not + ///< ... ptr is MVA for monitor sockets and + ///< ... HKVA for sockets of host processes + ///< ... holds ref count on target socket + uint32 index; ///< index of this socket in page + + ///< empty ring indicated by read == write + ///< ring never completely fills, always at + ///< least room for one more byte so we can tell + ///< empty from full + + uint32 write; ///< index within buff to insert next data + ///< ... 
always < MKSCK_BUFSIZE + uint32 read; ///< index within buff to remove next data + ///< ... always < MKSCK_BUFSIZE + uint32 wrap; ///< current wrapping point + ///< ... valid only whenever write < read + uint32 shutDown; ///< MKSCK_SHUT_RD, MKSCK_SHUT_WR bitfield + uint32 foundEmpty; ///< number of times a receive has blocked + uint32 foundFull; ///< number of times a send has blocked + Mutex mutex; ///< locks the ring buffer + MVA rcvCBEntryMVA; ///< monitor's receive callback entrypoint + MVA rcvCBParamMVA; ///< monitor's receive callback parameter + uint8 buff[MKSCK_BUFSIZE] ///< data going TO this socket + __attribute__((aligned(MKSCK_ALIGNMENT))); +}; + + +/** + * @brief the shared page of an address domain (vmId) + */ +struct MksckPage { + _Bool isGuest; ///< the page belongs to a monitor/guest + uint32 tgid; ///< thread group id if isGuest=true + ///< undefined otherwise + volatile HKVA vmHKVA; ///< host side local data structure for vm + AtmUInt32 refCount; ///< page cannot be freed unless this is zero + ///< ... increment only with mksckPageListLock + ///< ... decrement at any time + ///< ... initialized to 1 for wsp->mksckPage* pointers + uint32 wakeHostRecv; ///< bitmask of sockets[] to be woken for receive + ///< ... 
access from VCPU thread only + AtmUInt32 wakeVMMRecv; ///< likewise for monitor receive callbacks + Mutex mutex; ///< locks list of open sockets + Mksck_VmId vmId; ///< hostId or guestId these sockets are for + Mksck_Port portStore; ///< used to assign ephemeral port numbers + uint32 numAllocSocks; ///< number of elements in sockets[] array + Mksck sockets[1]; ///< array of sockets (to fill MKSCKPAGE_SIZE) +}; + +MksckPage *MksckPage_GetFromVmId(Mksck_VmId vmId); +Mksck_Port MksckPage_GetFreePort(MksckPage *mksckPage, Mksck_Port port); +Mksck *MksckPage_GetFromAddr(MksckPage *mksckPage, Mksck_Address addr); +Mksck *MksckPage_AllocSocket(MksckPage *mksckPage, Mksck_Address addr); +void MksckPage_DecRefc(MksckPage *mksckPage); + +void Mksck_DecRefc(Mksck *mksck); +void Mksck_CloseCommon(Mksck *mksck); +_Bool Mksck_IncReadIndex(Mksck *mksck, uint32 read, Mksck_Datagram *dg); +uint32 Mksck_FindSendRoom(Mksck *mksck, uint32 needed); +void Mksck_IncWriteIndex(Mksck *mksck, uint32 write, uint32 needed); +void Mksck_DisconnectPeer(Mksck *mksck); + + +/** + * @brief determine which shared page a given socket is on + * Note that this process does not rely on any directory. + * @param mksck pointer to socket + * @return pointer to shared page + */ +static inline MksckPage * +Mksck_ToSharedPage(Mksck *mksck) +{ + return (MksckPage*)((char*)(mksck - mksck->index) + - offsetof(MksckPage, sockets)); +} +#endif diff --git a/arch/arm/mvp/mvpkm/mksck_sockaddr.h b/arch/arm/mvp/mvpkm/mksck_sockaddr.h new file mode 100644 index 0000000..e99d1f5 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mksck_sockaddr.h @@ -0,0 +1,50 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Host user space definitions for mksck sockets. + */ + +#ifndef _MKSCK_SOCKADDR_H_ +#define _MKSCK_SOCKADDR_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mksck.h" + +/* no one ever uses DECnet anymore? */ +#define AF_MKSCK AF_DECnet +#define PF_MKSCK PF_DECnet + +/* Address structure used by the host user socket interface. */ +struct sockaddr_mk { + sa_family_t mk_family; + Mksck_Address mk_addr; +}; + +#endif diff --git a/arch/arm/mvp/mvpkm/mmu_defs.h b/arch/arm/mvp/mvpkm/mmu_defs.h new file mode 100644 index 0000000..340b91b --- /dev/null +++ b/arch/arm/mvp/mvpkm/mmu_defs.h @@ -0,0 +1,218 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MMU-related definitions. + */ + +#ifndef _MMU_DEFS_H_ +#define _MMU_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @name ARM address space identifier. + * @{ + */ +#define ARM_ASID_BITS 8 +#define ARM_ASID_NUM (1 << ARM_ASID_BITS) +#define ARM_ASID_MASK (ARM_ASID_NUM - 1) +/*@}*/ + +/** + * @name ARM level 1 and 2 page table sizes. + * @{ + */ +#define ARM_L1PT_ORDER 14 +#define ARM_L2PT_FINE_ORDER 12 +#define ARM_L2PT_COARSE_ORDER 10 + +#define ARM_L1D_SECTION_ORDER 20 +#define ARM_L1D_SUPERSECTION_ORDER 24 + +#define ARM_L2D_SMALL_ORDER 12 +#define ARM_L2D_LARGE_ORDER 16 + +#define ARM_L1PT_SIZE (1 << ARM_L1PT_ORDER) +#define ARM_L2PT_FINE_SIZE (1 << ARM_L2PT_FINE_ORDER) +#define ARM_L2PT_COARSE_SIZE (1 << ARM_L2PT_COARSE_ORDER) + +#define ARM_L1D_SECTION_SIZE (1 << ARM_L1D_SECTION_ORDER) +#define ARM_L1D_SUPERSECTION_SIZE (1 << ARM_L1D_SUPERSECTION_ORDER) + +#define ARM_L2D_SMALL_SIZE (1 << ARM_L2D_SMALL_ORDER) +#define ARM_L2D_LARGE_SIZE (1 << ARM_L2D_LARGE_ORDER) + +#define ARM_L2PT_COARSE_PER_PAGE (PAGE_SIZE / ARM_L2PT_COARSE_SIZE) + +#define ARM_L1PT_ENTRIES (ARM_L1PT_SIZE / sizeof(ARM_L1D)) +#define ARM_L2PT_FINE_ENTRIES (ARM_L2PT_FINE_SIZE / sizeof(ARM_L2D)) +#define ARM_L2PT_COARSE_ENTRIES (ARM_L2PT_COARSE_SIZE / sizeof(ARM_L2D)) +/*@}*/ + +/** + * @brief Level 1 descriptor type field values. 
+ * @{ + */ +#define ARM_L1D_TYPE_INVALID 0 +#define ARM_L1D_TYPE_COARSE 1 +#define ARM_L1D_TYPE_SECTION 2 +#define ARM_L1D_TYPE_SUPERSECTION 2 +/*@}*/ + +/** + * @name Decomposition of virtual addresses for page table indexing. + * @{ + */ +#define ARM_L1PT_INDX(addr) MVP_EXTRACT_FIELD((addr), 20, 12) +#define ARM_L2PT_COARSE_INDX(addr) MVP_EXTRACT_FIELD((addr), 12, 8) +/*@}*/ + +/** + * @name Mapping from the VA/PA/MA of a LxD entry to its table index. + * @{ + */ +#define ARM_L1D_PTR_INDX(l1dp) MVP_BITS((uint32)(l1dp), 2, ARM_L1PT_ORDER - 1) +#define ARM_L2D_PTR_INDX(l2dp) MVP_BITS((uint32)(l2dp), 2, ARM_L2PT_COARSE_ORDER - 1) +/*@}*/ + +/** + * @name L1D base index <-> MA. + * @{ + */ +#define ARM_L1D_BASE_ADDR(base) ((base) << ARM_L1PT_ORDER) +#define ARM_L1D_ADDR_BASE(addr) ((addr) >> ARM_L1PT_ORDER) +/*@}*/ + +/** + * @brief Which 1 MB section of a 16 MB supersection does the given addr lie in? + */ +#define ARM_SUPER_SECTION_INDEX(addr) MVP_EXTRACT_FIELD((addr), 20, 4) + +/** + * @name L1D entry base <-> either MA or MA of a second-level table. + * @{ + */ +#define ARM_L1D_SUPERSECTION_BASE_ADDR(base) ((base) << ARM_L1D_SUPERSECTION_ORDER) +#define ARM_L1D_SUPERSECTION_ADDR_BASE(addr) ((addr) >> ARM_L1D_SUPERSECTION_ORDER) +#define ARM_L1D_SECTION_BASE_ADDR(base) ((base) << ARM_L1D_SECTION_ORDER) +#define ARM_L1D_SECTION_ADDR_BASE(addr) ((addr) >> ARM_L1D_SECTION_ORDER) +#define ARM_L1D_COARSE_BASE_ADDR(base) ((base) << ARM_L2PT_COARSE_ORDER) +#define ARM_L1D_COARSE_ADDR_BASE(addr) ((addr) >> ARM_L2PT_COARSE_ORDER) +#define ARM_L1D_FINE_BASE_ADDR(base) ((base) << ARM_L2PT_FINE_ORDER) +#define ARM_L1D_FINE_ADDR_BASE(addr) ((addr) >> ARM_L2PT_FINE_ORDER) +/*@}*/ + +/* + * The number of L1 page directory pages the service the entire + * virtual space + */ +#define ARM_L1PT_PAGES (1<<(ARM_L1PT_ORDER - PAGE_ORDER)) + + +/** + * @name Level 2 descriptor type field values. 
+ * @{ + */ +#define ARM_L2D_TYPE_INVALID 0 +#define ARM_L2D_TYPE_LARGE 0 +#define ARM_L2D_TYPE_SMALL 1 +#define ARM_L2D_XTYPE_LARGE 1 +#define ARM_L2D_XTYPE_SMALL 2 +#define ARM_L2D_XTYPE_SMALL_NX 3 +/*@}*/ + +/** + * @name Small/Large L2D (in coarse table) base <-> MA conversion. + * @{ + */ +#define ARM_L2D_LARGE_BASE_ADDR(base) ((base) << ARM_L2D_LARGE_ORDER) +#define ARM_L2D_LARGE_ADDR_BASE(addr) ((addr) >> ARM_L2D_LARGE_ORDER) +#define ARM_L2D_SMALL_BASE_ADDR(base) ((base) << ARM_L2D_SMALL_ORDER) +#define ARM_L2D_SMALL_ADDR_BASE(addr) ((addr) >> ARM_L2D_SMALL_ORDER) + +#define ARM_L2D_SMALL_PAGE_NUMBER(addr) ARM_L2D_SMALL_ADDR_BASE(addr) +#define ARM_L2D_SMALL_PAGE_OFFSET(addr) ((addr) & (PAGE_SIZE - 1)) +/* @}*/ + +/** + * @brief ARM page table descriptor access permissions for the AP field. + * @{ + */ +#define ARM_PERM_NONE 0 +#define ARM_PERM_PRIV_RW 1 +#define ARM_PERM_USER_RO 2 +#define ARM_PERM_USER_RW 3 +/*@}*/ + +/** + * @name Simplified access permission model introduced in ARMv7. + * + * AP[0] is an access flag, AP[2:1] are one of the following. + * + * @{ + */ +#define ARM_SIMPLE_PERM_KERN_RW 0 +#define ARM_SIMPLE_PERM_USER_RW 1 +#define ARM_SIMPLE_PERM_KERN_RO 2 +#define ARM_SIMPLE_PERM_USER_RO 3 + +#define ARM_SIMPLE_PERM_AP_KERN 1 +#define ARM_SIMPLE_PERM_AP_USER 3 + +#define ARM_SIMPLE_PERM_APX_RW 0 +#define ARM_SIMPLE_PERM_APX_RO 1 + +#define ARM_SIMPLE_PERM_AP(x) ((MVP_BIT(x, 0) << 1) | 1) +#define ARM_SIMPLE_PERM_APX(x) MVP_BIT(x, 1) +/*@}*/ + +/** + * @name ARM domains. + * @{ + */ +#define ARM_DOMAINS 16 + +#define ARM_DOMAIN_NOACCESS 0 +#define ARM_DOMAIN_CLIENT 1 +#define ARM_DOMAIN_RESERVED 2 +#define ARM_DOMAIN_MANAGER 3 +/*@}*/ + +#define ARM_DOMAIN_INDEX(dacr,dom) MVP_EXTRACT_FIELD((dacr), 2*(dom), 2) +#define ARM_DOMAIN_ACCESS(dom,access) ((access) << (2*(dom))) + +/* + * Cache-related definitions. 
+ */ +#define ARM_CACHE_LEVELS_MAX 8 +#define ARM_CACHE_LINE_SIZE_MAX 2048 + +#endif /// _MMU_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/mmu_types.h b/arch/arm/mvp/mvpkm/mmu_types.h new file mode 100644 index 0000000..da8a6fa --- /dev/null +++ b/arch/arm/mvp/mvpkm/mmu_types.h @@ -0,0 +1,226 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MMU-related types. + */ + +#ifndef _MMU_TYPES_H_ +#define _MMU_TYPES_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mmu_defs.h" + +/** + * @brief ARM level 1 page table descriptor. See B3-8 ARM DDI 0406B. 
+ */ +typedef union { + uint32 u; + + struct { + uint32 type : 2; + uint32 xx : 30; + } x; + + struct { + uint32 type : 2; + uint32 sbz1 : 1; + uint32 ns : 1; + uint32 sbz2 : 1; + uint32 domain : 4; + uint32 imp : 1; + uint32 base : 22; + } coarse; + + struct { + uint32 type : 2; + uint32 cb : 2; + uint32 xn : 1; + uint32 domain : 4; + uint32 imp : 1; + uint32 ap : 2; + uint32 tex : 3; + uint32 apx : 1; + uint32 s : 1; + uint32 ng : 1; + uint32 sbz : 1; + uint32 ns : 1; + uint32 base : 12; + } section; + + struct { + uint32 type : 2; + uint32 cb : 2; + uint32 xn : 1; + uint32 xbase2 : 4; + uint32 imp : 1; + uint32 ap : 2; + uint32 tex : 3; + uint32 apx : 1; + uint32 s : 1; + uint32 ng : 1; + uint32 sbo : 1; + uint32 ns : 1; + uint32 xbase1 : 4; + uint32 base : 8; + } supersection; +} ARM_L1D; + +/** + * @brief ARM level 2 page table descriptor. See B3-10 ARM DDI 0406B. + */ +typedef union { + uint32 u; + + struct { + uint32 type : 2; + uint32 cb : 2; + uint32 xx : 28; + } x; + + struct { + uint32 type : 2; + uint32 cb : 2; + uint32 ap : 2; + uint32 sbz : 3; + uint32 apx : 1; + uint32 s : 1; + uint32 ng : 1; + uint32 tex : 3; + uint32 xn : 1; + uint32 base : 16; + } large; + + struct { + uint32 xn : 1; + uint32 type : 1; + uint32 cb : 2; + uint32 ap : 2; + uint32 tex : 3; + uint32 apx : 1; + uint32 s : 1; + uint32 ng : 1; + uint32 base : 20; + } small; +} ARM_L2D; + +/** + * @brief Get the simplified access permissions from a small L2 descriptor. + * + * @param l2D value of L2 descriptor. + * + * @return Simplified access permissions. + */ +static inline uint8 +ARM_L2DSimpleAP(ARM_L2D l2D) +{ + ASSERT(l2D.small.type == ARM_L2D_TYPE_SMALL); + return (l2D.small.apx << 1) | (l2D.small.ap >> 1); +} + +/** + * @brief Permissions for a page - intermediate format. + */ +typedef struct { + uint8 ap : 2; + uint8 apx : 1; + uint8 xn : 1; +} ARM_AccessPerms; + +/** + * @brief ARM domain (0-15). 
+ */ +typedef uint8 ARM_Domain; + +/** + * @brief ARM Domain Access Control Register, see B4.9.4 ARM DDI 0100I. + */ +typedef uint32 ARM_DACR; + +/** + * @brief ARM address space identifier. + * 8-bits with an "invalid ASID" value + * representation. + */ +typedef uint32 ARM_ASID; + +#define ARM_INVALID_ASID ((uint32)(-1)) + +/** + * @brief Page shareability property. + * + * LPAE encoding, see p8 ARM PRD03-GENC-008469 11.0. + */ +typedef enum { + ARM_SHARE_ATTR_NONE, + ARM_SHARE_ATTR_RESERVED, + ARM_SHARE_ATTR_OUTER, + ARM_SHARE_ATTR_INNER, +} PACKED ARM_ShareAttr; + +/** + * @brief Page cacheability property (TEX Remap disabled). + * + * ARM C/B bits, see B4.4.1 ARM DDI 0100I. + */ +typedef enum { + ARM_CB_UNBUFFERED = 0, + ARM_CB_UNCACHED = 1, + ARM_CB_WRITETHROUGH = 2, + ARM_CB_WRITEBACK = 3 +} PACKED ARM_CB; + +/** + * @brief Normal page cacheability property (TEX Remap enabled). + * + * NMRR encoding, see B3-146 ARM DDI 0406B. + */ +typedef enum { + ARM_CACHE_ATTR_NORMAL_NONE, + ARM_CACHE_ATTR_NORMAL_WB_WALLOC, + ARM_CACHE_ATTR_NORMAL_WT, + ARM_CACHE_ATTR_NORMAL_WB +} PACKED ARM_CacheAttrNormal; + +/** + * @brief Normal page memory attributes. + * + * Captures the general case of distinct inner/outer cacheability/shareability. + * See A3-30 ARM DDI 0406B for a discussion of shareability domains and + * cacheability attributes. + */ +typedef struct { + ARM_ShareAttr share; + ARM_CacheAttrNormal innerCache; + ARM_CacheAttrNormal outerCache; +} ARM_MemAttrNormal; + +#endif /// _MMU_TYPES_H_ diff --git a/arch/arm/mvp/mvpkm/montimer_kernel.c b/arch/arm/mvp/mvpkm/montimer_kernel.c new file mode 100644 index 0000000..e2f8ef8 --- /dev/null +++ b/arch/arm/mvp/mvpkm/montimer_kernel.c @@ -0,0 +1,102 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MVP host kernel implementation of monitor timers + * + * The monitor sends requests that are simply a 64-bit absolute time that it + * wants a reply. If it changes its mind, it simply sends a different 64-bit + * absolute time. It is tolerant of us replying too soon, so if we miss the + * update to a later time, it doesn't matter, the monitor will re-send the + * request for the later time. The only time we should miss an update to a + * sooner time is when we are about to send the reply to the old time anyway, + * in which case the monitor sees a reply as quickly as we can generate them, + * so no harm there either. 
 */

/* NOTE(review): the two header names below were lost in extraction —
 * presumably <linux/hrtimer.h> and <linux/ktime.h> given the hrtimer/ktime
 * use in this file; confirm against the upstream patch. */
#include
#include

#include "mvp.h"
#include "mvp_timer.h"
#include "actions.h"
#include "mvpkm_kernel.h"

/**
 * @brief Linux timer callback
 *
 * Fires when the hrtimer armed by MonitorTimer_Request expires; wakes the
 * guest with ACTION_TIMER so the monitor sees its timer reply.
 *
 * @param timer The linux timer raised
 * @return Status to not restart the timer (one-shot semantics)
 */
static enum hrtimer_restart
MonitorTimerCB(struct hrtimer *timer)
{
   /* Recover the owning VM from the embedded hrtimer. */
   MvpkmVM *vm = container_of(timer, MvpkmVM, monTimer.timer);
   Mvpkm_WakeGuest(vm, ACTION_TIMER);
   return HRTIMER_NORESTART;
}

/**
 * @brief Initialize vm associated timer
 *
 * Sets up the per-VM hrtimer (absolute, monotonic clock) and its callback.
 *
 * @param vm which virtual machine we're running
 */
void
MonitorTimer_Setup(MvpkmVM *vm)
{
   MonTimer *monTimer = &vm->monTimer;
   monTimer->vm = vm;

   hrtimer_init(&monTimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
   monTimer->timer.function = MonitorTimerCB;
}

/**
 * @brief New timer request from monitor
 *
 * @param monTimer Monitor timer
 * @param when64   Timer target value; 0 cancels any pending request
 */
void
MonitorTimer_Request(MonTimer *monTimer, uint64 when64)
{
   if (when64) {
      ktime_t kt;

      /*
       * Simple conversion, assuming RATE64 is 1e+9
       * (i.e. when64 is already in nanoseconds).
       */
      kt = ns_to_ktime(when64);
      ASSERT_ON_COMPILE(MVP_TIMER_RATE64 == 1000000000);

      /*
       * Start the timer. If it was already active, it will remove
       * the previous expiration time. Linux handles correctly timer
       * with deadline in the past, and forces a safety minimal delta
       * for closer timer deadlines.
       */
      hrtimer_start(&monTimer->timer, kt, HRTIMER_MODE_ABS);
   } else {
      /*
       * Cancel a pending request. If there is none, this will do nothing.
       * If it's too late, monitor tolerance will forgive us.
       */
      hrtimer_cancel(&monTimer->timer);
   }
}
diff --git a/arch/arm/mvp/mvpkm/montimer_kernel.h b/arch/arm/mvp/mvpkm/montimer_kernel.h
new file mode 100644
index 0000000..6817a83
--- /dev/null
+++ b/arch/arm/mvp/mvpkm/montimer_kernel.h
@@ -0,0 +1,47 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The monitor-kernel socket interface kernel-only definitions. + */ + +#ifndef _MONITOR_TIMER_KERNEL_H +#define _MONITOR_TIMER_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include + +/** + * @brief Monitor Timer structure + */ +typedef struct { + struct MvpkmVM *vm; ///< Associated vm + struct hrtimer timer; ///< Linux timer +} MonTimer; + +void MonitorTimer_Setup(struct MvpkmVM *vm); +void MonitorTimer_Request(MonTimer *monTimer, uint64 when64); + +#endif diff --git a/arch/arm/mvp/mvpkm/monva_common.h b/arch/arm/mvp/mvpkm/monva_common.h new file mode 100644 index 0000000..de3dd1a --- /dev/null +++ b/arch/arm/mvp/mvpkm/monva_common.h @@ -0,0 +1,106 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Constant definitions that describing the monitor memory layout + * (common to both LPV and VE monitors). + * + */ + +#ifndef _MONVA_COMMON_H_ +#define _MONVA_COMMON_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mmu_defs.h" +#include "mmu_types.h" + +/* + * The monitor occupies a hole in the guest virtual address space. + * The following macros define that hole. + */ + +#define MONITOR_VA_START ((MVA)0xE8000000) +#define MONITOR_VA_LEN 0x03000000 + +/* + * Worldswitch page gets mapped right after the stack guard. + */ +#define MONITOR_VA_WORLDSWITCH \ + ((MVA)(MONITOR_VA_START + 3 * PAGE_SIZE)) + +#define MONITOR_VA_WORLDSWITCH_CODE \ + (MONITOR_VA_WORLDSWITCH + PAGE_SIZE) + +#define MONITOR_VA_UART \ + (MONITOR_VA_WORLDSWITCH_CODE + PAGE_SIZE) + +/** + * @brief Type of physmem region mapping that we want the VMX to know about. + * Helps to identify Guest page allocations. + */ +typedef enum { + MEMREGION_MAINMEM = 1, + MEMREGION_MODULE = 2, + MEMREGION_WSP = 3, + MEMREGION_MONITOR_MISC = 4, + MEMREGION_DEFAULT = 0 +} PACKED PhysMem_RegionType; + +typedef struct MonVA { /* Note that this struct is VE only */ + MA l2BaseMA; ///< MA of monitor L2 page table page + MVA excVec; ///< Monitor exception vector virtual address +} MonVA; + +/** + * @brief Monitor VA mapping type, device or memory. + * + * These values are used to index HMAIR0 in the VE monitor - do not change + * without making the required update to HMAIR0. 
+ */ +typedef enum { + MVA_MEMORY = 0, + MVA_DEVICE = 1 +} MVAType; + +/** + * @name Monitor types, used in VMX, Mvpkm and monitors. + * + * This is not a C enumeration, as we may want to use the values in CPP macros. + * + * @{ + */ +#define MONITOR_TYPE_LPV 0 +#define MONITOR_TYPE_VE 1 +#define MONITOR_TYPE_UNKNOWN 0xf + +typedef uint32 MonitorType; +/*@}*/ + +#endif diff --git a/arch/arm/mvp/mvpkm/mutex.h b/arch/arm/mvp/mvpkm/mutex.h new file mode 100644 index 0000000..30de97d --- /dev/null +++ b/arch/arm/mvp/mvpkm/mutex.h @@ -0,0 +1,107 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Common mutex definitions. 
+ */ + +#ifndef _MUTEX_H +#define _MUTEX_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define MUTEX_CVAR_MAX 2 ///< maximum number of condition variables supported + ///< on a given mutex + +typedef enum MutexMode MutexMode; +typedef struct HKWaitQ HKWaitQ; +typedef struct Mutex Mutex; + +/** + * @brief modes for locking + */ +enum MutexMode { + MutexModeSH = 1, ///< minimum value that can be saved in low + ///< 16 bits of 'state', ie, it won't allow + ///< any other EXs in there without overflowing. + ///< it also will block if there are already + ///< 0xFFFF other shared accesses, but it should + ///< be of little consequence. + + MutexModeEX = 0xFFFF ///< maximum value that can be saved in low + ///< 16 bits of 'state', ie, it won't allow + ///< any other EXs or SHs in there without + ///< overflowing, thus causing a block. +}; + +#include "atomic.h" + +typedef union Mutex_State { + uint32 state; ///< for atomic setting/reading + struct { + uint16 mode; ///< the sum of mode values of MutexMode + uint16 blck; ///< The number of threads blocked + }; +} Mutex_State; + +/** + * @brief shareable mutex struct. + */ +struct Mutex { + HKVA mtxHKVA; ///< mutex's host kernel virtual address + AtmUInt32 state; ///< low 16 bits: # of granted shared accessors + ///< or FFFF if granted exclusive + ///< high 16 bits: # of blocked threads + AtmUInt32 waiters; ///< number of threads on all condWaitQs + ///< ... increment only with mutex locked EX + ///< ... decrement any time + AtmUInt32 blocked; ///< number times blocked (stats only) + HKVA lockWaitQ; ///< threads blocked for mutex to be unlocked + HKVA cvarWaitQs[MUTEX_CVAR_MAX]; ///< condition variables + /* + * Padding to keep binary compatibility @see{MVP-1876} + * These padding bytes can be used for debugging. 
+ */ + int line; + int lineUnl; + uint32 pad3; + uint32 pad4; + uint32 pad5; + uint32 pad6; +}; + +#define Mutex_Lock(a, b) Mutex_LockLine(a, b, __FILE__, __LINE__) +#define Mutex_Unlock(a, b) Mutex_UnlockLine(a, b, __LINE__) +#define Mutex_UnlSleep(a, b, c) Mutex_UnlSleepLine(a, b, c, __FILE__, __LINE__) +#define Mutex_UnlSleepTest(a, b, c, d, e) Mutex_UnlSleepTestLine(a, b, c, d, e, __FILE__, __LINE__) +int Mutex_LockLine(Mutex *mutex, MutexMode mode, const char *file, int line); +void Mutex_UnlockLine(Mutex *mutex, MutexMode mode, int line); +int Mutex_UnlSleepLine(Mutex *mutex, MutexMode mode, uint32 cvi, const char *file, int line); +int Mutex_UnlSleepTestLine(Mutex *mutex, MutexMode mode, uint32 cvi, AtmUInt32 *test, uint32 mask, const char *file, int line); +void Mutex_UnlWake(Mutex *mutex, MutexMode mode, uint32 cvi, _Bool all); + +#endif diff --git a/arch/arm/mvp/mvpkm/mutex_kernel.c b/arch/arm/mvp/mvpkm/mutex_kernel.c new file mode 100644 index 0000000..7b76bfcf --- /dev/null +++ b/arch/arm/mvp/mvpkm/mutex_kernel.c @@ -0,0 +1,480 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The host kernel mutex functions. 
These mutexes can be located in + * shared address space with the monitor. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "mvp.h" + +#include "arm_inline.h" +#include "coproc_defs.h" +#include "mutex_kernel.h" + +#define POLL_IN_PROGRESS_FLAG (1<<(30-MUTEX_CVAR_MAX)) + +#define INITWAITQ(waitQ) do { \ + init_waitqueue_head((wait_queue_head_t *)(waitQ)); \ +} while (0) + +#define WAKEUPALL(waitQ) do { \ + wake_up_all((wait_queue_head_t *)(waitQ)); \ +} while (0) + +#define WAKEUPONE(waitQ) do { \ + wake_up((wait_queue_head_t *)(waitQ)); \ +} while (0) + +/** + * @brief initialize mutex + * @param[in,out] mutex mutex to initialize + */ +void +Mutex_Init(Mutex *mutex) +{ + wait_queue_head_t *wq; + int i; + + wq = kcalloc(MUTEX_CVAR_MAX + 1, sizeof(wait_queue_head_t), 0); + FATAL_IF(wq == NULL); + + memset(mutex, 0, sizeof *mutex); + mutex->mtxHKVA = (HKVA)mutex; + mutex->lockWaitQ = (HKVA)&wq[0]; + INITWAITQ(mutex->lockWaitQ); + for (i = 0; i < MUTEX_CVAR_MAX; i ++) { + mutex->cvarWaitQs[i] = (HKVA)&wq[i + 1]; + INITWAITQ(mutex->cvarWaitQs[i]); + } +} + +/** + * @brief Check if it is ok to sleep + * @param file the file of the caller code + * @param line the line number of the caller code + */ +static void +MutexCheckSleep(const char *file, int line) +{ +#ifdef MVP_DEVEL + static unsigned long prev_jiffy; /* ratelimiting: 1/s */ + +#ifdef CONFIG_PREEMPT + if (preemptible() && !irqs_disabled()) { + return; + } +#else + if (!irqs_disabled()) { + return; + } +#endif + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) { + return; + } + prev_jiffy = jiffies; + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "irqs_disabled(): %d, preemtible(): %d, pid: %d, name: %s\n", + irqs_disabled(), + preemptible(), + current->pid, current->comm); + dump_stack(); +#endif +} + +/** + * @brief destroy mutex + * @param[in,out] mutex mutex to destroy + */ +void 
+Mutex_Destroy(Mutex *mutex) +{ + kfree((void*)mutex->lockWaitQ); +} + +/** + * @brief Lock the mutex. Also does a data barrier after locking so the + * locking is complete before any shared data is accessed. + * @param[in,out] mutex which mutex to lock + * @param mode mutex lock mode + * @param file the file of the caller code + * @param line the line number of the code that called this function + * @return rc = 0: mutex now locked by caller
 *             < 0: interrupted
 */
int
Mutex_LockLine(Mutex *mutex, MutexMode mode, const char *file, int line)
{
   Mutex_State newState, oldState;

   MutexCheckSleep(file, line);

   /*
    * If uncontended, just set new lock state and return success status.
    * If contended, mark state saying there is a waiting thread to wake.
    */
   do {
lock_start:
      /*
       * Get current state and calculate what new state would be.
       * New state adds 1 for shared and 0xFFFF for exclusive.
       * If the 16 bit field overflows, there is contention.
       */
      oldState.state = ATOMIC_GETO(mutex->state);
      newState.mode = oldState.mode + mode;
      newState.blck = oldState.blck;

      /*
       * So we are saying there is no contention if new state
       * indicates no overflow.
       *
       * On fairness: The test here allows a new-comer thread to grab
       * the lock even if there is a blocked thread. For example 2
       * threads repeatedly obtaining shared access can starve a third
       * wishing to obtain an exclusive lock. Currently this is only a
       * hypothetical situation as mksck use exclusive lock only and
       * the code never has more than 2 threads using the same mutex.
       */
      /* newState.mode >= mode iff the 16-bit add did not wrap: no overflow
       * means the grant fits, so we may take the lock. */
      if ((uint32)newState.mode >= (uint32)mode) {
         if (!ATOMIC_SETIF(mutex->state, newState.state, oldState.state)) {
            goto lock_start;
         }
         /* Barrier: lock must be visible before any protected data access. */
         DMB();
         mutex->line = line;
         mutex->lineUnl = -1;
         return 0;
      }

      /*
       * There is contention, so increment the number of blocking threads.
       */
      newState.mode = oldState.mode;
      newState.blck = oldState.blck + 1;
   } while (!ATOMIC_SETIF(mutex->state, newState.state, oldState.state));

   /*
    * Statistics...
    */
   ATOMIC_ADDV(mutex->blocked, 1);

   /*
    * Mutex is contended, state has been updated to say there is a blocking
    * thread.
    *
    * So now we block till someone wakes us up.
    */
   do {
      DEFINE_WAIT(waiter);

      /*
       * This will make sure we catch any wakes done after we check the lock
       * state again.
       */
      prepare_to_wait((wait_queue_head_t *)mutex->lockWaitQ,
                      &waiter,
                      TASK_INTERRUPTIBLE);

      /*
       * Now that we will catch wakes, check the lock state again. If now
       * uncontended, mark it locked, abandon the wait and return success.
       */

set_new_state:
      /*
       * Same as the original check for contention above, except that we
       * must decrement the number of waiting threads by one
       * if we are successful in locking the mutex.
       */
      oldState.state = ATOMIC_GETO(mutex->state);
      newState.mode = oldState.mode + mode;
      newState.blck = oldState.blck - 1;
      /* We incremented blck above, so it must still be non-zero here. */
      ASSERT(oldState.blck);

      if ((uint32)newState.mode >= (uint32)mode) {
         if (!ATOMIC_SETIF(mutex->state, newState.state, oldState.state)) {
            goto set_new_state;
         }
         /*
          * Mutex is no longer contended and we were able to lock it.
          */
         finish_wait((wait_queue_head_t *)mutex->lockWaitQ, &waiter);
         DMB();
         mutex->line = line;
         mutex->lineUnl = -1;
         return 0;
      }

      /*
       * Wait for a wake that happens any time after prepare_to_wait()
       * returned.
       */
      WARN(!schedule_timeout(10*HZ), "Mutex_Lock: soft lockup - stuck for 10s!\n");
      finish_wait((wait_queue_head_t *)mutex->lockWaitQ, &waiter);
   } while (!signal_pending(current));

   /*
    * We aren't waiting anymore, so decrement the number of waiting threads.
    * (Reached only when interrupted by a signal.)
    */
   do {
      oldState.state = ATOMIC_GETO(mutex->state);
      newState.mode = oldState.mode;
      newState.blck = oldState.blck - 1;

      ASSERT(oldState.blck);

   } while (!ATOMIC_SETIF(mutex->state, newState.state, oldState.state));

   return -ERESTARTSYS;
}


/**
 * @brief Unlock the mutex.  Also does a data barrier before unlocking so any
 *        modifications made before the lock gets released will be completed
 *        before the lock is released.
+ * @param mutex as passed to Mutex_Lock() + * @param mode as passed to Mutex_Lock() + * @param line the line number of the code that called this function + */ +void +Mutex_UnlockLine(Mutex *mutex, MutexMode mode, int line) +{ + Mutex_State newState, oldState; + + DMB(); + do { + oldState.state = ATOMIC_GETO(mutex->state); + newState.mode = oldState.mode - mode; + newState.blck = oldState.blck; + mutex->lineUnl = line; + + ASSERT(oldState.mode >= mode); + } while (!ATOMIC_SETIF(mutex->state, newState.state, oldState.state)); + + /* + * If another thread was blocked, then wake it up. + */ + if (oldState.blck) { + if (mode == MutexModeSH) { + WAKEUPONE(mutex->lockWaitQ); + } else { + WAKEUPALL(mutex->lockWaitQ); + } + } +} + + +/** + * @brief Unlock the mutex and sleep. Also does a data barrier before + * unlocking so any modifications made before the lock gets released + * will be completed before the lock is released. + * @param mutex as passed to Mutex_Lock() + * @param mode as passed to Mutex_Lock() + * @param cvi which condition variable to sleep on + * @param file the file of the caller code + * @param line the line number of the caller code + * @return rc = 0: successfully waited
+ * < 0: error waiting + */ +int +Mutex_UnlSleepLine(Mutex *mutex, MutexMode mode, uint32 cvi, const char *file, int line) +{ + return Mutex_UnlSleepTestLine(mutex, mode, cvi, NULL, 0, file, line); +} + +/** + * @brief Unlock the mutex and sleep. Also does a data barrier before + * unlocking so any modifications made before the lock gets released + * will be completed before the lock is released. + * @param mutex as passed to Mutex_Lock() + * @param mode as passed to Mutex_Lock() + * @param cvi which condition variable to sleep on + * @param test sleep only if null or pointed atomic value mismatches mask + * @param mask bitfield to check test against before sleeping + * @param file the file of the caller code + * @param line the line number of the caller code + * @return rc = 0: successfully waited
 *             < 0: error waiting
 */
int
Mutex_UnlSleepTestLine(Mutex *mutex, MutexMode mode, uint32 cvi, AtmUInt32 *test, uint32 mask, const char *file, int line)
{
   DEFINE_WAIT(waiter);

   MutexCheckSleep(file, line);

   ASSERT(cvi < MUTEX_CVAR_MAX);

   /*
    * Tell anyone who might try to wake us that they need to actually call
    * WAKEUP***().
    */
   ATOMIC_ADDV(mutex->waiters, 1);

   /*
    * Be sure to catch any wake that comes along just after we unlock the mutex
    * but before we call schedule().
    */
   prepare_to_wait_exclusive((wait_queue_head_t *)mutex->cvarWaitQs[cvi],
                             &waiter,
                             TASK_INTERRUPTIBLE);

   /*
    * Release the mutex, someone can wake us up now.
    * They will see mutex->waiters non-zero so will actually do the wake.
    */
   Mutex_Unlock(mutex, mode);

   /*
    * Wait to be woken or interrupted.
    * Skip the sleep when the caller supplied a test word and one of its mask
    * bits is already set (the awaited event already happened).
    */
   if (test == NULL || (ATOMIC_GETO(*test) & mask) == 0) {
      schedule();
   }
   finish_wait((wait_queue_head_t *)mutex->cvarWaitQs[cvi], &waiter);

   /*
    * Done waiting, don't need a wake any more.
    */
   ATOMIC_SUBV(mutex->waiters, 1);

   /*
    * If interrupted, return error status.
    */
   if (signal_pending(current)) {
      return -ERESTARTSYS;
   }

   /*
    * Wait completed, return success status.
    */
   return 0;
}


/**
 * @brief Unlock the mutex and prepare to sleep on a kernel polling table
 *        given as anonymous parameters for poll_wait
 * @param mutex as passed to Mutex_Lock()
 * @param mode as passed to Mutex_Lock()
 * @param cvi which condition variable to sleep on
 * @param filp which file to poll_wait upon
 * @param wait which poll_table to poll_wait upon
 */
void
Mutex_UnlPoll(Mutex *mutex, MutexMode mode, uint32 cvi, void *filp, void *wait)
{
   ASSERT(cvi < MUTEX_CVAR_MAX);

   /* poll_wait is done with mutex locked to prevent any wake that comes and
    * defer them just after we unlock the mutex but before kernel polling
    * tables are used
    * Note that the kernel is probably avoiding an exclusive wait in that case
    * and also increments the usage for the file given in filp
    */
   poll_wait(filp, (wait_queue_head_t *)mutex->cvarWaitQs[cvi], wait);

   /*
    * Tell anyone who might try to wake us that they need to actually call
    * WAKEUP***().  This is done in putting ourselves in a "noisy" mode since
    * there is no guaranty that we would really sleep, or if we would be
    * wakening the sleeping thread with that socket or condition.  This is
    * done using a POLL_IN_PROGRESS_FLAG, but unfortunately it has to be
    * a per-cvi flag, in case we would poll independently on different cvi
    */
   DMB();
   ATOMIC_ORO(mutex->waiters, (POLL_IN_PROGRESS_FLAG << cvi));

   /*
    * Release the mutex, someone can wake us up now.
    * They will see mutex->waiters non-zero so will actually do the wake.
    */
   Mutex_Unlock(mutex, mode);
}


/**
 * @brief Unlock the semaphore and wake sleeping threads.  Also does a data
 *        barrier before unlocking so any modifications made before the lock
 *        gets released will be completed before the lock is released.
 * @param mutex as passed to Mutex_Lock()
 * @param mode as passed to Mutex_Lock()
 * @param cvi which condition variable to signal
 * @param all false: wake a single thread
+ * true: wake all threads + */ +void +Mutex_UnlWake(Mutex *mutex, MutexMode mode, uint32 cvi, _Bool all) +{ + Mutex_Unlock(mutex, mode); + Mutex_CondSig(mutex, cvi, all); +} + + +/** + * @brief Signal condition variable, ie, wake up anyone waiting. + * @param mutex mutex that holds the condition variable + * @param cvi which condition variable to signal + * @param all false: wake a single thread
+ * true: wake all threads + */ +void +Mutex_CondSig(Mutex *mutex, uint32 cvi, _Bool all) +{ + uint32 waiters; + + ASSERT(cvi < MUTEX_CVAR_MAX); + + waiters = ATOMIC_GETO(mutex->waiters); + if (waiters != 0) { + /* Cleanup the effects of Mutex_UnlPoll() but only when it is SMP safe, + * considering that atomic and wakeup operations should also do memory + * barriers accordingly. This is mandatory otherwise rare SMP races are + * even possible, since Mutex_CondSig is called with the associated mutex + * unlocked, and that does not prevent from select() to run parallel ! + */ + if ((waiters >= POLL_IN_PROGRESS_FLAG) && + !waitqueue_active((wait_queue_head_t *)mutex->cvarWaitQs[cvi])) { + ATOMIC_ANDO(mutex->waiters, ~(POLL_IN_PROGRESS_FLAG << cvi)); + } + DMB(); + + if (all) { + WAKEUPALL(mutex->cvarWaitQs[cvi]); + } else { + WAKEUPONE(mutex->cvarWaitQs[cvi]); + } + } +} diff --git a/arch/arm/mvp/mvpkm/mutex_kernel.h b/arch/arm/mvp/mvpkm/mutex_kernel.h new file mode 100644 index 0000000..4bdf0e1 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mutex_kernel.h @@ -0,0 +1,41 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The host kernel mutex definitions. 
+ */ + +#ifndef _MUTEX_KERNEL_H +#define _MUTEX_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mutex.h" + +void Mutex_Init(Mutex *mutex); +void Mutex_Destroy(Mutex *mutex); +void Mutex_CondSig(Mutex *mutex, uint32 cvi, _Bool all); +void Mutex_UnlPoll(Mutex *mutex, MutexMode mode, uint32 cvi, void *filp, void *wait); + +#endif diff --git a/arch/arm/mvp/mvpkm/mvp.h b/arch/arm/mvp/mvpkm/mvp.h new file mode 100644 index 0000000..e21b8a0 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp.h @@ -0,0 +1,48 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief top-level include for all basic includes. + * This file should not define anything of its own. 
+ */ + +#ifndef _MVP_H +#define _MVP_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "mvp_compiler.h" +#include "utils.h" +#include "mvp_assert.h" +#include "mvp_types.h" +#include "platdefx.h" + +#endif diff --git a/arch/arm/mvp/mvpkm/mvp_assert.h b/arch/arm/mvp/mvpkm/mvp_assert.h new file mode 100644 index 0000000..9ee6fc0 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_assert.h @@ -0,0 +1,125 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief ASSERT() and related macros. 
 */

#ifndef _MVP_ASSERT_H
#define _MVP_ASSERT_H

#define INCLUDE_ALLOW_MVPD
#define INCLUDE_ALLOW_VMX
#define INCLUDE_ALLOW_MODULE
#define INCLUDE_ALLOW_MONITOR
#define INCLUDE_ALLOW_PV
#define INCLUDE_ALLOW_HOSTUSER
#define INCLUDE_ALLOW_GUESTUSER
#define INCLUDE_ALLOW_WORKSTATION
#define INCLUDE_ALLOW_GPL
#include "include_check.h"

/* ASSERT is ASSERT_BUG with no associated ticket number. */
#define ASSERT(_x) ASSERT_BUG((_x),0)

#ifndef NDEBUG
/* Debug builds: a failed assertion calls FatalError with the ticket id. */
#define ASSERT_BUG(_x,_tkt) do { \
   if (UNLIKELY(!(_x))) { \
      FatalError(__FILE__, __LINE__, FECodeAssert, _tkt, NULL); \
   } \
} while (0)

/* Like ASSERT but forwards a printf-style message to FatalError. */
#define ASSERTF(_x, ...) do { \
   if (UNLIKELY(!(_x))) { \
      FatalError(__FILE__, \
                 __LINE__, \
                 FECodeAssert, \
                 0, \
                 __VA_ARGS__); \
   } \
} while (0)
#else

/* Release builds: keep the expression type-checked but generate no code. */
#define ASSERT_BUG(_x,_tkt) (void)sizeof((int)(_x))
#define ASSERTF(_x, ...) ASSERT_BUG(_x, 0)

#endif

/*
 * Compile-time assertions.
 *
 * ASSERT_ON_COMPILE does not use the common
 * switch (0) { case 0: case (e): ; } trick because some compilers (e.g. MSVC)
 * generate code for it.
 *
 * The implementation uses both enum and typedef because the typedef alone is
 * insufficient; gcc allows arrays to be declared with non-constant expressions
 * (even in typedefs, where it makes no sense).
 */
#ifdef __COVERITY__
#define ASSERT_ON_COMPILE(e) ASSERT(e)
#else
#define ASSERT_ON_COMPILE(e) \
   do { \
      enum { AssertOnCompileMisused = ((e) ? 1 : -1) }; \
      typedef char AssertOnCompileFailed[AssertOnCompileMisused]; \
   } while (0)
#endif

/*
 * To put an ASSERT_ON_COMPILE() outside a function, wrap it
 * in MY_ASSERTS().  The first parameter must be unique in
 * each .c file where it appears.  For example,
 *
 *    MY_ASSERTS(FS3_INT,
 *       ASSERT_ON_COMPILE(sizeof(FS3_DiskLock) == 128);
 *       ASSERT_ON_COMPILE(sizeof(FS3_DiskLockReserved) == DISK_BLOCK_SIZE);
 *       ASSERT_ON_COMPILE(sizeof(FS3_DiskBlock) == DISK_BLOCK_SIZE);
 *       ASSERT_ON_COMPILE(sizeof(Hardware_DMIUUID) == 16);
 *    )
 *
 * Caution: ASSERT() within MY_ASSERTS() is silently ignored.
 * The same goes for anything else not evaluated at compile time.
 */

#define MY_ASSERTS(name, assertions) \
   static inline void name(void) { \
      assertions \
   }

/* Marker macro: annotates a known-buggy site with its ticket; expands to nothing. */
#define KNOWN_BUG(_tkt)

#define NOT_IMPLEMENTED() NOT_IMPLEMENTED_JIRA(0)
#define NOT_IMPLEMENTED_JIRA(_tkt,...) FatalError(__FILE__, __LINE__, FECodeNI, _tkt, NULL)

#define NOT_IMPLEMENTED_IF(_x) NOT_IMPLEMENTED_IF_JIRA((_x),0)
#define NOT_IMPLEMENTED_IF_JIRA(_x,_tkt,...) do { if (UNLIKELY(_x)) NOT_IMPLEMENTED_JIRA(_tkt); } while (0)
/*
 * All sites tagged with this are @knownjira{MVP-1855}.
 */
#define NOT_IMPLEMENTEDF(...) FatalError(__FILE__, __LINE__, FECodeNI, 0, __VA_ARGS__)

#define NOT_REACHED() FatalError(__FILE__, __LINE__, FECodeNR, 0, NULL)

#include "fatalerror.h"
#include "nottested.h"

#endif
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Common guest/host balloon state machine. + */ +#ifndef _MVP_BALLOON_H +#define _MVP_BALLOON_H + +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_MODULE +#include "include_check.h" + +/** + * @brief Balloon watchdog timeout (in seconds). + * + * If we don't hear back from the guest balloon driver in this amount of time, + * we terminate the guest. + * + * This can sound arbitrary long but we need to deal with checkpointing. The + * watchdog goal is only to not let not-responding VM running for ages. + */ +#define BALLOON_WATCHDOG_TIMEOUT_SECS 90 + +/** + * @brief MVP_BALLOON_GET_DELTA return. + */ +typedef union { + struct { + int32 delta : 21; ///< Number/direction balloon adjustment in pages. + }; + uint32 u; +} Balloon_GetDeltaRet; + +/** + * @name Guest settings for lowmemorykiller oom_adj and minfree thresholds, as reflected in + * the guest's /sys/module/lowmemorykiller/parameters/{minfree,adj}. + * + * @{ + */ + +/** + * @brief Android oom_adj levels for the various thresholds. + */ +typedef enum { + BALLOON_ANDROID_GUEST_OOM_ADJ_FOREGROUND_APP = 0, + BALLOON_ANDROID_GUEST_OOM_ADJ_VISIBLE_APP = 1, + BALLOON_ANDROID_GUEST_OOM_ADJ_SECONDARY_SERVER = 2, + BALLOON_ANDROID_GUEST_OOM_ADJ_BACKUP_APP = 2, + BALLOON_ANDROID_GUEST_OOM_ADJ_HOME_APP = 4, + BALLOON_ANDROID_GUEST_OOM_ADJ_HIDDEN_APP_MIN = 7, + BALLOON_ANDROID_GUEST_OOM_ADJ_CONTENT_PROVIDER = 14, + BALLOON_ANDROID_GUEST_OOM_ADJ_EMPTY_APP = 15 +} Balloon_AndroidGuestOOMAdj; + +/** + * @brief Android low memory killer thresholds (in pages). 
/**
 * @brief Android low memory killer thresholds (in pages).
 */
typedef enum {
   BALLOON_ANDROID_GUEST_MIN_FREE_FOREGROUND_APP_PAGES = 1536,
   BALLOON_ANDROID_GUEST_MIN_FREE_VISIBLE_APP_PAGES = 2048,
   BALLOON_ANDROID_GUEST_MIN_FREE_SECONDARY_SERVER_PAGES = 4096,
   BALLOON_ANDROID_GUEST_MIN_FREE_BACKUP_APP_PAGES = 4096,
   BALLOON_ANDROID_GUEST_MIN_FREE_HOME_APP_PAGES = 4096,
   BALLOON_ANDROID_GUEST_MIN_FREE_HIDDEN_APP_PAGES = 5120,
   BALLOON_ANDROID_GUEST_MIN_FREE_CONTENT_PROVIDER_PAGES = 5632,
   BALLOON_ANDROID_GUEST_MIN_FREE_EMPTY_APP_MEM_PAGES = 6144
} Balloon_AndroidGuestMinFreePages;

/* @} */

/**
 * @brief Distance (in pages) to the point where Android starts terminating
 *        processes.
 *
 * The balloon policy keeps the low memory killer's minfree measure
 * (max(freePages, filePages)) above the empty-app kill threshold.  The empty
 * app threshold (rather than the home app one) is used so that free/file
 * ratio differences between the two worlds do not bias the policy's
 * errorBackground term, and so that each world keeps a kill buffer of up to
 * 8192 pages once the balloon stops adjusting.
 *
 * File pages are assumed to shrink only when free pages are nearly exhausted
 * (vmscan reclaim), giving two cases:
 *
 *   - filePages >= emptyAppPages: freePages + filePages - emptyAppPages
 *   - filePages <  emptyAppPages: MAX(0, freePages - emptyAppPages)
 *
 * @param freePages     number of free pages.
 * @param filePages     number of pages in the page cache.
 * @param emptyAppPages number of free/file pages at which the
 *                      lowmemorykiller will start killing empty apps.
 *
 * @return Low memory distance measure (in pages).
 */
static inline uint32
Balloon_LowMemDistance(uint32 freePages, uint32 filePages, uint32 emptyAppPages)
{
   if (filePages >= emptyAppPages) {
      /* Page cache alone covers the threshold; every free page is buffer. */
      return freePages + (filePages - emptyAppPages);
   }

   if (freePages > emptyAppPages) {
      /* Cache is below the threshold; count only the free-page excess. */
      return freePages - emptyAppPages;
   }

   return 0;
}
+ */ +static uint32 +Balloon_AndroidBackgroundPages(uint32 minHiddenAppOOMAdj) +{ + uint32 backgroundPages = 0, nonBackgroundPages = 0; + struct task_struct *t; + + /* + * Traverse the tasklist to replicate the behavior of the Android low memory + * killer. + */ + rcu_read_lock(); + + for_each_process(t) { + int oom_adj = 0; + + task_lock(t); + + if (t->signal == NULL) { + task_unlock(t); + continue; + } else { + oom_adj = t->signal->oom_adj; + } + + if (t->mm != NULL) { +#ifdef BALLOON_DEBUG_PRINT_ANDROID_PAGES + printk("Balloon_AndroidBackgroundPages: %d %d %s\n", + oom_adj, + (int)get_mm_counter(t->mm, MM_ANONPAGES), + t->comm); +#endif + + if (oom_adj >= (int)minHiddenAppOOMAdj) { + /* + * Unlike the Android low memory killer, we only consider anonymous + * memory here, since we already account for file pages in the + * balloon policy using global_page_state(NR_FILE_PAGES). + */ + backgroundPages += get_mm_counter(t->mm, MM_ANONPAGES); + } else { + nonBackgroundPages += get_mm_counter(t->mm, MM_ANONPAGES); + } + } + + task_unlock(t); + } + + rcu_read_unlock(); + +#ifdef BALLOON_DEBUG_PRINT_ANDROID_PAGES + printk("Balloon_AndroidBackgroundPages: non-background pages: %d " + "background pages: %d\n", + nonBackgroundPages, + backgroundPages); +#endif + + return backgroundPages; +} +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/mvp_compiler.h b/arch/arm/mvp/mvpkm/mvp_compiler.h new file mode 100644 index 0000000..58825a0 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_compiler.h @@ -0,0 +1,56 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Compiler-related definitions and directives. + */ + +#ifndef _MVP_COMPILER_H_ +#define _MVP_COMPILER_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#ifdef __GNUC__ +#include "mvp_compiler_gcc.h" +#else /* __GNUC__ */ +#include "mvp_compiler_other.h" +#endif /* __GNUC__ */ + +/** + * @brief Find last set bit. + * + * @param n unsigned 32-bit integer. + * + * @return 0 if n == 0 otherwise 32 - the number of leading zeroes in n. + */ +#define FLS(n) (32 - CLZ(n)) + +#endif /// ifndef _MVP_COMPILER_H_ diff --git a/arch/arm/mvp/mvpkm/mvp_compiler_gcc.h b/arch/arm/mvp/mvpkm/mvp_compiler_gcc.h new file mode 100644 index 0000000..ab35ebd --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_compiler_gcc.h @@ -0,0 +1,87 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief common definitions for GCC + */ + +#ifndef _MVP_COMPILER_GCC_H +#define _MVP_COMPILER_GCC_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @brief Count leading zeroes. + * + * @param n unsigned 32-bit integer. + * + * @return 32 if n == 0 otherwise 31 - the bit position of the most significant 1 + * in n. + */ +#ifdef __COVERITY__ +static inline int +CLZ(unsigned int n) +{ + unsigned int r = 0; + + while (n) { + r++; + n >>= 1; + } + + return 32 - r; +} +#else +#define CLZ(n) __builtin_clz(n) +#endif + +#define PACKED __attribute__ ((packed)) +#define ALLOC __attribute__ ((malloc, warn_unused_result)) +#define UNUSED __attribute__ ((unused)) +#define PURE __attribute__ ((pure)) +#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result)) +#define FORMAT(x,y,z) __attribute__ ((format(x,y,z))) +#define LIKELY(x) __builtin_expect(!!(x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) + +/* + * For debug builds, we want to omit __attribute__((noreturn)) so that gcc will + * keep stack linkages and then we will have useful core dumps. For non-debug + * builds, we don't care about the stack frames and want the little bit of + * optimization that noreturn gives us. 
/*
 * For debug builds, we want to omit __attribute__((noreturn)) so that gcc will
 * keep stack linkages and then we will have useful core dumps.  For non-debug
 * builds, we don't care about the stack frames and want the little bit of
 * optimization that noreturn gives us.
 */
#if defined(__COVERITY__) || !defined(MVP_DEBUG)
#define NORETURN __attribute__((noreturn))
#else
#define NORETURN
#endif

#endif
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Math library.
 */

#ifndef _MVP_MATH_H_
#define _MVP_MATH_H_

#define INCLUDE_ALLOW_VMX
#define INCLUDE_ALLOW_PV
#define INCLUDE_ALLOW_MODULE
#define INCLUDE_ALLOW_MONITOR
#define INCLUDE_ALLOW_GPL
#define INCLUDE_ALLOW_HOSTUSER
#include "include_check.h"

#include "mvp_compiler_gcc.h"

/**
 * @brief Compute floor log2 of a given 32-bit unsigned integer.
 *
 * The explicit bit-test ladder lets compile-time constants fold to a
 * constant result; runtime values fall through to the CLZ branch
 * (CLZ(1) - CLZ(n) == 31 - CLZ(n)).
 *
 * @param n 32-bit unsigned integer, n > 0 (n == 0 yields 0xffffffff on the
 *          constant path and is undefined on the CLZ path).
 *
 * @return floor(log2(n)).
 */
#define LOG2(n) \
( \
   __builtin_constant_p(n) ? ( \
      (n) & (1UL << 31) ? 31 : \
      (n) & (1UL << 30) ? 30 : \
      (n) & (1UL << 29) ? 29 : \
      (n) & (1UL << 28) ? 28 : \
      (n) & (1UL << 27) ? 27 : \
      (n) & (1UL << 26) ? 26 : \
      (n) & (1UL << 25) ? 25 : \
      (n) & (1UL << 24) ? 24 : \
      (n) & (1UL << 23) ? 23 : \
      (n) & (1UL << 22) ? 22 : \
      (n) & (1UL << 21) ? 21 : \
      (n) & (1UL << 20) ? 20 : \
      (n) & (1UL << 19) ? 19 : \
      (n) & (1UL << 18) ? 18 : \
      (n) & (1UL << 17) ? 17 : \
      (n) & (1UL << 16) ? 16 : \
      (n) & (1UL << 15) ? 15 : \
      (n) & (1UL << 14) ? 14 : \
      (n) & (1UL << 13) ? 13 : \
      (n) & (1UL << 12) ? 12 : \
      (n) & (1UL << 11) ? 11 : \
      (n) & (1UL << 10) ? 10 : \
      (n) & (1UL << 9) ? 9 : \
      (n) & (1UL << 8) ? 8 : \
      (n) & (1UL << 7) ? 7 : \
      (n) & (1UL << 6) ? 6 : \
      (n) & (1UL << 5) ? 5 : \
      (n) & (1UL << 4) ? 4 : \
      (n) & (1UL << 3) ? 3 : \
      (n) & (1UL << 2) ? 2 : \
      (n) & (1UL << 1) ? 1 : \
      (n) & (1UL << 0) ? 0 : \
      0xffffffff \
   ) : (uint32)(CLZ(1) - CLZ(n)) \
)

/**
 * @brief Multiplicative hash function for 32-bit key and p-bit range.  See p229
 *        Introduction to Algorithms, Cormen, Leiserson and Rivest, 1996.
 *
 * 2654435769 is the golden-ratio multiplier floor(2^32 / phi) from the cited
 * text.
 *
 * NOTE(review): p == 0 would shift by 32, which is undefined in C — callers
 * appear to pass 1 <= p <= 32; confirm.
 *
 * @param key 32-bit key.
 * @param p range order, <= 32.
 *
 * @return hash value in range [0..2^p)
 */
static inline uint32
Math_MultiplicativeHash(uint32 key, uint32 p)
{
   return (key * 2654435769UL) >> (32 - p);
}

/**
 * @brief Compute ceiling log2 of a given 32-bit unsigned integer.
 *
 * @param n 32-bit unsigned integer, n > 0.
 *
 * @return ceiling(log2(n)).
+ */ +static inline uint32 CLOG2(uint32 n) +{ + return LOG2(n) + ((n & -n) != n); +} + + +/** + * @brief djb2 String hashing function by Dan Bernstein, see + * http://www.cse.yorku.ca/~oz/hash.html + * @param str String to hash + * @return 32-bit hash value + */ +static inline +uint32 Math_Djb2Hash(uint8 *str) +{ + uint32 hash = 5381; + int32 c; + + while ((c = *str++)) { + hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + } + + return hash; +} + +#endif // ifndef _MVP_MATH_H_ diff --git a/arch/arm/mvp/mvpkm/mvp_timer.h b/arch/arm/mvp/mvpkm/mvp_timer.h new file mode 100644 index 0000000..0bd073a --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_timer.h @@ -0,0 +1,72 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief timer definitions + */ + +#ifndef _MVP_TIMER_H +#define _MVP_TIMER_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @brief timer tick rate as returned by MVPTimer_Now64 as a uint64 and used by + * MVPTimer.when64. + * + * For example 1,000,000 means the counter is in microseconds. 
+ * + * Current implementation requires MVP_TIMER_RATE64 <= 1,000,000,000 and that + * it evenly divide 1,000,000,000. Currently 1,000,000,000 to avoid a multiply + * or divide in MVPTimer_Now64. + */ +#define MVP_TIMER_RATE64 1000000000 + +/* + * Extract current UNIX-style time_t date/time from the 64-bit time as returned + * by MVPTimer_Now64(). + */ +#define MVP_TIMER_RATE64_TIME_T(time64) ((time_t)((time64) / MVP_TIMER_RATE64)) + +typedef struct MVPTimer MVPTimer; + +/** + * @brief timer entry struct + */ +struct MVPTimer { + MVPTimer *next; ///< next in timers list + uint64 when64; ///< absolute expiration + void (*entry)(uint64 now64, MVPTimer *timer); ///< callback entrypoint + void *param; ///< callback parameter +}; + +void MVPTimer_InitVMX(void); +uint64 MVPTimer_Now64(void); +void MVPTimer_Start(MVPTimer *timer); +_Bool MVPTimer_Cancel(MVPTimer *timer); + +#endif diff --git a/arch/arm/mvp/mvpkm/mvp_types.h b/arch/arm/mvp/mvpkm/mvp_types.h new file mode 100644 index 0000000..035efd7 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_types.h @@ -0,0 +1,94 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief basic type definitions. 
+ * These may need to be conditionalized for different compilers/platforms. + */ + +#ifndef _MVPTYPES_H +#define _MVPTYPES_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; +typedef unsigned long long uint64; + +typedef signed char int8; +typedef short int16; +typedef int int32; +typedef long long int64; + +typedef uint32 CVA; // whatever we are compiling the code as +typedef uint32 GVA; // guest virtual addresses +typedef uint32 MVA; // monitor virtual addresses +typedef uint32 HKVA; // host kernel virtual addresses +typedef uint32 HUVA; // host user virtual addresses +typedef uint64 PA; // (guest) physical addresses (40-bit) +typedef uint32 MA; // (host) machine addresses + +typedef uint32 PPN; // PA/PAGE_SIZE +typedef uint32 MPN; // MA/PAGE_SIZE + +typedef uint64 cycle_t; + +/** + * @brief Page segment. + * + * Specifies a segment within a single page. 
+ */ +typedef struct { + uint16 off; + uint16 len; +} PageSeg; + +/* + * GCC's argument checking for printf-like functions + * + * fmtPos is the position of the format string argument, beginning at 1 + * varPos is the position of the variable argument, beginning at 1 + */ + +#if defined(__GNUC__) +# define PRINTF_DECL(fmtPos, varPos) __attribute__((__format__(__printf__, fmtPos, varPos))) +#else +# define PRINTF_DECL(fmtPos, varPos) +#endif + +#if defined(__GNUC__) +# define SCANF_DECL(fmtPos, varPos) __attribute__((__format__(__scanf__, fmtPos, varPos))) +#else +# define SCANF_DECL(fmtPos, varPos) +#endif + +#endif /* _MVPTYPES_H */ diff --git a/arch/arm/mvp/mvpkm/mvp_version.h b/arch/arm/mvp/mvpkm/mvp_version.h new file mode 100644 index 0000000..31274dd --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvp_version.h @@ -0,0 +1,116 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief What version is this? 
+ * + */ + +#ifndef _MVP_VERSION_H_ +#define _MVP_VERSION_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#define INCLUDE_ALLOW_HOSTUSER +#include "include_check.h" +#include "utils.h" + +/* + * MVP Internal Version Numbering + * + * + * There are 4 different usage areas of version information. + * + * Version Name. This is a marketing string that is used to sell the + * product. The update of this string has legal consequences, it + * should be done infrequently. Currently we use "V1.0" like + * terms. Developer builds have E.X.P as Version Name. + * + * Android Version Code. This is an integer associated with + * com.vmware.mvp.apk on Google Play (a.k.a Android Market). If our + * product is multi-apk (that is, we release individual apks for the + * different Android versions) then the Android Version Code must + * satisfy certain constraints. Typically the Android API level is + * the high order 2 digits. + * + * Engineering Version Code. During an update process of one of the + * 3 components on the handset (MVP, VVP, OEK) compatibility needs + * to be verified. The Engineering Version Code is a single number + * associated with each of the 4 components and it serves as a basis + * of this compatibility test. It reflects time, bigger number is + * associated with newer code. + * + * Git Revision. The git hash is a unique identifier of the + * source. If picked up from a log, engineers can go to the code + * depots and check out the exact code used for the build. For MVP, + * VVP, and OEK this is the main/mvp.git, for HMM it is + * main/mdm.git. Note that git hash is not ordered, it cannot be + * used to directly determine precedence. 
+ * + */ + +#define MVP_VERSION_CODE 16800005 +#define MVP_VERSION_CODE_FORMATSTR "%s_%d" +#define MVP_VERSION_CODE_FORMATARGSV(V_) MVP_STRINGIFY(1.1.3), (V_) +#define MVP_VERSION_CODE_FORMATARGS \ + MVP_VERSION_CODE_FORMATARGSV(MVP_VERSION_CODE) + +#define MVP_VERSION_FORMATSTR \ + MVP_VERSION_CODE_FORMATSTR \ + " compiled at %s based on revision %s by user %s." + +#define MVP_VERSION_FORMATARGS \ + MVP_VERSION_CODE_FORMATARGS, \ + __DATE__, \ + MVP_STRINGIFY(5c995a85564cd060562bdbcd1422709e7a326301), \ + MVP_STRINGIFY() + +#define MvpVersion_Map(map_, version_) \ + ({ \ + uint32 ii_; \ + uint32 versionApi_ = 0; \ + for (ii_ = 0; ii_ < NELEM(map_); ii_++) { \ + if (map_[ii_] <= version_) { \ + versionApi_ = map_[ii_]; \ + } \ + } \ + versionApi_; \ + }) + +/* + * MVP.apk must communicate to VVP and OEK on many of its APIs. To + * ensure compatibility, it is mandated that any VVP and OEK version + * younger than the minimums defined below can be serviced on all of + * the various APIs. + * + * During the deprecation process, first a marketing decision is made + * that the limit below can be raised. After the new minimums are + * determined, they must be entered here. Then the various APIs can + * remove code that has been obsoleted before the new minimum versions. + */ +#define VVP_VERSION_CODE_MIN 0x0100020e +#define OEK_VERSION_CODE_MIN 0x01000001 + +#endif /* _MVP_VERSION_H_ */ diff --git a/arch/arm/mvp/mvpkm/mvpkm_comm_ev.c b/arch/arm/mvp/mvpkm/mvpkm_comm_ev.c new file mode 100644 index 0000000..cb0ce26 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_comm_ev.c @@ -0,0 +1,60 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief mvpkm kernel hooks for Comm event signaling + */ + +#include +#include "comm_transp_impl.h" + +int (*CommTranspEvProcess)(CommTranspID* id, CommTranspIOEvent event); + + +/** + * @brief Register a processing callback for the host when a signal + * is received from the guest. Supports only a single comm "service" + * on the host. + * @param commProcessFunc function pointer to process a signal + */ + +void +Mvpkm_CommEvRegisterProcessCB(int (*commProcessFunc)(CommTranspID*, + CommTranspIOEvent)) +{ + CommTranspEvProcess = commProcessFunc; +} + +/** + * @brief Unregister the processing callback for the host when a signal + * is received from the guest. + */ + +void +Mvpkm_CommEvUnregisterProcessCB(void) +{ + CommTranspEvProcess = NULL; +} + + +EXPORT_SYMBOL(Mvpkm_CommEvRegisterProcessCB); +EXPORT_SYMBOL(Mvpkm_CommEvUnregisterProcessCB); diff --git a/arch/arm/mvp/mvpkm/mvpkm_comm_ev.h b/arch/arm/mvp/mvpkm/mvpkm_comm_ev.h new file mode 100644 index 0000000..2e3c960 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_comm_ev.h @@ -0,0 +1,53 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief mvpkm kernel hooks for comm event signaling + */ + +#ifndef _MVPKM_COMM_EV_H +#define _MVPKM_COMM_EV_H + +extern int (*CommTranspEvProcess)(CommTranspID* id, CommTranspIOEvent event); + +/** + * @brief Forward any guest signal requests to the commkm module + * @param id transport channel id + * @param event comm event type + */ + +static inline void +Mvpkm_CommEvSignal(CommTranspID *id, CommTranspIOEvent event) +{ + if (CommTranspEvProcess) { + CommTranspEvProcess(id, event); + } +} + +void +Mvpkm_CommEvRegisterProcessCB(int (*commProcessFunc)(CommTranspID*, + CommTranspIOEvent)); +void Mvpkm_CommEvUnregisterProcessCB(void); + + + +#endif diff --git a/arch/arm/mvp/mvpkm/mvpkm_kernel.h b/arch/arm/mvp/mvpkm/mvpkm_kernel.h new file mode 100644 index 0000000..19ba6ce --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_kernel.h @@ -0,0 +1,83 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +#ifndef _MVPKM_KERNEL_H +#define _MVPKM_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include +#include +#include + +#ifdef CONFIG_HAS_WAKELOCK +#include +#endif + +#include "atomic.h" +#include "montimer_kernel.h" +#include "worldswitch.h" + +/** + * @file + * + * @brief The kernel level driver. + */ + +struct MvpkmVM { + struct kobject kobj; ///< used to hook into sysfs + struct kset *devicesKSet; ///< kset to list virtual device entries + struct kset *miscKSet; ///< kset to list miscellaneous entries + _Bool haveKObj; ///< used to properly release instance + struct rb_root lockedRoot; ///< locked page RB tree root + struct rw_semaphore lockedSem; ///< linked list rw semaphore + AtmUInt32 usedPages; ///< number of MEMREGION_MAINMEM pages + _Bool isMonitorInited; ///< Has SetupMonitor been called already? + WorldSwitchPage *wsp; ///< worldswitch page + wait_queue_head_t wfiWaitQ; ///< guest VCPU is waiting-for-interrupt + struct rw_semaphore wspSem; /*< prevents entries the WFI + wait Q from disappearing + underneath us in + MvpkmShrink. 
*/ + MonTimer monTimer; /*< monitor timers, there + should be one of these + per VCPU */ + MPN stubPageMPN; /*< stub page to be used for + unmappable pages */ + struct vm_struct *wspHkvaArea; ///< VM area struct for wspHkvaArea + HKVA wspHKVADummyPage;///< Dummy page used for backing wspHkvaArea +#ifdef CONFIG_HAS_WAKELOCK + struct wake_lock wakeLock; ///< guest running wake lock +#endif + struct rw_semaphore monThreadTaskSem;/*< prevents monThreadTask from + disappearing underneath us */ + struct task_struct *monThreadTask; + struct timer_list balloonWDTimer; /// Balloon watchdog timer + _Bool balloonWDEnabled; /// Balloon watchdog enabled? +}; + +typedef struct MvpkmVM MvpkmVM; + +void Mvpkm_WakeGuest(MvpkmVM *vm, int why); +struct kset *Mvpkm_FindVMNamedKSet(int vmID, const char *name); + +#endif diff --git a/arch/arm/mvp/mvpkm/mvpkm_main.c b/arch/arm/mvp/mvpkm/mvpkm_main.c new file mode 100644 index 0000000..61df1a1 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_main.c @@ -0,0 +1,2690 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief The kernel level driver. 
+ */ + +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HAS_WAKELOCK +#include +#endif + +#include + +#include +#include +#include +#include +#include + +#include "mvp.h" +#include "mvp_version.h" +#include "mvpkm_types.h" +#include "mvpkm_private.h" +#include "mvpkm_kernel.h" +#include "actions.h" +#include "wscalls.h" +#include "arm_inline.h" +#include "tsc.h" +#include "mksck_kernel.h" +#include "mmu_types.h" +#include "mvp_timer.h" +#include "qp.h" +#include "qp_host_kernel.h" +#include "cpufreq_kernel.h" +#include "mvpkm_comm_ev.h" +#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER +#include "mvp_balloon.h" +#endif + + +/********************************************************************* + * + * Definition of the file operations + * + *********************************************************************/ +static _Bool LockedListAdd(MvpkmVM *vm, + __u32 mpn, + __u32 order, + PhysMem_RegionType forRegion); +static _Bool LockedListDel(MvpkmVM *vm, __u32 mpn); +static void LockedListUnlockAll(MvpkmVM *vm); +static _Bool LockedListLookup(MvpkmVM *vm, __u32 mpn); +static int SetupMonitor(MvpkmVM *vm); +static int RunMonitor(MvpkmVM *vm); +static MPN AllocZeroedFreePages(MvpkmVM *vm, + uint32 order, + _Bool highmem, + PhysMem_RegionType forRegion, + HKVA *hkvaRet); +static HKVA MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo); +static void UnmapWSPHKVA(MvpkmVM *vm); +static int MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend); +static void ReleaseVM(MvpkmVM *vm); + +/* + * Mksck open request must come from this uid. It must be root until + * it is set via an ioctl from mvpd. 
+ */ +uid_t Mvpkm_vmwareUid = 0; +EXPORT_SYMBOL(Mvpkm_vmwareUid); + +/* + * Minimum hidden app oom_adj, provided by mvpd, since we can't get it directly + * from the lowmemorykiller module. + */ +static int minHiddenAppOOMAdj; + +/* + * vCPU cpu affinity to let monitor/guest run on some CPUs only (when possible) + */ +static DECLARE_BITMAP(vcpuAffinity, NR_CPUS); + +/********************************************************************* + * + * Sysfs nodes + * + *********************************************************************/ +/* + * kobject for our sysfs representation, used for global nodes. + */ +static struct kobject *mvpkmKObj; + +/* + * kobject for the balloon exports. + */ +static struct kobject *balloonKObj; + +/** + * @brief sysfs show function for global version attribute. + * + * @param kobj reference to kobj nested in MvpkmVM struct. + * @param attr kobj_attribute reference, not used. + * @param buf PAGE_SIZEd buffer to write to. + * + * @return number of characters printed (not including trailing null character). + */ +static ssize_t +version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS); +} + +static struct kobj_attribute versionAttr = __ATTR_RO(version); + +/** + * @brief sysfs show function for global background_pages attribute. + * + * Used by vmx balloon policy controller to gauge the amount of freeable + * anonymous memory. + * + * @param kobj reference to kobj nested in MvpkmVM struct. + * @param attr kobj_attribute reference, not used. + * @param buf PAGE_SIZEd buffer to write to. + * + * @return number of characters printed (not including trailing null character). 
+ */ +static ssize_t +background_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ +#ifndef CONFIG_ANDROID_LOW_MEMORY_KILLER + return snprintf(buf, PAGE_SIZE, "0\n"); +#else + return snprintf(buf, PAGE_SIZE, "%d\n", Balloon_AndroidBackgroundPages(minHiddenAppOOMAdj)); +#endif +} + +static struct kobj_attribute backgroundAttr = __ATTR_RO(background); + +/** + * @brief sysfs show function to export the other_file calculation in + * lowmemorykiller. + * + * It's helpful, in the balloon controller, to know what the lowmemorykiller + * module is using to know when the system has crossed a minfree threshold. + * Since there exists a number of different other_file calculations in various + * lowmemorykiller patches (@see{MVP-1674}), and the module itself doesn't + * provide a clean export of this figure, we provide it on a case-by-case basis + * for the various supported hosts here. + * + * @param kobj reference to kobj nested in MvpkmVM struct. + * @param attr kobj_attribute reference, not used. + * @param buf PAGE_SIZEd buffer to write to. + * + * @return number of characters printed (not including trailing null character). + */ +static ssize_t +other_file_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + int32 other_file = 0; + +#ifndef LOWMEMKILLER_VARIANT +#define LOWMEMKILLER_VARIANT 0 +#endif + +#ifndef LOWMEMKILLER_MD5 +#define LOWMEMKILLER_MD5 0 +#endif + +#ifndef LOWMEMKILLER_SHRINK_MD5 +#define LOWMEMKILLER_SHRINK_MD5 0 +#endif + + /* + * The build system hashes the lowmemorykiller section related to the + * other_file calculation in the kernel source for us, here we have to + * provide the code. + */ +#if LOWMEMKILLER_VARIANT == 1 + /* + * This is the same as the non-exported global_reclaimable_pages() when there + * is no swap. 
+ */ + other_file = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); +#elif LOWMEMKILLER_VARIANT == 2 + other_file = global_page_state(NR_FILE_PAGES); +#elif LOWMEMKILLER_VARIANT == 3 + other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM); +#elif LOWMEMKILLER_VARIANT == 4 + /* + * Here free/file pages are fungible and max(free, file) isn't used, but we + * can continue to use max(free, file) since max(free, file) = other_file in + * this case. + */ + other_file = global_page_state(NR_FREE_PAGES) + global_page_state(NR_FILE_PAGES); +#elif defined(NONANDROID) + /* + * Non-Android host platforms don't have ballooning enabled. + */ +#else + /* + * If you get this message, you need to run 'make lowmem-info' and inspect + * lowmemorykiller.c. If the "other_file = ..." calculation in lowmem_shrink + * appears above, simply add the "Shrink#" to an existing entry in + * lowmemkiller-variant.sh, pointing to the variant number above. Otherwise, + * provide a new entry above and variant number, with the appropriate + * other_file calculation and update lowmemkiller-variant.sh accordingly. + */ +//#warning "Unknown lowmemorykiller variant in hosted/module/mvpkm_main.c, falling back on default (see other_file_show for the remedy)" + /* + * Fall back on default - this may bias strangely for/against the host, but + * nothing catastrophic should result. + */ + other_file = global_page_state(NR_FILE_PAGES); +#endif + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + return snprintf(buf, + PAGE_SIZE, + "%d %d %s %s\n", + other_file, + LOWMEMKILLER_VARIANT, + STRINGIFY(LOWMEMKILLER_MD5), + STRINGIFY(LOWMEMKILLER_SHRINK_MD5)); +#undef _STRINGIFY +#undef STRINGIFY +} + +static struct kobj_attribute otherFileAttr = __ATTR_RO(other_file); + +/* + * kset for our sysfs representation, used for per-VM nodes. 
+ */ +static struct kset *mvpkmKSet; + +static ssize_t MvpkmAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf); +static ssize_t MvpkmAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + +static void MvpkmKObjRelease(struct kobject *kobj) + __attribute__ ((optimize ("-fomit-frame-pointer"))); + + +/** + * @brief Releases the vm structure containing the kobject. + * + * @param kobj the vm's kobject. + */ + +static void +MvpkmKObjRelease(struct kobject *kobj) +{ + MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); + + ReleaseVM(vm); + + module_put(THIS_MODULE); +} + + +/** + * @name mvpkm ktype attribute structures for locked_pages. + * + * @{ + */ +static struct sysfs_ops mvpkmSysfsOps = { + .show = MvpkmAttrShow, + .store = MvpkmAttrStore +}; + +static struct attribute mvpkmLockedPagesAttr = { + .name = "locked_pages", + .mode = 0444, +}; + +static struct attribute mvpkmBalloonWatchdogAttr = { + .name = "balloon_watchdog", + .mode = 0666 +}; + +static struct attribute mvpkmMonitorAttr = { + .name = "monitor", + .mode = 0400, +}; + +static struct attribute *mvpkmDefaultAttrs[] = { + &mvpkmLockedPagesAttr, + &mvpkmBalloonWatchdogAttr, + &mvpkmMonitorAttr, + NULL, +}; + +static struct kobj_type mvpkmKType = { + .sysfs_ops = &mvpkmSysfsOps, + .release = MvpkmKObjRelease, + .default_attrs = mvpkmDefaultAttrs, +}; +/*@}*/ + +/* + * As it is not very common for host kernels to have SYS_HYPERVISOR enabled and + * you have to "hack" a Kconfig file to enable it, just include the + * functionality inline if it is not enabled. + */ +#ifndef CONFIG_SYS_HYPERVISOR +struct kobject *hypervisor_kobj; +EXPORT_SYMBOL_GPL(hypervisor_kobj); +#endif + + +/* + * kobject and kset utilities. + */ + +extern struct kobject *kset_find_obj(struct kset *, const char *) + __attribute__((weak)); + + +/** + * @brief Finds a kobject in a kset. The actual implementation is copied from + * kernel source in lib/kobject.c. 
Although the symbol is extern-declared, + * it is not EXPORT_SYMBOL-ed. We use a weak reference in case the symbol + * might be exported in future kernel versions. + * + * @param kset set to search. + * @param name object name. + * + * @return retained kobject if found, NULL otherwise. + */ + +struct kobject * +kset_find_obj(struct kset *kset, + const char *name) +{ + struct kobject *k; + struct kobject *ret = NULL; + + spin_lock(&kset->list_lock); + list_for_each_entry(k, &kset->list, entry) { + if (kobject_name(k) && !strcmp(kobject_name(k), name)) { + ret = kobject_get(k); + break; + } + } + spin_unlock(&kset->list_lock); + return ret; +} + + +/** + * @brief Finds one of the VM's pre-defined ksets. + * + * @param vmID a VM ID. + * @param name name of one of the VM's pre-defined ksets. + * + * @return retained kset if found, NULL otherwise. + */ + +struct kset * +Mvpkm_FindVMNamedKSet(int vmID, + const char *name) +{ + MvpkmVM *vm; + struct kobject *kobj; + char vmName[32] = {}; /* Large enough to hold externally-formatted int32. */ + struct kset *res = NULL; + + if (!mvpkmKSet) { + return NULL; + } + + snprintf(vmName, sizeof vmName, "%d", vmID); + vmName[sizeof vmName - 1] = '\0'; /* Always null-terminate, no overflow. 
*/ + + kobj = kset_find_obj(mvpkmKSet, vmName); + if (!kobj) { + return NULL; + } + + vm = container_of(kobj, MvpkmVM, kobj); + + if (!strcmp(name, "devices")) { + res = kset_get(vm->devicesKSet); + } else if (!strcmp(name, "misc")) { + res = kset_get(vm->miscKSet); + } + + kobject_put(kobj); + return res; +} + +EXPORT_SYMBOL(Mvpkm_FindVMNamedKSet); + + + +/********************************************************************* + * + * Standard Linux miscellaneous device registration + * + *********************************************************************/ + +MODULE_LICENSE("GPL"); // for kallsyms_lookup_name + +static int MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf); + + +/** + * @brief Linux vma operations for /dev/mem-like kernel module mmap. We + * enforce the restriction that only MPNs that have been allocated + * to the opened VM may be mapped and also increment the reference + * count (via vm_insert_page), so that even if the memory is later + * freed by the VM, host process vma's containing the MPN can't + * compromise the system. + * + * However, only trusted host processes (e.g. the vmx) should be allowed + * to use this interface, since you can mmap the monitor's code/data/ + * page tables etc. with it. Untrusted host processes are limited to + * typed messages for sharing memory with the monitor. Unix file system + * access permissions are the intended method of restricting access. + * Unfortunately, today _any_ host process utilizing Mksck requires + * access to mvpkm to setup its Mksck pages and obtain socket info via + * ioctls - we probably should be exporting two devices, one for trusted + * and one for arbitrary host processes to avoid this confusion of + * concerns. + */ +static struct vm_operations_struct mvpkmVMOps = { + .fault = MvpkmFault +}; + +/* + * Generic kernel module file ops. These functions will be registered + * at the time the kernel module is loaded. 
+ */ +static long MvpkmUnlockedIoctl(struct file *filep, + unsigned int cmd, + unsigned long arg); +static int MvpkmOpen(struct inode *inode, struct file *filp); +static int MvpkmRelease(struct inode *inode, struct file *filp); +static int MvpkmMMap(struct file *file, struct vm_area_struct *vma); + +/** + * @brief the file_operation structure contains the callback functions + * that are registered with Linux to handle file operations on + * the mvpkm device. + * + * The structure contains other members that the mvpkm device + * does not use. Those members are auto-initialized to NULL. + * + * WARNING, this structure has changed after Linux kernel 2.6.19: + * readv/writev are changed to aio_read/aio_write (neither is used here). + */ +static const struct file_operations mvpkmFileOps = { + .owner = THIS_MODULE, + .unlocked_ioctl = MvpkmUnlockedIoctl, + .open = MvpkmOpen, + .release = MvpkmRelease, + .mmap = MvpkmMMap +}; + +/** + * @brief The mvpkm device identifying information to be used to register + * the device with the Linux kernel. + */ +static struct miscdevice mvpkmDev = { + .minor = 165, + .name = "mvpkm", + .fops = &mvpkmFileOps +}; + +/** + * Mvpkm is loaded by mvpd and only mvpd will be allowed to open + * it. There is a very simple way to verify that: record the process + * id (thread group id) at the time the module is loaded and test it + * at the time the module is opened. + */ +static struct pid *initTgid; + + +#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER +/** + * @name Slab shrinker for triggering balloon adjustment. + * + * @note shrinker is used as a trigger for guest balloon. 
+ * + * @{ + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) +static int MvpkmShrink(struct shrinker *this, struct shrink_control *sc); +#else +static int MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask); +#endif + +static struct shrinker mvpkmShrinker = { + .shrink = MvpkmShrink, + .seeks = DEFAULT_SEEKS +}; +/*@}*/ +#endif + +module_param_array(vcpuAffinity, ulong, NULL, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(vcpuAffinity, "vCPU affinity"); + + +/** + * @brief Initialize the mvpkm device, register it with the Linux kernel. + * + * @return A zero is returned on success and a negative errno code for failure. + * (Same as the return policy of misc_register(9).) + */ + +static int __init +MvpkmInit(void) +{ + int err = 0; + _Bool mksckInited = false; + _Bool cpuFreqInited = false; + + printk(KERN_INFO "Mvpkm: " MVP_VERSION_FORMATSTR "\n", MVP_VERSION_FORMATARGS); + printk(KERN_INFO "Mvpkm: loaded from process %s tgid=%d, pid=%d\n", + current->comm, + task_tgid_vnr(current), + task_pid_vnr(current)); + + if (bitmap_empty(vcpuAffinity, NR_CPUS)) { + bitmap_copy(vcpuAffinity, cpumask_bits(cpu_possible_mask), NR_CPUS); + } + + if ((err = misc_register(&mvpkmDev))) { + return -ENOENT; + } + + if ((err = Mksck_Init())) { + goto error; + } else { + mksckInited = true; + } + + QP_HostInit(); + + CpuFreq_Init(); + cpuFreqInited = true; + + /* + * Reference mvpd (module loader) tgid struct, so that we can avoid + * attacks based on pid number wraparound. 
+ */ + initTgid = get_pid(task_tgid(current)); + +#ifndef CONFIG_SYS_HYPERVISOR + hypervisor_kobj = kobject_create_and_add("hypervisor", NULL); + if (!hypervisor_kobj) { + err = -ENOMEM; + goto error; + } +#endif + + if (!(mvpkmKObj = kobject_create_and_add("mvp", hypervisor_kobj)) || + !(balloonKObj = kobject_create_and_add("lowmem", mvpkmKObj)) || + !(mvpkmKSet = kset_create_and_add("vm", NULL, mvpkmKObj))) { + err = -ENOMEM; + goto error; + } + + if ((err = sysfs_create_file(mvpkmKObj, &versionAttr.attr))) { + goto error; + } + + if ((err = sysfs_create_file(balloonKObj, &backgroundAttr.attr))) { + goto error; + } + + if ((err = sysfs_create_file(balloonKObj, &otherFileAttr.attr))) { + goto error; + } + +#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER + register_shrinker(&mvpkmShrinker); +#endif + + MksckPageInfo_Init(); + + return 0; + +error: + if (mvpkmKSet) { + kset_unregister(mvpkmKSet); + } + + if (balloonKObj) { + kobject_del(balloonKObj); + kobject_put(balloonKObj); + } + + if (mvpkmKObj) { + kobject_del(mvpkmKObj); + kobject_put(mvpkmKObj); + } + +#ifndef CONFIG_SYS_HYPERVISOR + if (hypervisor_kobj) { + kobject_del(hypervisor_kobj); + kobject_put(hypervisor_kobj); + } +#endif + + if (cpuFreqInited) { + CpuFreq_Exit(); + } + + if (mksckInited) { + Mksck_Exit(); + } + + if (initTgid) { + put_pid(initTgid); + } + + misc_deregister(&mvpkmDev); + return err; +} + +/** + * @brief De-register the mvpkm device with the Linux kernel. 
+ */ +void +MvpkmExit(void) +{ + PRINTK(KERN_INFO "MvpkmExit called !\n"); + + MksckPageInfo_Exit(); + +#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER + unregister_shrinker(&mvpkmShrinker); +#endif + + kset_unregister(mvpkmKSet); + kobject_del(balloonKObj); + kobject_put(balloonKObj); + kobject_del(mvpkmKObj); + kobject_put(mvpkmKObj); +#ifndef CONFIG_SYS_HYPERVISOR + kobject_del(hypervisor_kobj); + kobject_put(hypervisor_kobj); +#endif + + CpuFreq_Exit(); + + Mksck_Exit(); + + put_pid(initTgid); + + misc_deregister(&mvpkmDev); +} + +/* + * The standard module registration macros of Linux. + */ +module_init(MvpkmInit); +module_exit(MvpkmExit); + +module_param_named(minHiddenAppOOMAdj, minHiddenAppOOMAdj, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(minHiddenAppOOMAdj, "minimum hidden app oom_adj, as per lowmemorykiller"); + +#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER +/** + * @brief Balloon watchdog timeout callback. + * + * Terminate the VM since it's not responsive. + * + * @param data vm reference representation. + */ +static void +WatchdogCB(unsigned long data) +{ + MvpkmVM *vm = (MvpkmVM *)data; + + printk("Balloon watchdog expired (%d s)!\n", BALLOON_WATCHDOG_TIMEOUT_SECS); + + Mvpkm_WakeGuest(vm, ACTION_ABORT); +} + +/** + * @brief Slab shrinker. + * + * Called by Linux kernel when we're under memory pressure. We treat all locked + * pages as a slab for this purpose, similar to the Android low memory killer. + * + * @param this reference to registered shrinker for callback context. + * @param nrToScan number of entries to scan. If 0 then just return the number + * of present entries. We ignore the value of nrToScan when > 1 + * since the shrinker is a trigger to readjust guest balloons, + * where the actual balloon size is determined in conjunction + * with the guest. + * @param gfpMask ignored. + * + * @return number of locked pages. 
+ */ +static int +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) +MvpkmShrink(struct shrinker *this, struct shrink_control *sc) +#else +MvpkmShrink(struct shrinker *this, int nrToScan, gfp_t gfpMask) +#endif +{ + uint32 locked = 0; + struct kobject *k; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) + int nrToScan = sc->nr_to_scan; +#endif + + spin_lock(&mvpkmKSet->list_lock); + + list_for_each_entry(k, &mvpkmKSet->list, entry) { + MvpkmVM *vm = container_of(k, MvpkmVM, kobj); + + locked += ATOMIC_GETO(vm->usedPages); + + /* + * Try and grab the WSP semaphore - if we fail, we must be VM setup or + * teardown, no point trying to wake the guest. + */ + if (nrToScan > 0 && + down_read_trylock(&vm->wspSem)) { + + if (vm->wsp) { + Mvpkm_WakeGuest(vm, ACTION_BALLOON); + + /* + * Balloon watchdog. + */ + if (vm->balloonWDEnabled) { + struct timer_list *t = &vm->balloonWDTimer; + + if (!timer_pending(t)) { + t->data = (unsigned long)vm; + t->function = WatchdogCB; + t->expires = jiffies + BALLOON_WATCHDOG_TIMEOUT_SECS * HZ; + add_timer(t); + } + } + } + + up_read(&vm->wspSem); + } + } + + spin_unlock(&mvpkmKSet->list_lock); + + return locked; +} +#endif + + +/** + * @brief The open file operation. Initializes the vm specific structure. 
+ */ +int +MvpkmOpen(struct inode *inode, struct file *filp) +{ + MvpkmVM *vm; + + if (initTgid != task_tgid(current)) { + printk(KERN_ERR "%s: MVPKM can be opened only from MVPD (process %d).\n", + __FUNCTION__, pid_vnr(initTgid)); + return -EPERM; + } + printk(KERN_DEBUG "%s: Allocating an MvpkmVM structure from process %s tgid=%d, pid=%d\n", + __FUNCTION__, + current->comm, + task_tgid_vnr(current), + task_pid_vnr(current)); + + vm = kmalloc(sizeof(MvpkmVM), GFP_KERNEL); + if (!vm) { + return -ENOMEM; + } + + memset(vm, 0, sizeof *vm); + + init_timer(&vm->balloonWDTimer); + init_rwsem(&vm->lockedSem); + init_rwsem(&vm->wspSem); + init_rwsem(&vm->monThreadTaskSem); + vm->monThreadTask = NULL; + vm->isMonitorInited = false; + + filp->private_data = vm; + + if (!Mvpkm_vmwareUid) { + Mvpkm_vmwareUid = current_euid(); + } + + return 0; +} + +/** + * @brief Releases a VMs resources + * @param vm vm to release + */ +static void +ReleaseVM(MvpkmVM *vm) +{ + del_timer_sync(&vm->balloonWDTimer); + + down_write(&vm->wspSem); + + if (vm->isMonitorInited) { + MonitorTimer_Request(&vm->monTimer, 0); +#ifdef CONFIG_HAS_WAKELOCK + wake_lock_destroy(&vm->wakeLock); +#endif + Mksck_WspRelease(vm->wsp); + vm->wsp = NULL; + } + + up_write(&vm->wspSem); + + LockedListUnlockAll(vm); + + UnmapWSPHKVA(vm); + + /* + * All sockets potentially connected to sockets of this vm's vmId will fail + * at send now. DGRAM sockets are note required to tear down connection + * explicitly. + */ + + kfree(vm); +} + +/** + * @brief The release file operation. Releases the vm specific + * structure including all the locked pages. + * + * @param inode Unused + * @param filp which VM we're dealing with + * @return 0 + */ +int +MvpkmRelease(struct inode *inode, struct file *filp) +{ + MvpkmVM *vm = filp->private_data; + + /* + * Tear down any queue pairs associated with this VM + */ + if (vm->isMonitorInited) { + ASSERT(vm->wsp); + QP_DetachAll(vm->wsp->guestId); + } + + /* + * Release the VM's ksets. 
+ */ + + kset_unregister(vm->miscKSet); + kset_unregister(vm->devicesKSet); + + if (vm->haveKObj) { + /* + * Release the VM's kobject. + * 'vm' will be kfree-d in its kobject's release function. + */ + + kobject_del(&vm->kobj); + kobject_put(&vm->kobj); + } else { + ReleaseVM(vm); + } + + filp->private_data = NULL; + + printk(KERN_INFO "%s: Released MvpkmVM structure from process %s tgid=%d, pid=%d\n", + __FUNCTION__, + current->comm, + task_tgid_vnr(current), + task_pid_vnr(current)); + + return 0; +} + +/** + * @brief Page fault handler for /dev/mem-like regions (see mvpkmVMOps + * block comment). + */ +static int +MvpkmFault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + unsigned long address = (unsigned long)vmf->virtual_address; + MPN mpn = vmf->pgoff; + MvpkmVM *vm = vma->vm_file->private_data; + + + /* + * Only insert pages belonging to the VM. The check is slow, O(n) in the + * number of MPNs associated with the VM, but it doesn't matter - the mmap + * interface should only be used by trusted processes at initialization + * time and for debugging. + * + * The mpn can be either in the memory reserved the monitor or mvpd + * through the regular mechanisms or it could be a mksck page. 
+ */ + if (!pfn_valid(mpn)) { + printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx, mpn invalid\n", + mpn, + address); + } else if (LockedListLookup(vm, mpn)) { + if (vm_insert_page(vma, address, pfn_to_page(mpn)) == 0) { + return VM_FAULT_NOPAGE; + } + + printk(KERN_ERR "MvpkmMMap: Failed to insert %x @ %lx \n", + mpn, + address); + } else if (MksckPage_LookupAndInsertPage(vma, address, mpn) == 0) { + return VM_FAULT_NOPAGE; + } + + if (vm->stubPageMPN) { + if (vm_insert_page(vma, address, pfn_to_page(vm->stubPageMPN)) == 0) { + printk(KERN_INFO "MvpkmMMap: mapped the stub page at %x @ %lx \n", + mpn, + address); + return VM_FAULT_NOPAGE; + } + + printk(KERN_ERR "MvpkmMMap: Could not insert stub page %x @ %lx \n", + mpn, + address); + + } + + return VM_FAULT_SIGBUS; +} + +/** + * @brief sysfs show function for per-VM locked_pages attribute. + * + * @param kobj reference to kobj nested in MvpkmVM struct. + * @param attr attribute reference. + * @param buf PAGE_SIZEd buffer to write to. + * + * @return number of characters printed (not including trailing null character). + */ +static ssize_t +MvpkmAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + if (attr == &mvpkmLockedPagesAttr) { + MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", ATOMIC_GETO(vm->usedPages)); + } else if (attr == &mvpkmMonitorAttr) { + MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); + + return snprintf(buf, + PAGE_SIZE, + "hostActions %x callno %d\n", + ATOMIC_GETO(vm->wsp->hostActions), + WSP_Params(vm->wsp)->callno); + } else { + return -EPERM; + } +} + +/** + * @brief sysfs store function for per-VM locked_pages attribute. + * + * @param kobj reference to kobj nested in MvpkmVM struct. + * @param attr attribute reference. + * @param buf PAGE_SIZEd buffer to write to. + * @param buf input buffer. + * @param count input buffer length. + * + * @return number of bytes consumed or negative error code. 
+ */ +static ssize_t +MvpkmAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count) +{ + if (attr == &mvpkmBalloonWatchdogAttr) { + MvpkmVM *vm = container_of(kobj, MvpkmVM, kobj); + + /* + * Enable balloon watchdog on first write. This includes all ballooning + * capable guest. + */ + vm->balloonWDEnabled = true; + del_timer_sync(&vm->balloonWDTimer); + + return 1; + } else { + return -EPERM; + } +} + +/** + * @brief Map machine address space region into host process. + * + * @param file file reference (ignored). + * @param vma Linux virtual memory area defining the region. + * + * @return 0 on success, otherwise error code. + */ +static int +MvpkmMMap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &mvpkmVMOps; + + return 0; +} + +#ifdef CONFIG_ARM_LPAE +/** + * @brief Determine host cacheability/shareability attributes. + * + * Used to ensure monitor/guest shared mappings are consistent with + * those of host user/kernel. + * + * @param[out] attribMAN when setting up the HW monitor this provides the + * attributes in the generic ARM_MemAttrNormal form, + * suitable for configuring the monitor and guest's + * [H]MAIR0 and setting the shareability attributes of + * the LPAE descriptors. + */ +static void +DetermineMemAttrLPAE(ARM_MemAttrNormal *attribMAN) +{ + /* + * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for + * normal kernel/user L2D mappings. These bits should be consistent both + * with each other and what we use in the monitor since we share various + * pages with both host processes, the kernel module and monitor, and the + * ARM ARM requires that synonyms have the same cacheability attributes, + * see end of A3.5.{4,7} ARM DDI 0406A. 
+ */ + HKVA hkva = __get_free_pages(GFP_KERNEL, 0); + + ARM_LPAE_L3D *pt = (ARM_LPAE_L3D *)hkva; + ARM_LPAE_L3D *kernL3D = &pt[0], *userL3D = &pt[1]; + uint32 attr, mair0, mair1; + + set_pte_ext((pte_t *)kernL3D, pfn_pte(0, PAGE_KERNEL), 0); + set_pte_ext((pte_t *)userL3D, pfn_pte(0, PAGE_NONE), 0); + + printk(KERN_INFO + "DetermineMemAttr: Kernel L3D AttrIndx=%x SH=%x\n", + kernL3D->blockS1.attrIndx, + kernL3D->blockS1.sh); + + printk(KERN_INFO + "DetermineMemAttr: User L3D AttrIndx=%x SH=%x\n", + userL3D->blockS1.attrIndx, + userL3D->blockS1.sh); + + ASSERT(kernL3D->blockS1.attrIndx == userL3D->blockS1.attrIndx); + ASSERT(kernL3D->blockS1.sh == userL3D->blockS1.sh); + + switch (kernL3D->blockS1.sh) { + case 0: { + attribMAN->share = ARM_SHARE_ATTR_NONE; + break; + } + case 2: { + attribMAN->share = ARM_SHARE_ATTR_OUTER; + break; + } + case 3: { + attribMAN->share = ARM_SHARE_ATTR_INNER; + break; + } + default: { + FATAL(); + } + } + + ARM_MRC_CP15(MAIR0, mair0); + ARM_MRC_CP15(MAIR1, mair1); + + attr = MVP_EXTRACT_FIELD(kernL3D->blockS1.attrIndx >= 4 ? mair1 : mair0, + 8 * (kernL3D->blockS1.attrIndx % 4), + 8); + + /* + * See B4-1615 ARM DDI 0406C-2c for magic. + */ +#define MAIR_ATTR_2_CACHE_ATTR(x, y) \ + switch (x) { \ + case 2: { \ + (y) = ARM_CACHE_ATTR_NORMAL_WT; \ + break; \ + } \ + case 3: { \ + (y) = ARM_CACHE_ATTR_NORMAL_WB; \ + break; \ + } \ + default: { \ + FATAL(); \ + } \ + } + + MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 2, 2), attribMAN->innerCache); + MAIR_ATTR_2_CACHE_ATTR(MVP_EXTRACT_FIELD(attr, 6, 2), attribMAN->outerCache); + +#undef MAIR_ATTR_2_CACHE_ATTR + + printk(KERN_INFO + "DetermineMemAttr: innerCache %x outerCache %x share %x\n", + attribMAN->innerCache, + attribMAN->outerCache, + attribMAN->share); + + free_pages(hkva, 0); +} + +#else + +/** + * @brief Determine host cacheability/shareability attributes. + * + * Used to ensure monitor/guest shared mappings are consistent with + * those of host user/kernel. 
 *
 * @param[out] attribL2D when setting up the LPV monitor a template L2D
 *                       containing cacheability attributes {S, TEX,CB} used by
 *                       host kernel for normal memory mappings. These may be
 *                       used directly for monitor/guest mappings, since both
 *                       worlds share a common {TRE, PRRR, NMRR}.
 * @param[out] attribMAN when setting up TTBR0 in the LPV monitor and the page
 *                       tables for the HW monitor this provides the attributes
 *                       in the generic ARM_MemAttrNormal form, suitable for
 *                       configuring TTBR0 + the monitor and guest's [H]MAIR0
 *                       and setting the shareability attributes of the LPAE
 *                       descriptors.
 */
static void
DetermineMemAttrNonLPAE(ARM_L2D *attribL2D, ARM_MemAttrNormal *attribMAN)
{
   /*
    * We use set_pte_ext to sample what {S,TEX,CB} bits Linux is using for
    * normal kernel/user L2D mappings. These bits should be consistent both
    * with each other and what we use in the monitor since we share various
    * pages with both host processes, the kernel module and monitor, and the
    * ARM ARM requires that synonyms have the same cacheability attributes,
    * see end of A3.5.{4,7} ARM DDI 0406A.
    */
   HKVA hkva = __get_free_pages(GFP_KERNEL, 0);
   uint32 sctlr;
   ARM_L2D *pt = (ARM_L2D *)hkva;
   ARM_L2D *kernL2D = &pt[0], *userL2D = &pt[1];

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 38)
   /*
    * Linux uses the magic 2048 offset in set_pte_ext. See include/asm/pgtable.h
    * for PAGE_NONE and PAGE_KERNEL semantics.
    */
   const uint32 set_pte_ext_offset = 2048;
#else
   /*
    * Linux 2.6.38 switched the order of Linux vs hardware page tables.
    * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
    */
   const uint32 set_pte_ext_offset = 0;
#endif

   set_pte_ext((pte_t *)(kernL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
               pfn_pte(0, PAGE_KERNEL),
               0);
   set_pte_ext((pte_t *)(userL2D + set_pte_ext_offset/sizeof(ARM_L2D)),
               pfn_pte(0, PAGE_NONE),
               0);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 38)
   /*
    * Linux 2.6.38 switched the order of Linux vs hardware page tables.
    * See mainline d30e45eeabefadc6039d7f876a59e5f5f6cb11c6.
    */
   kernL2D += 2048/sizeof(ARM_L2D);
   userL2D += 2048/sizeof(ARM_L2D);
#endif

   printk(KERN_INFO
          "DetermineMemAttr: Kernel L2D TEX=%x CB=%x S=%x\n",
          kernL2D->small.tex,
          kernL2D->small.cb,
          kernL2D->small.s);

   printk(KERN_INFO
          "DetermineMemAttr: User L2D TEX=%x CB=%x S=%x\n",
          userL2D->small.tex,
          userL2D->small.cb,
          userL2D->small.s);

   ASSERT((kernL2D->small.tex & 1) == (userL2D->small.tex & 1));
   ASSERT(kernL2D->small.cb == userL2D->small.cb);
   ASSERT(kernL2D->small.s == userL2D->small.s);

   *attribL2D = *kernL2D;

   /*
    * We now decode TEX remap and obtain the more generic form for use in
    * the LPV monitor's TTBR0 initialization and the HW monitor.
    */

   ARM_MRC_CP15(CONTROL_REGISTER, sctlr);

   if (sctlr & ARM_CP15_CNTL_TRE) {
      uint32 prrr, nmrr, indx, type, innerCache, outerCache, outerShare,
             share;

      printk(KERN_INFO
             "DetermineMemAttr: TEX remapping enabled\n");

      ARM_MRC_CP15(PRIMARY_REGION_REMAP, prrr);
      ARM_MRC_CP15(NORMAL_MEMORY_REMAP, nmrr);

      printk(KERN_INFO
             "DetermineMemAttr: PRRR=%x NMRR=%x\n",
             prrr,
             nmrr);

      /*
       * Decode PRRR/NMRR below. See B3.7 ARM DDI 0406B for register
       * encodings, tables and magic numbers.
       */

      indx = (MVP_BIT(kernL2D->small.tex, 0) << 2) | kernL2D->small.cb;

      /*
       * Only normal memory makes sense here.
       */
      type = MVP_EXTRACT_FIELD(prrr, 2 * indx, 2);
      ASSERT(type == 2);

      innerCache = MVP_EXTRACT_FIELD(nmrr, 2 * indx, 2);
      outerCache = MVP_EXTRACT_FIELD(nmrr, 16 + 2 * indx, 2);
      outerShare = !MVP_BIT(prrr, 24 + indx);
      share = MVP_BIT(prrr, 18 + kernL2D->small.s);

      printk(KERN_INFO
             "DetermineMemAttr: type %x innerCache %x outerCache %x"
             " share %x outerShare %x\n",
             type,
             innerCache,
             outerCache,
             share,
             outerShare);

      if (share) {
         if (outerShare) {
            attribMAN->share = ARM_SHARE_ATTR_OUTER;
         } else {
            attribMAN->share = ARM_SHARE_ATTR_INNER;
         }
      } else {
         attribMAN->share = ARM_SHARE_ATTR_NONE;
      }

      attribMAN->innerCache = innerCache;
      attribMAN->outerCache = outerCache;
   } else {
      /* TEX remap disabled is not supported on this host kernel. */
      NOT_IMPLEMENTED_JIRA(1849);
   }

   free_pages(hkva, 0);
}
#endif

/**
 * @brief The ioctl file operation.
 *
 * The ioctl command is the main communication method between the
 * vmx and the mvpkm kernel module.
 *
 * @param filp which VM we're dealing with
 * @param cmd select which cmd function needs to be performed
 * @param arg argument for command
 * @return error code, 0 on success
 */
long
MvpkmUnlockedIoctl(struct file *filp,
                   unsigned int cmd,
                   unsigned long arg)
{
   MvpkmVM *vm = filp->private_data;
   int retval = 0;

   switch (cmd) {


      case MVPKM_DISABLE_FAULT: {
         /* Lazily allocate the stub page used by MvpkmFault's fallback. */
         if (!vm->stubPageMPN) {
            uint32 *ptr;

            vm->stubPageMPN =
               AllocZeroedFreePages(vm, 0, false, MEMREGION_MAINMEM, (HKVA*)&ptr);
            if (!vm->stubPageMPN) {
               break;
            }
            ptr[0] = MVPKM_STUBPAGE_BEG;
            ptr[PAGE_SIZE/sizeof(uint32) - 1] = MVPKM_STUBPAGE_END;
         }
         break;
      }

      /*
       * Allocate some pinned pages from kernel.
       * Returns -ENOMEM if no host pages available for allocation.
+ */ + case MVPKM_LOCK_MPN: { + struct MvpkmLockMPN buf; + + if (copy_from_user(&buf, (void *)arg, sizeof buf)) { + return -EFAULT; + } + + buf.mpn = AllocZeroedFreePages(vm, + buf.order, + false, + buf.forRegion, + NULL); + if (buf.mpn == 0) { + return -ENOMEM; + } + + if (copy_to_user((void *)arg, &buf, sizeof buf)) { + return -EFAULT; + } + break; + } + + case MVPKM_UNLOCK_MPN: { + struct MvpkmLockMPN buf; + + if (copy_from_user(&buf, (void *)arg, sizeof buf)) { + return -EFAULT; + } + + if (!LockedListDel(vm, buf.mpn)) { + return -EINVAL; + } + break; + } + + case MVPKM_MAP_WSPHKVA: { + MvpkmMapHKVA mvpkmMapInfo; + HkvaMapInfo mapInfo[WSP_PAGE_COUNT]; + + if (copy_from_user(&mvpkmMapInfo, (void *)arg, sizeof mvpkmMapInfo)) { + return -EFAULT; + } + + if (copy_from_user(mapInfo, (void *)mvpkmMapInfo.mapInfo, sizeof mapInfo)) { + return -EFAULT; + } + + mvpkmMapInfo.hkva = MapWSPHKVA(vm, mapInfo); + BUG_ON(mvpkmMapInfo.hkva == 0); + + if (mvpkmMapInfo.forRegion == MEMREGION_WSP) { + vm->wsp = (WorldSwitchPage *) mvpkmMapInfo.hkva; + } + + if (copy_to_user((void *)arg, &mvpkmMapInfo, sizeof mvpkmMapInfo)) { + return -EFAULT; + } + break; + } + + case MVPKM_RUN_MONITOR: { + if (!vm->isMonitorInited) { + vm->isMonitorInited = ((retval = SetupMonitor(vm)) == 0); + } + + if (vm->isMonitorInited) { + retval = RunMonitor(vm); + } + + break; + } + + case MVPKM_ABORT_MONITOR: { + if (!vm->isMonitorInited) { + return -EINVAL; + } + + ASSERT(vm->wsp != NULL); + + Mvpkm_WakeGuest(vm, ACTION_ABORT); + break; + } + + case MVPKM_CPU_INFO: { + struct MvpkmCpuInfo buf; + uint32 mpidr; + +#ifdef CONFIG_ARM_LPAE + DetermineMemAttrLPAE(&buf.attribMAN); + /** + * We need to add support to the LPV monitor for LPAE page tables if we + * want to use it on a LPAE host, due to the costs involved in + * transitioning between LPAE and non-LPAE page tables without Hyp + * assistance. 
+ * + * @knownjira{MVP-2184} + */ + buf.attribL2D.u = 0; +#else + DetermineMemAttrNonLPAE(&buf.attribL2D, &buf.attribMAN); +#endif + /* + * Are MP extensions implemented? See B4-1618 ARM DDI 0406C-2c for + * magic. + */ + ARM_MRC_CP15(MPIDR, mpidr); + + buf.mpExt = mpidr & ARM_CP15_MPIDR_MP; + + if (copy_to_user((int *)arg, &buf, sizeof(struct MvpkmCpuInfo))) { + retval = -EFAULT; + } + break; + } + + default: { + retval = -EINVAL; + break; + } + } + + PRINTK(KERN_INFO "returning from IOCTL(%d) retval = %d %s\n", + cmd, retval, signal_pending(current)?"(pending signal)":"" ); + + return retval; +} + + + +/********************************************************************* + * + * Locked page management + * + *********************************************************************/ + +/* + * Pages locked by the kernel module are remembered so an unlockAll + * operation can be performed when the vmm is closed. The locked page + * identifiers are stored in a red-black tree to support O(log n) + * removal and search (required for /dev/mem-like mmap). + */ + +/** + * @brief Descriptor of a locked page range + */ +typedef struct { + struct { + __u32 mpn : 20; ///< MPN. + __u32 order : 6; ///< Size/alignment exponent for page. + __u32 forRegion : 6; ///< Annotation to identify guest page allocation + } page; + struct rb_node rb; +} LockedPage; + +static void FreeLockedPages(LockedPage *lp); + +/** + * @brief Search for an mpn inside a RB tree of LockedPages. The mpn + * will match a LockedPage as long as it is covered by the + * entry, i.e. in a non-zero order entry it doesn't have to be + * the base MPN. + * + * This must be called with the relevant vm->lockedSem held. + * + * @param root RB tree root. + * @param mpn MPN to search for. + * + * @return reference to LockedPage entry if found, otherwise NULL. 
+ */ +static LockedPage * +LockedListSearch(struct rb_root *root, __u32 mpn) +{ + struct rb_node *n = root->rb_node; + + while (n) { + LockedPage *lp = rb_entry(n, LockedPage, rb); + + if (lp->page.mpn == (mpn & (~0UL << lp->page.order))) { + return lp; + } + + if (mpn < lp->page.mpn) { + n = n->rb_left; + } else { + n = n->rb_right; + } + } + + return NULL; +} + +/** + * @brief Delete an mpn from the list of locked pages. + * + * @param vm Mvpkm module control structure pointer + * @param mpn MPN to be unlocked and freed for reuse + * @return true if list contained MPN and it was deleted from list + */ + +static _Bool +LockedListDel(MvpkmVM *vm, __u32 mpn) +{ + LockedPage *lp; + + down_write(&vm->lockedSem); + + lp = LockedListSearch(&vm->lockedRoot, mpn); + + /* + * The MPN should be in the locked pages RB tree and it should be the + * base of an entry, i.e. we can't fragment existing allocations for + * a VM. + */ + if (lp == NULL || lp->page.mpn != mpn) { + up_write(&vm->lockedSem); + return false; + } + + FreeLockedPages(lp); + + if (lp->page.forRegion == MEMREGION_MAINMEM) { + ATOMIC_SUBV(vm->usedPages, 1U << lp->page.order); + } + + rb_erase(&lp->rb, &vm->lockedRoot); + kfree(lp); + + up_write(&vm->lockedSem); + + return true; +} + +/** + * @brief Scan the list of locked pages to see if an MPN matches. + * + * @param vm Mvpkm module control structure pointer + * @param mpn MPN to check + * + * @return true iff list contains MPN. + */ +static _Bool +LockedListLookup(MvpkmVM *vm, __u32 mpn) +{ + LockedPage *lp; + + down_read(&vm->lockedSem); + + lp = LockedListSearch(&vm->lockedRoot, mpn); + + up_read(&vm->lockedSem); + + return lp != NULL; +} + +/** + * @brief Add a new mpn to the locked pages RB tree. + * + * @param vm control structure pointer + * + * @param mpn mpn of page that was locked with get_user_pages or some sort of + * get that is undone by put_page. 
+ * The mpn is assumed to be non-zero + * @param order size/alignment exponent for page + * @param forRegion Annotation for Page pool to identify guest page allocations + * + * @return false: couldn't allocate internal memory to record mpn in
+ * true: successful. + */ +static _Bool +LockedListAdd(MvpkmVM *vm, + __u32 mpn, + __u32 order, + PhysMem_RegionType forRegion) +{ + struct rb_node *parent, **p; + LockedPage *tp, *lp = kmalloc(sizeof *lp, GFP_KERNEL); + + if (!lp) { + return false; + } + + lp->page.mpn = mpn; + lp->page.order = order; + lp->page.forRegion = forRegion; + + down_write(&vm->lockedSem); + + if (forRegion == MEMREGION_MAINMEM) { + ATOMIC_ADDV(vm->usedPages, 1U << order); + } + + /* + * Insert as a red leaf in the tree (see include/linux/rbtree.h). + */ + p = &vm->lockedRoot.rb_node; + parent = NULL; + + while (*p) { + parent = *p; + tp = rb_entry(parent, LockedPage, rb); + + /* + * MPN should not already exist in the tree. + */ + ASSERT(tp->page.mpn != (mpn & (~0UL << tp->page.order))); + + if (mpn < tp->page.mpn) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + } + } + + rb_link_node(&lp->rb, parent, p); + + /* + * Restructure tree if necessary (see include/linux/rbtree.h). + */ + rb_insert_color(&lp->rb, &vm->lockedRoot); + + up_write(&vm->lockedSem); + + return true; +} + +/** + * @brief Traverse RB locked tree, freeing every entry. + * + * This must be called with the relevant vm->lockedSem held. + * + * @param node reference to RB node at root of subtree. + */ +static void +LockedListNuke(struct rb_node *node) +{ + while (node) { + if (node->rb_left) { + node = node->rb_left; + } else if (node->rb_right) { + node = node->rb_right; + } else { + /* + * We found a leaf, free it and go back to parent. + */ + LockedPage *lp = rb_entry(node, LockedPage, rb); + + if ((node = rb_parent(node))) { + if (node->rb_left) { + node->rb_left = NULL; + } else { + node->rb_right = NULL; + } + } + + FreeLockedPages(lp); + kfree(lp); + } + } +} + +/** + * @brief Unlock all pages at vm close time. 
 *
 * @param vm control structure pointer
 */
static void
LockedListUnlockAll(MvpkmVM *vm)
{

   down_write(&vm->lockedSem);

   LockedListNuke(vm->lockedRoot.rb_node);

   ATOMIC_SETV(vm->usedPages, 0);

   up_write(&vm->lockedSem);
}


/**
 * @brief Allocate zeroed free pages
 *
 * @param[in] vm which VM the pages are for so they will be freed when the vm
 *               closes
 * @param[in] order log2(number of contiguous pages to allocate)
 * @param[in] highmem is it OK to allocate this page in ZONE_HIGHMEM? This
 *                    option should only be specified for pages the host kernel
 *                    will not need to address directly.
 * @param[out] hkvaRet where to return host kernel virtual address of the
 *                     allocated pages, if non-NULL, and ONLY IF !highmem.
 * @param forRegion Annotation for Page pool to identify guest page allocations
 * @return 0: no host memory available
 *         else: starting MPN
 *               *hkvaRet = filled in
 */
static MPN
AllocZeroedFreePages(MvpkmVM *vm,
                     uint32 order,
                     _Bool highmem,
                     PhysMem_RegionType forRegion,
                     HKVA *hkvaRet)
{
   MPN mpn;
   struct page *page;

   if (order > PAGE_ALLOC_COSTLY_ORDER) {
      printk(KERN_WARNING "Order %d allocation for region %d exceeds the safe "
             "maximum order %d\n",
             order,
             forRegion,
             PAGE_ALLOC_COSTLY_ORDER);
   }

   /*
    * Get some pages for the requested range. They will be physically
    * contiguous and have the requested alignment. They will also
    * have a kernel virtual mapping if !highmem.
    *
    * We allocate out of ZONE_MOVABLE even though we can't just pick up our
    * bags. We do this to support platforms that explicitly configure
    * ZONE_MOVABLE, such as the Qualcomm MSM8960, to enable deep power down of
    * memory banks. When the kernel attempts to take a memory bank offline, it
    * will try and place the pages on the isolate LRU - only pages already on an
    * LRU, such as anon/file, can get there, so it will not be able to
    * migrate/move our pages (and hence the bank will not be offlined). The
    * other alternative is to live within ZONE_NORMAL, and only have available
    * a small fraction of system memory. Long term we plan on hooking the
    * offlining callback in mvpkm and perform our own migration with the
    * cooperation of the monitor, but we don't have dev board to support this
    * today.
    *
    * @knownjira{MVP-3477}
    */
   page = alloc_pages(GFP_USER | __GFP_COMP | __GFP_ZERO |
                      (highmem ? __GFP_HIGHMEM | __GFP_MOVABLE : 0),
                      order);

   if (page == NULL) {
      return 0;
   }

   /*
    * Return the corresponding page number.
    */
   mpn = page_to_pfn(page);
   ASSERT(mpn != 0);

   /*
    * Remember to unlock the pages when the FD is closed.
    */
   if (!LockedListAdd(vm, mpn, order, forRegion)) {
      __free_pages(page, order);
      return 0;
   }

   if (hkvaRet) {
      *hkvaRet = highmem ? 0 : __phys_to_virt(page_to_phys(page));
   }

   return mpn;
}

/**
 * @brief Map already-pinned WSP memory in host kernel virtual address(HKVA)
 *        space. Assumes 2 world switch pages on an 8k boundary.
 *
 * @param[in] vm which VM the HKVA Area is to be mapped for
 * @param[in] mapInfo array of MPNs and execute permission flags to be used in
 *                    inserting a new contiguous map in HKVA space
 * @return 0: HKVA space could not be mapped
 *         else: HKVA where mapping was inserted
 */
static HKVA
MapWSPHKVA(MvpkmVM *vm, HkvaMapInfo *mapInfo)
{
   unsigned int i;
   struct page **pages = NULL;
   struct page **pagesPtr;
   pgprot_t prot;
   int retval;
   int allocateCount = WSP_PAGE_COUNT + 1; // Reserve one page for alignment
   int pageIndex = 0;
   HKVA dummyPage = (HKVA)NULL;
   HKVA start;
   HKVA startSegment;
   HKVA endSegment;

   /*
    * Add one page for alignment purposes in case __get_vm_area returns an
    * unaligned address.
    */
   ASSERT(allocateCount == 3);
   ASSERT_ON_COMPILE(WSP_PAGE_COUNT == 2);

   /*
    * NOT_IMPLEMENTED if MapHKVA is called more than once.
    */
   BUG_ON(vm->wspHkvaArea);

   /*
    * Reserve virtual address space.
    */
   vm->wspHkvaArea = __get_vm_area((allocateCount * PAGE_SIZE), VM_ALLOC, MODULES_VADDR, MODULES_END);
   if (!vm->wspHkvaArea) {
      return 0;
   }

   pages = kmalloc(allocateCount * sizeof(struct page *), GFP_TEMPORARY);
   if (!pages) {
      goto err;
   }
   pagesPtr = pages;

   /*
    * Use a dummy page to boundary align the section, if needed.
    */
   dummyPage = __get_free_pages(GFP_KERNEL, 0);
   if (!dummyPage) {
      goto err;
   }
   vm->wspHKVADummyPage = dummyPage;

   /*
    * Back every entry with the dummy page.
    */
   for (i = 0; i < allocateCount; i++) {
      pages[i] = virt_to_page(dummyPage);
   }

   /*
    * World switch pages must not span a 1MB boundary in order to maintain only
    * a single L2 page table.
+ */ + start = (HKVA)vm->wspHkvaArea->addr; + startSegment = start & ~(ARM_L1D_SECTION_SIZE - 1); + endSegment = (start + PAGE_SIZE) & ~(ARM_L1D_SECTION_SIZE - 1); + /* + * Insert dummy page at pageIndex, if needed. + */ + pageIndex = (startSegment != endSegment); + + /* + * Back the rest with the actual world switch pages + */ + for (i = pageIndex; i < pageIndex + WSP_PAGE_COUNT; i++) { + pages[i] = pfn_to_page(mapInfo[i - pageIndex].mpn); + } + + /* + * Given the lack of functionality in the kernel for being able to mark + * mappings for a given vm area with different sets of protection bits, + * we simply mark the entire vm area as PAGE_KERNEL_EXEC for now + * (i.e., union of all the protection bits). Given that the kernel + * itself does something similar while loading modules, this should be a + * reasonable workaround for now. In the future, we should set the + * protection bits to strictly adhere to what has been requested in the + * mapInfo parameter. + */ + prot = PAGE_KERNEL_EXEC; + + retval = map_vm_area(vm->wspHkvaArea, prot, &pagesPtr); + if (retval < 0) { + goto err; + } + + kfree(pages); + + return (HKVA)(vm->wspHkvaArea->addr) + pageIndex * PAGE_SIZE; + +err: + if (dummyPage) { + free_pages(dummyPage, 0); + vm->wspHKVADummyPage = (HKVA)NULL; + } + + if (pages) { + kfree(pages); + } + + free_vm_area(vm->wspHkvaArea); + vm->wspHkvaArea = (HKVA)NULL; + + return 0; +} + +static void +UnmapWSPHKVA(MvpkmVM *vm) +{ + if (vm->wspHkvaArea) { + free_vm_area(vm->wspHkvaArea); + } + + if (vm->wspHKVADummyPage) { + free_pages(vm->wspHKVADummyPage, 0); + vm->wspHKVADummyPage = (HKVA)NULL; + } +} + +/** + * @brief Clean and release locked pages + * + * @param lp Reference to the locked pages + */ +static void +FreeLockedPages(LockedPage *lp) +{ + struct page *page; + int count; + + page = pfn_to_page(lp->page.mpn); + count = page_count(page); + + if (count == 0) { + printk(KERN_ERR "%s: found locked page with 0 reference (mpn %05x)\n", + __func__, lp->page.mpn); + 
return; + } + + if (count == 1) { + int i; + + /* + * There is no other user for this page, clean it. + * + * We don't bother checking if the page was highmem or not, clear_highmem + * works for both. + * We clear the content of the page, and rely on the fact that the previous + * worldswitch has cleaned the potential VIVT I-CACHE. + */ + for (i = 0; i < (1 << lp->page.order); i++) { + clear_highpage(page + i); + } + } else if (lp->page.forRegion != MEMREGION_MAINMEM) { + printk(KERN_WARNING "%s: mpn 0x%05x for region %d is still in use\n", + __func__, lp->page.mpn, lp->page.forRegion); + } + + __free_pages(page, lp->page.order); +} + +/********************************************************************* + * + * Communicate with monitor + * + *********************************************************************/ + +/** + * @brief Register a new monitor page. + * + * @param vm which virtual machine we're running + * @return 0: successful
+ * else: -errno + */ +static int +SetupMonitor(MvpkmVM *vm) +{ + int retval; + WorldSwitchPage *wsp = vm->wsp; + + if (!wsp || + wsp->wspHKVA != (HKVA)wsp) { + return -EINVAL; + } + + if ((retval = Mksck_WspInitialize(vm))) { + return retval; + } + + vm->kobj.kset = mvpkmKSet; + retval = kobject_init_and_add(&vm->kobj, &mvpkmKType, NULL, "%d", wsp->guestId); + if (retval) { + goto error; + } + + /* + * Get a reference to this module such that it cannot be unloaded until + * our kobject's release function completes. + */ + + __module_get(THIS_MODULE); + vm->haveKObj = true; + + /* + * Caution: From here on, if we fail, we must not call kobject_put() + * on vm->kobj since that may / will deallocate 'vm'. Unregistering VM + * ksets on failures, is fine and should be done for proper ref counting. + */ + + vm->devicesKSet = kset_create_and_add("devices", NULL, &vm->kobj); + if (!vm->devicesKSet) { + retval = -ENOMEM; + goto error; + } + + vm->miscKSet = kset_create_and_add("misc", NULL, &vm->kobj); + if (!vm->miscKSet) { + kset_unregister(vm->devicesKSet); + vm->devicesKSet = NULL; + retval = -ENOMEM; + goto error; + } + + down_write(&vm->wspSem); + + /* + * The VE monitor needs to issue a SMC to bootstrap Hyp mode. + */ + if (wsp->monType == MONITOR_TYPE_VE) { + /* + * Here we assemble the monitor's HMAIR0 based on wsp->memAttr. We map + * from the inner/outer normal page cacheability attributes obtained + * from DetermineCacheabilityAttribs to the format required in 4.2.8 + * ARM PRD03-GENC-008469 13.0 (see this document for the magic numbers). + * + * Where a choice is available, we opt for read and/or write allocation. + */ + static const uint32 normalCacheAttr2MAIR[4] = { 0x4, 0xf, 0xa, 0xe }; + + uint32 hmair0 = + ((normalCacheAttr2MAIR[wsp->memAttr.innerCache] | + (normalCacheAttr2MAIR[wsp->memAttr.outerCache] << 4)) + << 8 * MVA_MEMORY) | + (0x4 << 8 * MVA_DEVICE); + + /* + * See B4.1.74 ARM DDI 0406C-2c for the HTCR magic. 
+ */ + uint32 htcr = + 0x80000000 | + (wsp->memAttr.innerCache << 8) | + (wsp->memAttr.outerCache << 10) | + (wsp->memAttr.share << 12); + + /** + * @knownjira{MVP-377} + * Set HSCTLR to enable MMU and caches. We should really run the + * monitor WXN, in non-MVP_DEVEL builds. See + * 13.18 ARM PRD03-GENC-008353 11.0 for the magic. + */ + static const uint32 hsctlr = 0x30c5187d; + + register uint32 r0 asm("r0") = wsp->monVA.excVec; + register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR; + register uint32 r2 asm("r2") = htcr; + register uint32 r3 asm("r3") = hmair0; + register uint32 r4 asm("r4") = hsctlr; + + asm volatile ( + "smc 0" + : + : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4) + : "memory" + ); + } + + /* + * Initialize guest wait-for-interrupt waitqueue. + */ + init_waitqueue_head(&vm->wfiWaitQ); + + MonitorTimer_Setup(vm); + +#ifdef CONFIG_HAS_WAKELOCK + wake_lock_init(&vm->wakeLock, WAKE_LOCK_SUSPEND, "mvpkm"); +#endif + + wsp->mvpkmVersion = MVP_VERSION_CODE; + up_write(&vm->wspSem); + /* + * Ensure coherence of monitor loading and page tables. + */ + flush_cache_all(); + return 0; + +error: + Mksck_WspRelease(wsp); + vm->wsp = NULL; + return retval; +} + +/** + * @brief dummy function to drop the info parameter + * @param info ignored + */ +static +void FlushAllCpuCaches(void *info) +{ + flush_cache_all(); +} + +/** + * @brief return to where monitor called worldswitch + * + * @param vm which virtual machine we're running + * @return 0: successful, just call back when ready
+ * 1: successful, process code in WSP_Params(wsp)->callno
+ * else: -errno + */ +static int +RunMonitor(MvpkmVM *vm) +{ + int ii; + unsigned long flags; + WorldSwitchPage *wsp = vm->wsp; + int retval = 0; + + ASSERT(wsp); + +#ifdef CONFIG_HAS_WAKELOCK + wake_lock(&vm->wakeLock); +#endif + + /* + * Set VCPUThread affinity + */ + if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask)) { + set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity)); + } + + /* + * Record the the current task structure, so an ABORT will know, + * who to wake. + */ + down_write(&vm->monThreadTaskSem); + vm->monThreadTask = get_current(); + up_write(&vm->monThreadTaskSem); + + /* + * Keep going as long as the monitor is in critical section or + * there are no pending signals such as SIGINT or SIGKILL. Block + * interrupts before checking so any IPI sent will remain pending + * if our check just misses detecting the signal. + */ + local_irq_save(flags); + while (wsp->critSecCount > 0 || + (!signal_pending(current) && + !(ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT))) { + /* + * ARMv7 Performance counters are per CPU core and might be disabled over + * CPU core sleep if there is nothing else in the system to re-enable + * them, so now that we have been allocated a CPU core to run the guest, + * enable them and in particular the TSC (CCNT) which is used for monitor + * timing between world switches. 
+ */ + { + uint32 pmnc; + uint32 pmcnt; + + /* make sure that the Performance Counters are enabled */ + ARM_MRC_CP15(PERF_MON_CONTROL_REGISTER, pmnc); + if ((pmnc & (ARM_PMNC_E | ARM_PMNC_D)) != (ARM_PMNC_E)) { + pmnc |= ARM_PMNC_E; // Enable TSC + pmnc &= ~ARM_PMNC_D; // Disable cycle count divider + ARM_MCR_CP15(PERF_MON_CONTROL_REGISTER, pmnc); + } + + /* make sure that the CCNT is enabled */ + ARM_MRC_CP15(PERF_MON_COUNT_SET, pmcnt); + if ((pmcnt & ARM_PMCNT_C) != ARM_PMCNT_C) { + pmcnt |= ARM_PMCNT_C; + ARM_MCR_CP15(PERF_MON_COUNT_SET, pmcnt); + } + } + + /* + * Update TSC to RATE64 ratio + */ + { + struct TscToRate64Cb *ttr = &__get_cpu_var(tscToRate64); + wsp->tscToRate64Mult = ttr->mult; + wsp->tscToRate64Shift = ttr->shift; + } + + /* + * Save the time of day for the monitor's timer facility. The timing + * facility in the vmm needs to compute current time in the host linux's + * time representation. It uses the formula: + * now = wsp->switchedAt64 + (uint32)(TSC_READ() - wsp->lowerTSC) + * + * Read the timestamp counter *immediately after* ktime_get() as that + * will give the most consistent offset between reading the hardware + * clock register in ktime_get() and reading the hardware timestamp + * counter with TSC_READ(). + */ + ASSERT_ON_COMPILE(MVP_TIMER_RATE64 == NSEC_PER_SEC); + { + ktime_t now = ktime_get(); + TSC_READ(wsp->switchedAtTSC); + wsp->switchedAt64 = ktime_to_ns(now); + } + + /* + * Save host FPU contents and load monitor contents. + */ + SWITCH_VFP_TO_MONITOR; + + /* + * Call into the monitor to run guest instructions until it wants us to + * do something for it. Note that any hardware interrupt request will + * cause it to volunteer. 
+ */ + switch (wsp->monType) { + case MONITOR_TYPE_LPV: { + uint32 hostVBAR; + + ARM_MRC_CP15(VECTOR_BASE, hostVBAR); + (*wsp->switchToMonitor)(&wsp->regSave); + ARM_MCR_CP15(VECTOR_BASE, hostVBAR); + break; + } + case MONITOR_TYPE_VE: { + register uint32 r1 asm("r1") = wsp->regSave.ve.mHTTBR; + + asm volatile ( + ".word " MVP_STRINGIFY(ARM_INSTR_HVC_A1_ENC(0)) + : "=r" (r1) : "r" (r1) : "r0", "r2", "memory" + ); + break; + } + default: FATAL(); + } + + /* + * Save monitor FPU contents and load host contents. + */ + SWITCH_VFP_TO_HOST; + + /* + * Re-enable local interrupts now that we are back in the host world + */ + local_irq_restore(flags); + + + /* + * Maybe the monitor wrote some messages to monitor->host sockets. + * This will wake the corresponding host threads to receive them. + */ + /** + * @todo This lousy loop is in the critical path. It should be changed + * to some faster algorithm to wake blocked host sockets. + */ + for (ii = 0; ii < MKSCK_MAX_SHARES; ii++) { + if (wsp->isPageMapped[ii]) { + Mksck_WakeBlockedSockets(MksckPage_GetFromIdx(ii)); + } + } + + switch (WSP_Params(wsp)->callno) { + case WSCALL_ACQUIRE_PAGE: { + uint32 i; + + for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) { + MPN mpn = AllocZeroedFreePages(vm, + WSP_Params(wsp)->pages.order, + true, + WSP_Params(wsp)->pages.forRegion, + NULL); + if (mpn == 0) { + printk(KERN_WARNING "WSCALL_ACQUIRE_PAGE: no order %u pages available\n", + WSP_Params(wsp)->pages.order); + WSP_Params(wsp)->pages.pages = i; + break; + } + + WSP_Params(wsp)->pages.mpns[i] = mpn; + } + + break; + } + case WSCALL_RELEASE_PAGE: { + uint32 i; + + for (i = 0; i < WSP_Params(wsp)->pages.pages; ++i) { + if (!LockedListDel(vm, WSP_Params(wsp)->pages.mpns[i])) { + WSP_Params(wsp)->pages.pages = i; + break; + } + } + + break; + } + case WSCALL_MUTEXLOCK: { + retval = Mutex_Lock((void *)WSP_Params(wsp)->mutex.mtxHKVA, + WSP_Params(wsp)->mutex.mode); + + if (retval < 0) { + WSP_Params(wsp)->mutex.ok = false; + goto 
monitorExit; + } + + /* + * The locking succeeded. From this point on the monitor + * is in critical section. Even if an interrupt comes + * right here, it must return to the monitor to unlock the + * mutex. + */ + wsp->critSecCount++; + WSP_Params(wsp)->mutex.ok = true; + break; + } + case WSCALL_MUTEXUNLOCK: { + Mutex_Unlock((void *)WSP_Params(wsp)->mutex.mtxHKVA, + WSP_Params(wsp)->mutex.mode); + break; + } + case WSCALL_MUTEXUNLSLEEP: { + /* + * The vcpu has just come back from the monitor. During + * the transition interrupts were disabled. Above, + * however, interrupts were enabled again and it is + * possible that a context switch happened into a thread + * (serve_vmx) that instructed the vcpu thread to + * abort. After returning to this thread the vcpu may + * enter a sleep below never to return from it. To avoid + * this deadlock we need to test the abort flag in + * Mutex_UnlSleepTest. + */ + retval = + Mutex_UnlSleepTest((void *)WSP_Params(wsp)->mutex.mtxHKVA, + WSP_Params(wsp)->mutex.mode, + WSP_Params(wsp)->mutex.cvi, + &wsp->hostActions, + ACTION_ABORT); + if (retval < 0) { + goto monitorExit; + } + break; + } + case WSCALL_MUTEXUNLWAKE: { + Mutex_UnlWake((void *)WSP_Params(wsp)->mutex.mtxHKVA, + WSP_Params(wsp)->mutex.mode, + WSP_Params(wsp)->mutex.cvi, + WSP_Params(wsp)->mutex.all); + break; + } + + /* + * The monitor wants us to block (allowing other host threads to run) + * until an async message is waiting for the monitor to process. + * + * If MvpkmWaitForInt() returns an error, it should only be if there + * is another signal pending (such as SIGINT). So we pretend it + * completed normally, as the monitor is ready to be called again (it + * will see no messages to process and wait again), and return to user + * mode so the signals can be processed. 
+ */ + case WSCALL_WAIT: { +#ifdef CONFIG_HAS_WAKELOCK + if (WSP_Params(wsp)->wait.suspendMode) { + /* guest has ok'ed suspend mode, so release SUSPEND wakelock */ + wake_unlock(&vm->wakeLock); + retval = MvpkmWaitForInt(vm, true); + wake_lock(&vm->wakeLock); + WSP_Params(wsp)->wait.suspendMode = 0; + } else { + /* guest has asked for WFI not suspend so keep holding SUSPEND + * wakelock */ + retval = MvpkmWaitForInt(vm, false); + } +#else + retval = MvpkmWaitForInt(vm, WSP_Params(wsp)->wait.suspendMode); +#endif + if (retval < 0) { + goto monitorExit; + } + break; + } + + /* + * The only reason the monitor returned was because there was a + * pending hardware interrupt. The host serviced and cleared that + * interrupt when we enabled interrupts above. Now we call the + * scheduler in case that interrupt woke another thread, we want to + * allow that thread to run before returning to do more guest code. + */ + case WSCALL_IRQ: { + break; + } + + case WSCALL_GET_PAGE_FROM_VMID: { + MksckPage *mksckPage; + mksckPage = MksckPage_GetFromVmIdIncRefc(WSP_Params(wsp)->pageMgmnt.vmId); + + if (mksckPage) { + int ii; + + WSP_Params(wsp)->pageMgmnt.found = true; + for (ii = 0; ii < MKSCKPAGE_TOTAL; ii++) { + WSP_Params(wsp)->pageMgmnt.mpn[ii] = + vmalloc_to_pfn( (void*)(((HKVA)mksckPage) + ii*PAGE_SIZE) ); + } + + ASSERT(!wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]); + wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = true; + } else { + WSP_Params(wsp)->pageMgmnt.found = false; + } + break; + } + + case WSCALL_REMOVE_PAGE_FROM_VMID: { + MksckPage *mksckPage; + mksckPage = MksckPage_GetFromVmId(WSP_Params(wsp)->pageMgmnt.vmId); + ASSERT(wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)]); + wsp->isPageMapped[MKSCK_VMID2IDX(mksckPage->vmId)] = false; + MksckPage_DecRefc(mksckPage); + break; + } + + /* + * Read current wallclock time. 
+ */ + case WSCALL_READTOD: { + struct timeval nowTV; + do_gettimeofday(&nowTV); + WSP_Params(wsp)->tod.now = nowTV.tv_sec; + WSP_Params(wsp)->tod.nowusec = nowTV.tv_usec; + break; + } + + case WSCALL_LOG: { + int len = strlen(WSP_Params(wsp)->log.messg); + printk(KERN_INFO + "VMM: %s%s", + WSP_Params(wsp)->log.messg, + (WSP_Params(wsp)->log.messg[len-1] == '\n') ? "" : "\n"); + break; + } + + case WSCALL_ABORT: { + retval = WSP_Params(wsp)->abort.status; + goto monitorExit; + } + + case WSCALL_QP_GUEST_ATTACH: { + int32 rc; + QPInitArgs args; + uint32 base; + uint32 nrPages; + + args.id = WSP_Params(wsp)->qp.id; + args.capacity = WSP_Params(wsp)->qp.capacity; + args.type = WSP_Params(wsp)->qp.type; + base = WSP_Params(wsp)->qp.base; + nrPages = WSP_Params(wsp)->qp.nrPages; + + rc = QP_GuestAttachRequest(vm, &args, base, nrPages); + + WSP_Params(wsp)->qp.rc = rc; + WSP_Params(wsp)->qp.id = args.id; + break; + } + + case WSCALL_QP_NOTIFY: { + QPInitArgs args; + + args.id = WSP_Params(wsp)->qp.id; + args.capacity = WSP_Params(wsp)->qp.capacity; + args.type = WSP_Params(wsp)->qp.type; + + WSP_Params(wsp)->qp.rc = QP_NotifyListener(&args); + break; + } + + case WSCALL_MONITOR_TIMER: { + MonitorTimer_Request(&vm->monTimer, WSP_Params(wsp)->timer.when64); + break; + } + + case WSCALL_COMM_SIGNAL: { + Mvpkm_CommEvSignal(&WSP_Params(wsp)->commEvent.transpID, + WSP_Params(wsp)->commEvent.event); + break; + } + + case WSCALL_FLUSH_ALL_DCACHES: { + /* + * Broadcast Flush DCache request to all cores. + * Block while waiting for all of them to get done. + */ + on_each_cpu(FlushAllCpuCaches, NULL, 1); + break; + } + default: { + retval = -EPIPE; + goto monitorExit; + } + } + + /* + * The params.callno callback was handled in kernel mode and completed + * successfully. Repeat for another call without returning to user mode, + * unless there are signals pending. + * + * But first, call the Linux scheduler to switch threads if there is + * some other thread Linux wants to run now. 
+ */ + if (need_resched()) { + schedule(); + } + + /* + * Check if cpus allowed mask has to be updated. + * Updating it must be done outside of an atomic context. + */ + if (cpumask_intersects(to_cpumask(vcpuAffinity), cpu_active_mask) && + !cpumask_equal(to_cpumask(vcpuAffinity), ¤t->cpus_allowed)) { + set_cpus_allowed_ptr(current, to_cpumask(vcpuAffinity)); + } + + local_irq_save(flags); + } + + /* + * There are signals pending so don't try to do any more monitor/guest + * stuff. But since we were at the point of just about to run the monitor, + * return success status as user mode can simply call us back to run the + * monitor again. + */ + local_irq_restore(flags); + +monitorExit: + ASSERT(wsp->critSecCount == 0); + + if (ATOMIC_GETO(wsp->hostActions) & ACTION_ABORT) { + PRINTK(KERN_INFO "Monitor has ABORT flag set.\n"); + retval = ExitStatusHostRequest; + } + +#ifdef CONFIG_HAS_WAKELOCK + wake_unlock(&vm->wakeLock); +#endif + + down_write(&vm->monThreadTaskSem); + vm->monThreadTask = NULL; + up_write(&vm->monThreadTaskSem); + + return retval; +} + +/** + * @brief Guest is waiting for interrupts, sleep if necessary + * + * @param vm which virtual machine we're running + * @param suspend is the guest entering suspend or just WFI? + * @return 0: woken up, hostActions should have pending events + * -ERESTARTSYS: broke out because other signals are pending + * + * This function is called in the VCPU context after the world switch to wait + * for an incoming message. If any message gets queued to this VCPU, the + * sender will wake us up. + */ +int +MvpkmWaitForInt(MvpkmVM *vm, _Bool suspend) +{ + WorldSwitchPage *wsp = vm->wsp; + wait_queue_head_t *q = &vm->wfiWaitQ; + + if (suspend) { + return wait_event_interruptible(*q, ATOMIC_GETO(wsp->hostActions) != 0); + } else { + int ret; + ret = wait_event_interruptible_timeout(*q, ATOMIC_GETO(wsp->hostActions) != 0, 10*HZ); + if (ret == 0) { + printk("MvpkmWaitForInt: guest stuck for 10s in WFI! 
(hostActions %08x)\n", + ATOMIC_GETO(wsp->hostActions)); + } + return ret > 0 ? 0 : ret; + } +} + + +/** + * @brief Force the guest to evaluate its hostActions flag field + * + * @param vm which guest needs waking + * @param why why should be guest be woken up? + * + * This function updates the hostAction flag field as and wakes up the guest as + * required so that it can evaluate it. The guest could be executing guest + * code in an SMP system, in that case send an IPI; or it could be sleeping, in + * the case wake it up. + */ +void +Mvpkm_WakeGuest(MvpkmVM *vm, int why) +{ + ASSERT(why != 0); + + /* set the host action */ + if (ATOMIC_ORO(vm->wsp->hostActions, why) & why) { + /* guest has already been woken up so no need to do it again */ + return; + } + + /* + * VCPU is certainly in 'wait for interrupt' wait. Wake it up ! + */ +#ifdef CONFIG_HAS_WAKELOCK + /* + * To prevent the system to go in suspend mode before the monitor had a + * chance on being scheduled, we will hold the VM wakelock from now. + * As the wakelocks are not managed as reference counts, this is not an + * an issue to take a wake_lock twice in a row. + */ + wake_lock(&vm->wakeLock); +#endif + + /* + * On a UP system, we ensure the monitor thread isn't blocked. + * + * On an MP system the other CPU might be running the guest. This + * is noop on UP. + * + * When the guest is running, it is an invariant that monThreadTaskSem is not + * held as a write lock, so we should not fail to acquire the lock. + * Mvpkm_WakeGuest may be called from an atomic context, so we can't sleep + * here. 
+ */ + if (down_read_trylock(&vm->monThreadTaskSem)) { + if (vm->monThreadTask) { + wake_up_process(vm->monThreadTask); + kick_process(vm->monThreadTask); + } + up_read(&vm->monThreadTaskSem); + } else { + printk("Unexpected failure to acquire monThreadTaskSem!\n"); + } +} diff --git a/arch/arm/mvp/mvpkm/mvpkm_private.h b/arch/arm/mvp/mvpkm/mvpkm_private.h new file mode 100644 index 0000000..3dfc8d4 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_private.h @@ -0,0 +1,97 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Private interface between user level wrappers and kernel module. + * The communication uses the ioctl linux call. The command operand is one + * of the MVPKM_xxx macros defined below, the custom operand is a pointer + * to the respective structure below. + */ + + +#ifndef _MVPKMPRIVATE_H +#define _MVPKMPRIVATE_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include +/* + * For details how to create ioctl numbers, see + * Documentation/ioctl/ioctl-number.txt. The letter '9' is + * unused. The 0xa0-0xaf block is even more unused. 
Note however, that + * ioctl numbers are desired to be unique for debug purposes, they + * may conflict. + */ +#define MVP_IOCTL_LETTER '9' +#define MVPKM_DISABLE_FAULT _IO( MVP_IOCTL_LETTER, 0xa0) +#define MVPKM_LOCK_MPN _IOW(MVP_IOCTL_LETTER, 0xa1, MvpkmLockMPN) +#define MVPKM_UNLOCK_MPN _IOW(MVP_IOCTL_LETTER, 0xa2, MvpkmLockMPN) +#define MVPKM_RUN_MONITOR _IO( MVP_IOCTL_LETTER, 0xa3) +#define MVPKM_CPU_INFO _IOR(MVP_IOCTL_LETTER, 0xa4, MvpkmCpuInfo) +#define MVPKM_ABORT_MONITOR _IO( MVP_IOCTL_LETTER, 0xa5) +#define MVPKM_MAP_WSPHKVA _IOW(MVP_IOCTL_LETTER, 0xa7, MvpkmMapHKVA) + +#include "mksck.h" +#include "monva_common.h" +#include "mvpkm_types.h" + +/** + * @brief Operand for the MVPKM_LOCK_MPN call + */ +typedef struct MvpkmLockMPN { + uint32 order; /* IN */ + PhysMem_RegionType forRegion; /* IN */ + uint32 mpn; /* OUT */ +} MvpkmLockMPN; + +/** + * @brief Operand for the MVPKM_MAP_HKVA call + */ +typedef struct MvpkmMapHKVA { + HkvaMapInfo *mapInfo; /* IN */ + PhysMem_RegionType forRegion; /* IN */ + HKVA hkva; /* OUT */ +} MvpkmMapHKVA; + +#define WSP_PAGE_COUNT 2 + +/** + * @brief Operand for the MVPKM_CPU_INFO call + */ +typedef struct MvpkmCpuInfo { + ARM_L2D attribL2D; /* OUT */ + ARM_MemAttrNormal attribMAN; /* OUT */ + _Bool mpExt; /* OUT */ +} MvpkmCpuInfo; + +/** + * @brief These magic numbers mark the beginning and end of the + * special page that is mapped into the virtual address space of MVPD + * when it's monitor coredumper requests an unavailable page. + */ +#define MVPKM_STUBPAGE_BEG 0x78d10c67 +#define MVPKM_STUBPAGE_END 0x8378f3dd +#endif diff --git a/arch/arm/mvp/mvpkm/mvpkm_types.h b/arch/arm/mvp/mvpkm/mvpkm_types.h new file mode 100644 index 0000000..ce23554 --- /dev/null +++ b/arch/arm/mvp/mvpkm/mvpkm_types.h @@ -0,0 +1,49 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Types used in the interface between users, user level wrappers, + * and the kernel module implementation. + */ + + +#ifndef _MVPKMTYPES_H +#define _MVPKMTYPES_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + + +/** + * @brief HkvaMapInfo structure describing the mpn and execute permission + * flag to use to map a given page in HKVA space + */ +typedef struct HkvaMapInfo { + uint32 mpn; + _Bool write; + _Bool exec; +} HkvaMapInfo; + +#endif diff --git a/arch/arm/mvp/mvpkm/nottested.h b/arch/arm/mvp/mvpkm/nottested.h new file mode 100644 index 0000000..5226a22 --- /dev/null +++ b/arch/arm/mvp/mvpkm/nottested.h @@ -0,0 +1,54 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief NOT_TESTED() and related. + */ + +#ifndef _NOTTESTED_H +#define _NOTTESTED_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include + +#ifdef NOT_TESTED_ENABLED +#define NotTestedEnabled true +#else +#define NotTestedEnabled false +#endif + +#define NOT_TESTED() NOT_TESTED_JIRA(0) +#define NOT_TESTED_JIRA(_tkt,...) NotTested(_tkt, __FILE__, __LINE__) + +void NotTested(int tkt, char const *file, int line); + +#endif diff --git a/arch/arm/mvp/mvpkm/platdefx.h b/arch/arm/mvp/mvpkm/platdefx.h new file mode 100644 index 0000000..70fb8d7 --- /dev/null +++ b/arch/arm/mvp/mvpkm/platdefx.h @@ -0,0 +1,67 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Basic platform definitions needed various places. + */ + +#ifndef _PLATDEFX_H +#define _PLATDEFX_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define PAGE_ORDER 12 + +#ifndef PAGE_SIZE +#define PAGE_SIZE (1UL << PAGE_ORDER) +#endif +#if PAGE_SIZE != 4096 +#error bad page size PAGE_SIZE +#endif + +#define PA_2_PPN(_pa) ((_pa) / PAGE_SIZE) +#define PPN_2_PA(_ppn) ((_ppn) * PAGE_SIZE) + +#define VMM_DOMAIN 0x0 +#define VMM_DOMAIN_NO_ACCESS 0x3 +#define VMM_DOMAIN_CLIENT 0x1 +#define VMM_DOMAIN_MANAGER 0x4 + +#define INVALID_CVA (-(CVA)1) +#define INVALID_GVA (-(GVA)1) +#define INVALID_MVA (-(MVA)1) +#define INVALID_HKVA (-(HKVA)1) +#define INVALID_HUVA (-(HUVA)1) + +#define INVALID_MPN (((MPN)-1) >> ARM_L2D_SMALL_ORDER) +#define INVALID_PPN (((PPN)-1) >> ARM_L2D_SMALL_ORDER) + +#endif diff --git a/arch/arm/mvp/mvpkm/psr_defs.h b/arch/arm/mvp/mvpkm/psr_defs.h new file mode 100644 index 0000000..4fa53bc --- /dev/null +++ b/arch/arm/mvp/mvpkm/psr_defs.h @@ -0,0 +1,117 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Constant definitions for ARM CPSR/SPSR registers. See A2.5 + * ARM DDI 0100I. + */ + +#ifndef _PSR_DEFS_H_ +#define _PSR_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define ARM_PSR_MODE_USER 0x10 +#define ARM_PSR_MODE_FIQ 0x11 +#define ARM_PSR_MODE_IRQ 0x12 +#define ARM_PSR_MODE_SUPERVISOR 0x13 +#define ARM_PSR_MODE_ABORT 0x17 +#define ARM_PSR_MODE_HVC 0x1a +#define ARM_PSR_MODE_UNDEFINED 0x1b +#define ARM_PSR_MODE_SYSTEM 0x1f + +/* Bit 31: N */ +#define ARM_PSR_N (1 << 31) + +/* Bit 30: Z */ +#define ARM_PSR_Z (1 << 30) + +/* Bit 29: C */ +#define ARM_PSR_C (1 << 29) + +/* Bit 28: V */ +#define ARM_PSR_V (1 << 28) + +/* Bit 27: Q */ +#define ARM_PSR_Q (1 << 27) + +#define ARM_PSR_COND_FLAGS \ + (ARM_PSR_N | ARM_PSR_Z | ARM_PSR_C | ARM_PSR_V | ARM_PSR_Q) + +/* Bits 26..25: ITSTATE<1..0> */ +#define ARM_PSR_ITSTATE_LOW MVP_MASK(25, 2) + +/* Bit 24: J */ +#define ARM_PSR_J (1 << 24) + +/* Bits 23..20 are reserved as of ARMv7 */ +#define ARM_PSR_RESERVED MVP_MASK(20, 4) + +/* Bits 19..16: GE<3..0> */ +#define ARM_PSR_GE MVP_MASK(16, 4) + +/* Bits 15..10: ITSTATE<7..2> */ +#define ARM_PSR_ITSTATE_HIGH MVP_MASK(10, 6) +#define ARM_PSR_ITSTATE (ARM_PSR_ITSTATE_LOW | ARM_PSR_ITSTATE_HIGH) + +/* Bit 9: E */ +#define ARM_PSR_E_POS (9) +#define ARM_PSR_E (1 << ARM_PSR_E_POS) + +/* Bit 
8: A */ +#define ARM_PSR_A_POS (8) +#define ARM_PSR_A (1 << ARM_PSR_A_POS) + +/* Bit 7: I */ +#define ARM_PSR_I_POS (7) +#define ARM_PSR_I (1 << ARM_PSR_I_POS) + +/* Bit 6: F */ +#define ARM_PSR_F_POS (6) +#define ARM_PSR_F (1 << ARM_PSR_F_POS) + +/* Bit 5: T */ +#define ARM_PSR_T_POS (5) +#define ARM_PSR_T (1 << ARM_PSR_T_POS) + +/* Bits 4..0: Mode */ +#define ARM_PSR_MODE_MASK 0x1f + +#define ARM_PSR_MODE(cpsr) ((cpsr) & ARM_PSR_MODE_MASK) +#define ARM_PSR_USER_MODE(cpsr) (ARM_PSR_MODE(cpsr) == ARM_PSR_MODE_USER) + + +/* + * We shadow the 10 LSBs in the CPSR, with the exception of the T bit, as they + * are managed by the VMM on behalf of the guest and are potentially different + * than the physical CPSR during DE. + */ +#define ARM_PSR_MONITOR_BITS 10 +#define ARM_PSR_MONITOR_MASK (((1 << ARM_PSR_MONITOR_BITS) - 1) & ~ARM_PSR_T) + +#endif /// ifndef _PSR_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/qp.h b/arch/arm/mvp/mvpkm/qp.h new file mode 100644 index 0000000..a8d7ac1 --- /dev/null +++ b/arch/arm/mvp/mvpkm/qp.h @@ -0,0 +1,332 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +/** + * @file + * + * @brief MVP Queue Pairs function and structure declarations + * + * MVP Queue Pairs: + * + * Queue pairs are intended to be a generic bulk data transport mechanism + * between the guest and host kernels. The queue pair abstraction is based + * on two ring buffers (queues) placed on a shared memory region mapped + * into both guest and host kernel address spaces. + * + * NOTE: Queue pairs are SINGLE-READER, SINGLE-WRITER. Any caller is + * responsible for multi-reader/writer serialization!!! + * + * There are a maximum of QP_MAX_QUEUE_PAIRS in the system, with a maximum + * size of QP_MAX_CAPACITY per pair. Each queue pair is identified by + * an ID. + * + * Each peer follows a producer-consumer model in which one side is the + * producer on one queue, and the other side is the consumer on that queue + * (and vice-versa for its pair). + * + * Data is enqueued and dequeued into the pair in transactional stages, + * meaning each enqueue/dequeue can be followed by zero or more + * enqueue/dequeues, but the enqueue/dequeue is not visible to the peer + * until it has been committed with the *Commit() function. + * In PVTCP, for example, this is used to enqueue a short header, then + * followed by 'segments' of iovecs, then followed by a commit. This + * model prevents a peer from reading the header, expecting a payload, + * but not being able to read the payload because it hasn't been + * enqueued yet. + * + * Queue Pair setup: + * + * Before data can be passed, the guest and host kernel must perform + * the following connection handshake: + * + * 1). A host kernel service registers a listener with the queue pair + * subsystem with a callback to be called when guests create + * and attach to a shared memory region. + * + * 2). Guest initiates an QP_Attach() operation to a shared memory region + * keyed by ID. 
This step allocates memory, maps it into the host + * address space, and optionally notifies any host services who are + * listening for attach requests from the guest (see previous step). + * Host listeners are provided with a copy of the initialization + * arguments used by the guest (id, size, service type). All registered + * listeners are iterated over until one of them handles the attach + * request and acknowledges with QP_SUCCESS. + * + * 3). The registered host callback is called, notifying the host that + * the guest has attached. + * + * 4). The host can now QP_Attach() to the shared memory region with the same + * arguments as the guest. The queue pair is now well formed and enqueues + * and dequeues can proceed on either side. + * + * Queue Pair teardown: + * + * 1). As before, teardowns are initiated by the guest. Hosts can register + * a callback to be called upon detach. Guests initiate a teardown + * through a call to QP_Detach(). + * + * 2). Registered hosts are notified through the aforementioned callback. + * 3). The host service can call QP_Detach() at its own leisure. Memory + * is freed, the queue pair is destroyed. + * + * If at any point the guest unexpectedly shuts down, the host will be + * notified at monitor shutdown time. Memory is freed, and the queue + * pair is destroyed. + * + */ + +#ifndef _QP_H +#define _QP_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +//#define QP_DEBUG 1 + +typedef enum QPState { + QP_STATE_FREE = 0x1, ///< No peers, not memory-backed + QP_STATE_CONNECTED, ///< Both peers attached , memory backed + QP_STATE_GUEST_ATTACHED, ///< Guest allocated memory, host not yet attached + QP_STATE_MAX // leave this at the end! 
+} QPState; + +typedef struct QPId { + uint32 context; + uint32 resource; +} QPId; + +/* + * Initialization arguments for each queue pair + */ +typedef struct QPInitArgs { + QPId id; ///< Shared memory region ID + uint32 capacity; ///< Total size of shared region in bytes + uint32 type; ///< Type of queue pair (PVTCP, other)... +} QPInitArgs; + +/* + * Placed on the shared region, two per region + */ +typedef struct QHandle { + volatile uint32 head; ///< queue head offset + volatile uint32 tail; ///< queue tail offset + volatile uint32 phantom_head; ///< queue shadow head offset + volatile uint32 phantom_tail; ///< queue shadow tail offset + uint8 data[0]; ///< start of data, runs off + // the struct +} QHandle; + +/* + * Local to each peer + */ +typedef struct QPHandle { + QPId id; ///< shared memory region ID + uint32 capacity; ///< size of region in bytes + QHandle *produceQ; ///< producer queue + QHandle *consumeQ; ///< consumer queue + uint32 queueSize; ///< size of each queue in bytes + uint32 type; ///< type of queue pair + + /* + * Following fields unused by guest + */ + QPState state; + void (*peerDetachCB)(void* data); ///< detach notification callback + void *detachData; ///< data for the detach cb + struct page **pages; ///< page pointers for shared region +} QPHandle; + +/* + * QP Error codes + */ +#define QP_SUCCESS 0 +#define QP_ERROR_NO_MEM (-1) +#define QP_ERROR_INVALID_HANDLE (-2) +#define QP_ERROR_INVALID_ARGS (-3) +#define QP_ERROR_ALREADY_ATTACHED (-4) + +/* + * Hard-coded limits + */ +#define QP_MIN_CAPACITY (PAGE_SIZE * 2) +#define QP_MAX_CAPACITY (1024*1024) // 1M +#define QP_MAX_QUEUE_PAIRS 32 +#define QP_MAX_ID QP_MAX_QUEUE_PAIRS +#define QP_MAX_LISTENERS QP_MAX_QUEUE_PAIRS +#define QP_MAX_PAGES (QP_MAX_CAPACITY/PAGE_SIZE) // 256 pages + +#define QP_INVALID_ID 0xFFFFFFFF +#define QP_INVALID_SIZE 0xFFFFFFFF +#define QP_INVALID_REGION 0xFFFFFFFF +#define QP_INVALID_TYPE 0xFFFFFFFF + +#ifdef __KERNEL__ +/** + * @brief Utility function to 
sanity check arguments + * @param args argument structure to check + * @return true if arguments are sane, false otherwise + */ +static inline +_Bool QP_CheckArgs(QPInitArgs *args) +{ + if (!args || + !is_power_of_2(args->capacity) || + (args->capacity < QP_MIN_CAPACITY) || + (args->capacity > QP_MAX_CAPACITY) || + !(args->id.resource < QP_MAX_ID || args->id.resource == QP_INVALID_ID) || + (args->type == QP_INVALID_TYPE)) { + return false; + } else { + return true; + } +} +#endif + + +/** + * @brief Utility function to sanity check a queue pair handle + * @param qp handle to the queue pair + * @return true if the handle is sane, false otherwise + */ +static inline +_Bool QP_CheckHandle(QPHandle *qp) +{ +#ifdef MVP_DEBUG + if (!(qp) || + !(qp->produceQ) || + !(qp->consumeQ) || + (qp->state >= (uint32)QP_STATE_MAX) || + !(qp->queueSize < (QP_MAX_CAPACITY/2))) { + return false; + } else { + return true; + } +#else + return true; +#endif +} + + +/** + * @brief Initializes an invalid handle + * @param[in, out] qp handle to the queue pair + */ +static inline void +QP_MakeInvalidQPHandle(QPHandle *qp) +{ + if (!qp) { + return; + } + + qp->id.context = QP_INVALID_ID; + qp->id.resource = QP_INVALID_ID; + qp->capacity = QP_INVALID_SIZE; + qp->produceQ = NULL; + qp->consumeQ = NULL; + qp->queueSize = QP_INVALID_SIZE; + qp->type = QP_INVALID_TYPE; + qp->state = QP_STATE_FREE; + qp->peerDetachCB = NULL; + qp->detachData = NULL; +} + +/* + * Host only + */ +typedef int32 (*QPListener)(const QPInitArgs*); +int32 QP_RegisterListener(const QPListener); +int32 QP_UnregisterListener(const QPListener); +int32 QP_RegisterDetachCB(QPHandle *qp, void (*callback)(void*), void *data); + + +/* + * Host and guest specific implementations, see qp_host.c and qp_guest.c + */ +int32 QP_Attach(QPInitArgs *args, QPHandle** qp); +int32 QP_Detach(QPHandle* qp); +int32 QP_Notify(QPInitArgs *args); + +/* + * Common implementation, see qp_common.c + */ +int32 QP_EnqueueSpace(QPHandle *qp); +int32 
QP_EnqueueSegment(QPHandle *qp, const void *buf, size_t length); +int32 QP_EnqueueCommit(QPHandle *qp); +int32 QP_EnqueueReset(QPHandle *qp); + +static inline int32 +QP_EnqueueAtomic(QPHandle *qp, const void *buf, size_t length) +{ + int32 rc; + QP_EnqueueReset(qp); + rc = QP_EnqueueSegment(qp, buf, length); + if (rc < 0) { + return rc; + } else { + QP_EnqueueCommit(qp); + } + return rc; +} + +int32 QP_DequeueSpace(QPHandle *qp); +int32 QP_DequeueSegment(QPHandle *qp, const void *buf, size_t length); +int32 QP_DequeueReset(QPHandle *qp); +int32 QP_DequeueCommit(QPHandle *qp); + +static inline int32 +QP_DequeueAtomic(QPHandle *qp, const void *buf, size_t length) +{ + int32 rc; + QP_DequeueReset(qp); + rc = QP_DequeueSegment(qp, buf, length); + if (rc < 0) { + return rc; + } else { + QP_DequeueCommit(qp); + } + return rc; +} + +/* + * HVC methods and signatures + */ +#define MVP_QP_SIGNATURE 0x53525051 ///< 'QPRS' +#define MVP_QP_ATTACH (MVP_OBJECT_CUSTOM_BASE + 0) ///< attach to a queue pair +#define MVP_QP_DETACH (MVP_OBJECT_CUSTOM_BASE + 1) ///< detach from a queue pair +#define MVP_QP_NOTIFY (MVP_OBJECT_CUSTOM_BASE + 2) ///< notify host of attach +#define MVP_QP_LAST (MVP_OBJECT_CUSTOM_BASE + 3) ///< Number of methods + +/* + * Debug macros + */ +#ifdef QP_DEBUG + #ifdef IN_MONITOR + #define QP_DBG(...) Log(__VA_ARGS__) + #else + #define QP_DBG(...) printk(KERN_INFO __VA_ARGS__) + #endif +#else + #define QP_DBG(...) +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/qp_common.c b/arch/arm/mvp/mvpkm/qp_common.c new file mode 100644 index 0000000..8d121a1 --- /dev/null +++ b/arch/arm/mvp/mvpkm/qp_common.c @@ -0,0 +1,337 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MVP Queue Pairs common enqueue and dequeue functions. + * Does not include Attach(), and Detach(), as this will be specific + * to host/guest + * implementations. + */ + +#include + +#include "mvp_types.h" +#include "comm_os.h" +#include "qp.h" + + +/** + * @brief Calculate free space in the queue, convenience function + * @param head queue head offset + * @param tail queue tail offset + * @param queueSize size of queue + * @return free space in the queue + */ +static inline int32 +FreeSpace(uint32 head, uint32 tail, uint32 queueSize) { + /* Leave 1 byte free to resolve ambiguity between empty + * and full conditions */ + return (tail >= head) ? 
(queueSize - (tail - head) - 1) : + (head - tail - 1); +} + + +/** + * @brief Returns available space for enqueue, in bytes + * @param qp handle to the queue pair + * @return available space in bytes in the queue for enqueue operations, + * QP_ERROR_INVALID_HANDLE if the handle is malformed + */ +int32 +QP_EnqueueSpace(QPHandle *qp) +{ + uint32 head; + uint32 phantom; + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + head = qp->produceQ->head; + phantom = qp->produceQ->phantom_tail; + + if (head >= qp->queueSize || + phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + return FreeSpace(head, phantom, qp->queueSize); +} + + +/** + * @brief Enqueues a segment of data into the producer queue + * @param qp handle to the queue pair + * @param buf data to enqueue + * @param bufSize size in bytes to enqueue + * @return number of bytes enqueued on success, appropriate error + * code otherwise + * @sideeffects May move phantom tail pointer + */ +int32 +QP_EnqueueSegment(QPHandle *qp, const void *buf, size_t bufSize) +{ + int32 freeSpace; + uint32 head; + uint32 phantom; + + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + head = qp->produceQ->head; + phantom = qp->produceQ->phantom_tail; + + /* + * This check must go after the assignment above, + * otherwise a malicious guest could write bogus + * offsets to the queue and cause the memcpy to + * copy into unpleasant places. 
+ */ + if (head >= qp->queueSize || + phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + freeSpace = FreeSpace(head, phantom, qp->queueSize); + + if (bufSize <= freeSpace) { + if (bufSize + phantom < qp->queueSize) { + memcpy(qp->produceQ->data + phantom, buf, bufSize); + phantom += bufSize; + } else { + uint32 written = qp->queueSize - phantom; + memcpy(qp->produceQ->data + phantom, buf, written); + memcpy(qp->produceQ->data, (uint8*)buf + written, bufSize - written); + phantom = bufSize - written; + } + } else { + return QP_ERROR_NO_MEM; + } + + qp->produceQ->phantom_tail = phantom; + + return bufSize; +} + + +/** + * @brief Commits any previous EnqueueSegment operations to the queue + * pair + * @param qp handle to the queue pair. + * @return QP_SUCCESS on success, appropriate error code otherwise. + * @sideeffects May move tail pointer + */ +int32 +QP_EnqueueCommit(QPHandle *qp) +{ + uint32 phantom; + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + phantom = qp->produceQ->phantom_tail; + if (phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + qp->produceQ->tail = phantom; + return QP_SUCCESS; +} + + +/** + * @brief Returns any available bytes for dequeue + * @param qp handle to the queue pair + * @return available bytes for dequeue, appropriate error code + * otherwise + */ +int32 +QP_DequeueSpace(QPHandle *qp) +{ + uint32 tail; + uint32 phantom; + int32 bytesAvailable; + + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + tail = qp->consumeQ->tail; + phantom = qp->consumeQ->phantom_head; + + if (tail >= qp->queueSize || + phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + bytesAvailable = (tail - phantom); + if ((int32)bytesAvailable < 0) { + bytesAvailable += qp->queueSize; + } + return bytesAvailable; +} + + +/** + * @brief Dequeues a segment of data from the consumer queue into + * a buffer + * @param qp handle to the queue pair + * @param[out] buf buffer to copy 
to + * @param bytesDesired number of bytes to dequeue + * @return number of bytes dequeued on success, appropriate error + * code otherwise + * @sideeffects May move phantom head pointer + */ +int32 +QP_DequeueSegment(QPHandle *qp, const void *buf, size_t bytesDesired) +{ + uint32 tail; + uint32 phantom; + int32 bytesAvailable = 0; + + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + tail = qp->consumeQ->tail; + phantom = qp->consumeQ->phantom_head; + + /* + * This check must go after the assignment above, + * otherwise a malicious guest could write bogus + * offsets to the queue and cause the memcpy to + * copy into unpleasant places. + */ + if (tail >= qp->queueSize || + phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + bytesAvailable = (tail - phantom); + if ((int32)bytesAvailable < 0) { + bytesAvailable += qp->queueSize; + } + + if (bytesDesired <= bytesAvailable) { + if (bytesDesired + phantom < qp->queueSize) { + memcpy((void*)buf, qp->consumeQ->data + phantom, bytesDesired); + phantom += bytesDesired; + } else { + uint32 written = qp->queueSize - phantom; + memcpy((void*)buf, qp->consumeQ->data + phantom, written); + memcpy((uint8*)buf + written, qp->consumeQ->data, bytesDesired - written); + phantom = bytesDesired - written; + } + } else { + return QP_ERROR_NO_MEM; + } + + qp->consumeQ->phantom_head = phantom; + + return bytesDesired; +} + + +/** + * @brief Commits any previous DequeueSegment operations to the queue + * pair + * @param qp handle to the queue pair + * @return QP_SUCCESS on success, QP_ERROR_INVALID_HANDLE if the handle + * is malformed + * @sideeffects Moves the head pointer + */ +int32 +QP_DequeueCommit(QPHandle *qp) +{ + uint32 phantom; + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + phantom = qp->consumeQ->phantom_head; + if (phantom >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + qp->consumeQ->head = phantom; + return QP_SUCCESS; +} + + +/** + * @brief Resets 
the phantom tail pointer and discards any pending + * enqueues + * @param qp handle to the queue pair + * @return QP_SUCCESS on success, QP_ERROR_INVALID_HANDLE if the handle + * is malformed + * @sideeffects Resets the phantom tail pointer + */ +int32 +QP_EnqueueReset(QPHandle *qp) +{ + uint32 tail; + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + tail = qp->produceQ->tail; + if (tail >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + qp->produceQ->phantom_tail = tail; + return QP_SUCCESS; +} + +/** + * @brief Resets the phantom head pointer and discards any pending + * dequeues + * @param qp handle to the queue pair + * @return QP_SUCCESS on success, QP_ERROR_INVALID_HANDLE if the handle + * is malformed + * @sideeffects Resets the phantom head pointer + */ +int32 +QP_DequeueReset(QPHandle *qp) +{ + uint32 head; + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + head = qp->consumeQ->head; + if (head >= qp->queueSize) { + return QP_ERROR_INVALID_HANDLE; + } + + qp->consumeQ->phantom_head = head; + return QP_SUCCESS; +} + +EXPORT_SYMBOL(QP_EnqueueSpace); +EXPORT_SYMBOL(QP_EnqueueSegment); +EXPORT_SYMBOL(QP_EnqueueCommit); +EXPORT_SYMBOL(QP_DequeueSpace); +EXPORT_SYMBOL(QP_DequeueSegment); +EXPORT_SYMBOL(QP_DequeueCommit); +EXPORT_SYMBOL(QP_EnqueueReset); +EXPORT_SYMBOL(QP_DequeueReset); diff --git a/arch/arm/mvp/mvpkm/qp_host_kernel.c b/arch/arm/mvp/mvpkm/qp_host_kernel.c new file mode 100644 index 0000000..c53f315 --- /dev/null +++ b/arch/arm/mvp/mvpkm/qp_host_kernel.c @@ -0,0 +1,574 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief MVP host kernel implementation of the queue pairs API + * + */ + +#include +#include +#include +#include +#include +#include + +#include "mvp.h" +#include "mvpkm_kernel.h" +#include "qp.h" +#include "qp_host_kernel.h" + +static QPHandle queuePairs[QP_MAX_QUEUE_PAIRS]; +static QPListener listeners[QP_MAX_LISTENERS]; + +/* + * Protect listeners and queuePairs. + */ +static DEFINE_MUTEX(qpLock); + +#define QPLock() mutex_lock(&qpLock) +#define QPUnlock() mutex_unlock(&qpLock) + +/** + * @brief Map a vector of pages into virtually contiguous kernel space + * @param vm this vm's vm struct + * @param base base machine page number that lists pages to map + * @param nrPages number of pages to map + * @param[out] qp handle to qp to set up + * @param[out] hkva virtual address mapping + * @return QP_SUCCESS on success, error code otherwise. Mapped address + * is returned in hkva + */ + +static int32 +MapPages(MvpkmVM *vm, + MPN base, + uint32 nrPages, + QPHandle *qp, + HKVA *hkva) +{ + HKVA *va; + uint32 i; + uint32 rc; + struct page *basepfn = pfn_to_page(base); + struct page **pages; + + BUG_ON(!vm); // this would be very bad. 
+ + if (!hkva) { + return QP_ERROR_INVALID_ARGS; + } + + pages = kmalloc(nrPages * sizeof (MPN), GFP_KERNEL); + if (!pages) { + return QP_ERROR_NO_MEM; + } + + /* + * Map in the first page, read out the MPN vector + */ + down_write(&vm->lockedSem); + va = kmap(basepfn); + if (!va) { + rc = QP_ERROR_INVALID_ARGS; + kfree(pages); + qp->pages = NULL; + goto out; + } + + /* + * Grab references and translate MPNs->PFNs + */ + for (i = 0; i < nrPages; i++) { + pages[i] = pfn_to_page(((MPN*)va)[i]); + get_page(pages[i]); + } + + /* + * Clean up the first mapping and remap the entire vector + */ + kunmap(basepfn); + va = vmap(pages, nrPages, VM_MAP, PAGE_KERNEL); + if (!va) { + rc = QP_ERROR_NO_MEM; + for (i = 0; i < nrPages; i++) { + put_page(pages[i]); + } + kfree(pages); + qp->pages = NULL; + goto out; + } else { + *hkva = (HKVA)va; + qp->pages = pages; + } + + /* + * Let's not leak mpns.. + */ + memset(va, 0x0, nrPages * PAGE_SIZE); + + rc = QP_SUCCESS; + +out: + up_write(&vm->lockedSem); + return rc; +} + +/** + * @brief Initialize all free queue pair entries and listeners + */ + +void +QP_HostInit(void) +{ + uint32 i; + + for (i = 0; i < QP_MAX_QUEUE_PAIRS; i++) { + QP_MakeInvalidQPHandle(&queuePairs[i]); + } + + for (i = 0; i < QP_MAX_LISTENERS; i++) { + listeners[i] = NULL; + } +} + + +/** + * @brief Detaches a guest from a queue pair and notifies + * any registered listeners through the detach callback + * @param id id that guest requested a detach from, detaches all + * queue pairs associated with a VM if the resource id == QP_INVALID_ID + * @return QP_SUCCESS on success, appropriate error code otherwise + */ + +int32 +QP_GuestDetachRequest(QPId id) +{ + QPHandle *qp; + uint32 i; + + if (id.resource >= QP_MAX_ID && id.resource != QP_INVALID_ID) { + return QP_ERROR_INVALID_ARGS; + } + + QPLock(); + + /* + * Invalidate all queue pairs associated with this VM if + * resource == QP_INVALID_ID + */ + if (id.resource == QP_INVALID_ID) { + for (i = 0; i < 
/**
 * @brief Detaches a guest from a queue pair and notifies
 *    any registered listeners through the detach callback
 * @param id id that guest requested a detach from, detaches all
 *    queue pairs associated with a VM if the resource id == QP_INVALID_ID
 * @return QP_SUCCESS on success, appropriate error code otherwise
 *
 * NOTE(review): in the single-resource branch below, qp->id.context is
 * never compared against id.context, so a request naming another VM's
 * resource id would fire that pair's detach callback — confirm callers
 * always pass their own context before relying on this for isolation.
 */

int32
QP_GuestDetachRequest(QPId id)
{
   QPHandle *qp;
   uint32 i;

   /* Resource must be a valid index or the "all pairs" sentinel. */
   if (id.resource >= QP_MAX_ID && id.resource != QP_INVALID_ID) {
      return QP_ERROR_INVALID_ARGS;
   }

   QPLock();

   /*
    * Invalidate all queue pairs associated with this VM if
    * resource == QP_INVALID_ID
    */
   if (id.resource == QP_INVALID_ID) {
      /* Sweep the whole table; callbacks fire only for matching contexts. */
      for (i = 0; i < QP_MAX_QUEUE_PAIRS; i++) {
         qp = &queuePairs[i];
         if (qp->id.context == id.context && qp->peerDetachCB) {
            qp->peerDetachCB(qp->detachData);
         }
      }
   } else {
      /* Single pair: notify its host-side listener, if any registered. */
      qp = &queuePairs[id.resource];
      if (qp->peerDetachCB) {
         qp->peerDetachCB(qp->detachData);
      }
   }

   QPUnlock();

   return QP_SUCCESS;
}
the entry + */ + rc = MapPages(vm, base, nrPages, qp, &hkva); + if (rc != QP_SUCCESS) { + goto out; + } + + /* NB: reversed from the guest */ + qp->id = args->id; + qp->capacity = args->capacity; + qp->produceQ = (QHandle*)hkva; + qp->consumeQ = (QHandle*)(hkva + args->capacity/2); + qp->queueSize = args->capacity/2 - sizeof(QHandle); + qp->type = args->type; + qp->state = QP_STATE_GUEST_ATTACHED; + + /* + * The qp is now assumed to be well-formed + */ + QP_DBG("%s: Guest attached to region [%u:%u] capacity: %u HKVA: %x\n", + __FUNCTION__, + args->id.context, + args->id.resource, + args->capacity, + (uint32)hkva); + rc = QP_SUCCESS; + +out: + QPUnlock(); + if (rc != QP_SUCCESS) { + QP_DBG("%s: Failed to attach: %u\n", __FUNCTION__, rc); + } + return rc; +} + + +/** + * @brief Attaches the host to the shared memory region. The guest + * MUST have allocated the shmem region already or else this will fail. + * @param args structure with the shared memory region id to attach to, + * total size of the region in bytes, and type of queue pair (e.g PVTCP) + * @param[in, out] qp handle to the queue pair to return + * @return QP_SUCCESS on success, appropriate error code otherwise + */ + +int32 +QP_Attach(QPInitArgs *args, + QPHandle** qp) +{ + uint32 rc; + + if (!qp || !QP_CheckArgs(args)) { + return QP_ERROR_INVALID_ARGS; + } + + QP_DBG("%s: Attaching to id: [%u:%u] capacity: %u\n", + __FUNCTION__, + args->id.context, + args->id.resource, + args->capacity); + + QPLock(); + *qp = queuePairs + args->id.resource; + + if (!QP_CheckHandle(*qp)) { + *qp = NULL; + rc = QP_ERROR_INVALID_HANDLE; + goto out; + } + + if ((*qp)->state == QP_STATE_CONNECTED) { + rc = QP_ERROR_ALREADY_ATTACHED; + goto out; + } + + if ((*qp)->state != QP_STATE_GUEST_ATTACHED) { + rc = QP_ERROR_INVALID_HANDLE; + goto out; + } + + (*qp)->state = QP_STATE_CONNECTED; + + QP_DBG("%s: Attached!\n", __FUNCTION__); + rc = QP_SUCCESS; + +out: + QPUnlock(); + return rc; +} + +/** + * @brief Detaches the host to 
the shared memory region. + * @param[in, out] qp handle to the queue pair + * @return QP_SUCCESS on success, appropriate error code otherwise + * @sideeffects Frees memory + */ + +int32 +QP_Detach(QPHandle* qp) +{ + uint32 rc; + uint32 i; + + QPLock(); + if (!QP_CheckHandle(qp)) { + rc = QP_ERROR_INVALID_HANDLE; + goto out; + } + + QP_DBG("%s: Freeing queue pair [%u:%u]\n", + __FUNCTION__, + qp->id.context, + qp->id.resource); + + BUG_ON(!qp->produceQ); + BUG_ON(!qp->pages); + BUG_ON((qp->state != QP_STATE_CONNECTED) && + (qp->state != QP_STATE_GUEST_ATTACHED)); + + vunmap(qp->produceQ); + + for (i = 0; i < qp->capacity/PAGE_SIZE; i++) { + put_page(qp->pages[i]); + } + kfree(qp->pages); + + QP_DBG("%s: Host detached from [%u:%u]\n", + __FUNCTION__, + qp->id.context, + qp->id.resource); + + QP_MakeInvalidQPHandle(qp); + rc = QP_SUCCESS; + +out: + QPUnlock(); + return rc; +} + + +/** + * @brief Detaches and destroys all queue pairs associated with a given guest + * @param vmID which VM to clean up + * @sideeffects Destroys all queue pairs for guest vmID + */ + +void QP_DetachAll(Mksck_VmId vmID) { + QPId id = { + .context = (uint32)vmID, + .resource = QP_INVALID_ID + }; + + QP_DBG("%s: Detaching all queue pairs from vmId context %u\n", __FUNCTION__, vmID); + QP_GuestDetachRequest(id); +} + +/** + * @brief Registers a listener into the queue pair system. Callbacks are + * called with interrupts disabled and must not sleep. + * @param listener listener to be called + * @return QP_SUCCESS on success, QP_ERROR_NO_MEM if no more + * listeners can be registered + */ + +int32 +QP_RegisterListener(const QPListener listener) +{ + uint32 i; + int32 rc = QP_ERROR_NO_MEM; + + QPLock(); + for (i = 0; i < QP_MAX_LISTENERS; i++) { + if (!listeners[i]) { + listeners[i] = listener; + QP_DBG("%s: Registered listener\n", __FUNCTION__); + rc = QP_SUCCESS; + break; + } + } + QPUnlock(); + + return rc; +} + + +/** + * @brief Unregister a listener service from the queue pair system. 
+ * @param listener listener to unregister + * @return QP_SUCCESS on success, appropriate error code otherwise + */ + +int32 +QP_UnregisterListener(const QPListener listener) +{ + uint32 i; + int32 rc = QP_ERROR_INVALID_HANDLE; + + QPLock(); + for (i = 0; i < QP_MAX_LISTENERS; i++) { + if (listeners[i] == listener) { + listeners[i] = NULL; + QP_DBG("%s: Unregistered listener\n", __FUNCTION__); + rc = QP_SUCCESS; + break; + } + } + QPUnlock(); + + return rc; +} + + +/** + * @brief Registers a callback to be called when the guest detaches + * from a queue pair. Callbacks are called with interrupts and + * must not sleep. + * @param qp handle to the queue pair + * @param callback callback to be called + * @param data data to deliver to the callback + * @return QP_SUCCESS on success, appropriate error code otherwise + */ + +int32 +QP_RegisterDetachCB(QPHandle *qp, + void (*callback)(void*), + void *data) +{ + if (!QP_CheckHandle(qp)) { + return QP_ERROR_INVALID_HANDLE; + } + + if (!callback) { + return QP_ERROR_INVALID_ARGS; + } + + qp->peerDetachCB = callback; + qp->detachData = data; + QP_DBG("%s: Registered detach callback\n", __FUNCTION__); + return QP_SUCCESS; +} + + +/** + * @brief Noop on the host, only guests can initiate a notify + * @param args noop + * @return QP_SUCCESS + */ + + +int32 QP_Notify(QPInitArgs *args) { + return QP_SUCCESS; +} + + +/** + * @brief Notify any registered listeners for the given queue pair + * @param args initialization arguments used by the guest + * @return QP_SUCCESS on success, error otherwise + */ + +int32 QP_NotifyListener(QPInitArgs *args) { + int32 i; + QPHandle *qp = NULL; + + if (!QP_CheckArgs(args)) { + return QP_ERROR_INVALID_ARGS; + } + + /* + * Iterate over listeners until one of them reports they handled it + */ + QPLock(); + for (i = 0; i < QP_MAX_LISTENERS; i++) { + if (listeners[i]) { + QP_DBG("Delivering attach event to listener...\n"); + if (listeners[i](args) == QP_SUCCESS) { + break; + } + } + } + + if (i == 
QP_MAX_LISTENERS) { + /* + * No listener successfully probed this QP. + * The guest DETACH HVC isn't implemented; we need compensate for it + * by deallocating the QP here. + * This is a workaround which assumes, more-or-less correctly, that + * unsuccessful QP probes never lead to subsequent host-attaching. + */ + + qp = &queuePairs[args->id.resource]; + } + + QPUnlock(); + + if (qp) { + QP_Detach(qp); + } + return QP_SUCCESS; +} + + +EXPORT_SYMBOL(QP_Attach); +EXPORT_SYMBOL(QP_Detach); +EXPORT_SYMBOL(QP_RegisterListener); +EXPORT_SYMBOL(QP_UnregisterListener); +EXPORT_SYMBOL(QP_RegisterDetachCB); +EXPORT_SYMBOL(QP_Notify); diff --git a/arch/arm/mvp/mvpkm/qp_host_kernel.h b/arch/arm/mvp/mvpkm/qp_host_kernel.h new file mode 100644 index 0000000..111524a --- /dev/null +++ b/arch/arm/mvp/mvpkm/qp_host_kernel.h @@ -0,0 +1,44 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +/** + * @file + * + * @brief QP host function prototypes + */ + + +#ifndef _QP_HOST_KERNEL_H +#define _QP_HOST_KERNEL_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +void QP_HostInit(void); +int32 QP_GuestAttachRequest(MvpkmVM *vm, + QPInitArgs *args, + MPN base, + uint32 nr_pages); +int32 QP_GuestDetachRequest(QPId id); +void QP_DetachAll(Mksck_VmId vmID); +int32 QP_NotifyListener(QPInitArgs *args); + +#endif diff --git a/arch/arm/mvp/mvpkm/tsc.h b/arch/arm/mvp/mvpkm/tsc.h new file mode 100644 index 0000000..0b3149b --- /dev/null +++ b/arch/arm/mvp/mvpkm/tsc.h @@ -0,0 +1,49 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Time stamp and event counters. 
+ */ + +#ifndef _TSC_H_ +#define _TSC_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "arm_inline.h" + +#define ARM_PMNC_E (1 << 0) +#define ARM_PMNC_D (1 << 3) + +#define ARM_PMCNT_C (1 << 31) + +#define ARM_PMNC_INVALID_EVENT -1 + +#define TSC_READ(_reg) ARM_MRC_CP15(CYCLE_COUNT, (_reg)) +#define TSC_WRITE(_reg) ARM_MCR_CP15(CYCLE_COUNT, (_reg)) + +#endif // ifndef _TSC_H_ diff --git a/arch/arm/mvp/mvpkm/utils.h b/arch/arm/mvp/mvpkm/utils.h new file mode 100644 index 0000000..1fc56e9 --- /dev/null +++ b/arch/arm/mvp/mvpkm/utils.h @@ -0,0 +1,172 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief General architecture-independent definitions, typedefs, and macros. 
+ */ + +#ifndef _UTILS_H +#define _UTILS_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_WORKSTATION +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define MAX_FILENAME 128 + +// Round address up to given size boundary +// Note: ALIGN() conflicts with Linux + +#define MVP_ALIGN(_v, _n) (((_v) + (_n) - 1) & -(_n)) + +#define ALIGNVA(_addr, _size) MVP_ALIGN(_addr, _size) + +#define alignof(t) offsetof(struct { char c; typeof(t) x; }, x) + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) +#define MAX(x,y) ((x) > (y) ? (x) : (y)) + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#define KB(_X_) ((_X_)*1024U) +#define MB(_X_) (KB(_X_)*1024) +#define GB(_X_) (MB(_X_)*1024) + +#define NELEM(x) (sizeof(x)/sizeof((x)[0])) + +/* + * x in [low,high) + * args evaluated once + */ +#define RANGE(x,low,high) \ + ({ \ + typeof(x) _x = (x); \ + typeof(x) _low = (typeof(x))(low); \ + typeof(x) _high =(typeof(x))(high); \ + (_Bool)( (_low <= _x) && (_x < _high)); \ + }) + +#define OBJECTS_PER_PAGE(_type) (PAGE_SIZE / sizeof(_type)) + +#define MA_2_MPN(_ma) ((MPN)((_ma) / PAGE_SIZE)) +#define MPN_2_MA(_mpn) ((MA)((_mpn) * PAGE_SIZE)) + +#define VA_2_VPN(_va) ((_va) / PAGE_SIZE) +#define VPN_2_vA(_vpn) ((_vpn) * PAGE_SIZE) + +/* + * The following convenience macro can be used in a following situation + * + * send(..., &foo, sizeof(foo)) --> send(..., PTR_N_SIZE(foo)) + */ + +#define PTR_N_SIZE(_var) &(_var), sizeof(_var) + + +/* + * + * BIT-PULLING macros + * + */ +#define MVP_BIT(val,n) ( ((val)>>(n))&1) +#define MVP_BITS(val,m,n) (((val)<<(31-(n))) >> ((31-(n))+(m)) ) +#define MVP_EXTRACT_FIELD(w, m, n) MVP_BITS((w), (m), ((m) + (n) - 1)) +#define MVP_MASK(m, n) (MVP_EXTRACT_FIELD(~(uint32)0U, (m), (n)) << (m)) +#define MVP_UPDATE_FIELD(old_val, field_val, m, n) \ + (((old_val) & ~MVP_MASK((m), 
(n))) | (MVP_EXTRACT_FIELD((field_val), 0, (n)) << (m))) + +/* + * + * 64BIT-PULLING macros + * + */ +#define MVP_BITS64(val,m,n) (((val)<<(63-(n))) >> ((63-(n))+(m)) ) +#define MVP_EXTRACT_FIELD64(w, m, n) MVP_BITS64((w), (m), ((m) + (n) - 1)) +#define MVP_MASK64(m, n) (MVP_EXTRACT_FIELD64(~(uint64)0ULL, (m), (n)) << (m)) +#define MVP_UPDATE_FIELD64(old_val, field_val, m, n) \ + (((old_val) & ~MVP_MASK64((m), (n))) | (MVP_EXTRACT_FIELD64(((uint64)(field_val)), 0ULL, (n)) << (m))) + +/* + * + * BIT-CHANGING macros + * + */ +#define MVP_SETBIT(val,n) ((val)|=(1<<(n))) +#define MVP_CLRBIT(val,n) ((val)&=(~(1<<(n)))) + +/* + * Fixed bit-width sign extension. + */ +#define MVP_SIGN_EXTEND(val,width) \ + (((val) ^ (1 << ((width) - 1))) - (1 << ((width) - 1))) + + +/* + * Assembler helpers. + */ +#define _MVP_HASH # +#define MVP_HASH() _MVP_HASH + +#define _MVP_STRINGIFY(...) #__VA_ARGS__ +#define MVP_STRINGIFY(...) _MVP_STRINGIFY(__VA_ARGS__) + +#ifndef __ASSEMBLER__ + +#include +#include + +/* + * Constant equivalents of build-flags. + * + * Test these when possible instead of using #ifdef so that your code + * gets parsed. + */ +#ifdef MVP_DEBUG +static const _Bool mvpDebug = true; +#else +static const _Bool mvpDebug = false; +#endif + +#ifdef MVP_STATS +static const _Bool mvpStats = true; +#else +static const _Bool mvpStats = false; +#endif + +#ifdef MVP_DEVEL +static const _Bool mvpDevel = true; +#else +static const _Bool mvpDevel = false; +#endif + +#endif + +#endif diff --git a/arch/arm/mvp/mvpkm/ve_defs.h b/arch/arm/mvp/mvpkm/ve_defs.h new file mode 100644 index 0000000..bd1d975 --- /dev/null +++ b/arch/arm/mvp/mvpkm/ve_defs.h @@ -0,0 +1,72 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Virtualization extension definitions. + * + * See ARM PRD03-GENC-008353 11.0. + */ +#ifndef _VE_DEFS_H_ +#define _VE_DEFS_H_ + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define ARM_VE_HSR_EC_BIT_POS 26 +#define ARM_VE_HSR_EC_LENGTH 6 + +#define ARM_VE_HSR_EC_UNKNOWN 0x00 +#define ARM_VE_HSR_EC_WFI_WFE 0x01 +#define ARM_VE_HSR_EC_MCR_MRC_CP15 0x03 +#define ARM_VE_HSR_EC_MCRR_MRRC_CP15 0x04 +#define ARM_VE_HSR_EC_MCR_MRC_CP14 0x05 +#define ARM_VE_HSR_EC_LDC_STC_CP14 0x06 +#define ARM_VE_HSR_EC_HCPTR 0x07 +#define ARM_VE_HSR_EC_MRC_CP10 0x08 +#define ARM_VE_HSR_EC_JAZELLE 0x09 +#define ARM_VE_HSR_EC_BXJ 0x0a +#define ARM_VE_HSR_EC_MRRC_CP14 0x0c +#define ARM_VE_HSR_EC_SVC_HYP 0x11 +#define ARM_VE_HSR_EC_HVC 0x12 +#define ARM_VE_HSR_EC_SMC 0x13 +#define ARM_VE_HSR_EC_IABORT_SND 0x20 +#define ARM_VE_HSR_EC_IABORT_HYP 0x21 +#define ARM_VE_HSR_EC_DABORT_SND 0x24 +#define ARM_VE_HSR_EC_DABORT_HYP 0x25 + +#define ARM_VE_HSR_FS_BIT_POS 0 +#define ARM_VE_HSR_FS_LENGTH 6 + +#define ARM_VE_HSR_FS_TRANS_L1 0x5 +#define ARM_VE_HSR_FS_TRANS_L2 0x6 +#define ARM_VE_HSR_FS_TRANS_L3 0x7 + +#define 
ARM_VE_HSR_FS_PERM_L1 0xd +#define ARM_VE_HSR_FS_PERM_L2 0xe +#define ARM_VE_HSR_FS_PERM_L3 0xf + +#endif /// ifndef _VE_DEFS_H_ diff --git a/arch/arm/mvp/mvpkm/vfp_switch.S b/arch/arm/mvp/mvpkm/vfp_switch.S new file mode 100644 index 0000000..49d3987 --- /dev/null +++ b/arch/arm/mvp/mvpkm/vfp_switch.S @@ -0,0 +1,216 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +#include "arm_defs.h" +#include "platdefx.h" +#include "arm_as_macros.h" + +/** + * @file + * + * @brief Save and Load VFP entire VFP state. + */ + + .text + +/** + * @brief Save VFP context + * @param R0 = save area pointer: + * .long fpexc,fpscr,fpinst,fpinst2,cpacr,fpexc' + * .double d0..d15 + * .double d16..d31 + * Note: VFP is left in an enable state regardless of initial state. + */ + .align 4 + .global SaveVFP +SaveVFP: + /* + * Save registers. GCC does not expect us to preserve R0..R3,R12,LR. + */ + stmdb sp!, {r4-r6} + + /* + * Save Coproc Access Control register. + */ + mrc_p15 COPROC_ACCESS_CONTROL, r5 + + /* + * If CP10/11 are disabled, enable them so we can save VFP state. + * The host (or guest) may have left data in the data registers that + * must be preserved. 
+ */ + orr r2, r5, #CPACR_CP10_CP11_PRIV_ONLY + mcr_p15 COPROC_ACCESS_CONTROL, r2 + isb + + /* + * Follow procedure on AppxB-22 ARM DDI0406B to save FPINST[2]. + * Also enable VFP access with FPEXC_EN. + */ + fmrx r1, fpexc @ get existing FPEXC system register + orr r6, r1, #ARM_VFP_SYSTEM_REG_FPEXC_EX|ARM_VFP_SYSTEM_REG_FPEXC_FP2V|ARM_VFP_SYSTEM_REG_FPEXC_EN +#if !defined(MVP_HOST_CODE_forceon) + fmxr fpexc, r6 @ set FPEXC.EX, .FP2V and .EN + fmrx r6, fpexc @ read them back + tst r6, #ARM_VFP_SYSTEM_REG_FPEXC_EX @ see if either one is valid + beq 1000f @ neither, skip it all + fmrx r3, FPINST @ FPINST is valid, save it + tst r6, #ARM_VFP_SYSTEM_REG_FPEXC_FP2V @ see if FPINST2 is valid + beq 1000f + fmrx r4, FPINST2 @ FPINST2 is valid, save it +1000: +#else + mov r6, r1 +#endif + fmrx r2, FPSCR @ always save FPSCR system register + + /* + * At this point: + * R1 = original FPEXC + * R2 = FPSCR + * R3 = FPINST + * R4 = FPINST2 + * R5 = original CPACR + * R6 = FPEXC readback with FPEXC.EX, .FP2V and .EN set + * telling us whether FPINST/2 are valid + */ + stmia r0!, {r1-r6} + + /* + * Save floating point data registers. + */ + vstmia r0!, {d0-d15} @ Save d0 thru d15 + + /** + * @todo We should probably just read MVFR0 once at boot/initialization + * time and store it in some variable, to save having to do what might + * be expensive coprocessor accesses. + */ + fmrx r1, MVFR0 @ Read Media and VFP Feature Register 0 + and r1, r1, #ARM_VFP_SYSTEM_REG_MVFR0_A_SIMD_MASK @ A_SIMD field + cmp r1, #2 @ 32 x 64bit registers? + bne 2000f + vstmia r0!, {d16-d31} +2000: + + /* + * Restore scratch registers and return. + */ + ldmia sp!, {r4-r6} + mov pc, lr + + +/** + * @brief Load VFP context + * @param R0 = load area pointer: + * .long fpexc,fpscr,fpinst,fpinst2,cpacr,fpexc' + * .double d0..d15 + * .double d16..d31 + * @note VFP is assumed to be in an enabled state on entry. + */ + .align 4 + .global LoadVFP +LoadVFP: + /* + * Save registers. 
GCC does not expect us to preserve R0..R3,R12,LR. + */ + stmdb sp!, {r4-r6} + + /* + * Get status register contents: + * R1 = original FPEXC + * R2 = FPSCR + * R3 = FPINST + * R4 = FPINST2 + * R5 = original CPACR + * R6 = FPEXC readback with FPEXC.EX, .FP2V and .EN set + * telling us whether FPINST/2 are valid + */ + ldmia r0!, {r1-r6} + + /* + * Restore some initial FP status registers. + */ + fmxr fpexc, r6 @ with FPEXC.EX, .FP2V and .EN set + fmxr FPSCR, r2 @ always load FPSCR system register + + /* + * Follow procedure on AppxB-22 ARM DDI0406B to load FPINST[2]. + */ +#if !defined(MVP_HOST_CODE_forceon) + fmrx r6, fpexc @ initial call might have different bits + @ ... because FPEXC.EX, .FP2V and .EN + @ are forced set by init code in + @ mvpd.c SetupMonitor() + tst r6, #ARM_VFP_SYSTEM_REG_FPEXC_EX @ see if either one is valid + beq 1000f @ neither, skip it all + fmxr FPINST, r3 @ FPINST is valid, save it + tst r6, #ARM_VFP_SYSTEM_REG_FPEXC_FP2V @ see if FPINST2 is valid + beq 1000f + fmxr FPINST2, r4 @ FPINST2 is valid, save it +1000: +#endif + + /* + * Load floating point data registers. + */ + vldmia r0!, {d0-d15} + + /** + * @todo We should probably just read MVFR0 once at boot/initialization + * time and store it in some variable, to save having to do what might + * be expensive coprocessor accesses. + */ + fmrx r3, MVFR0 @ Read Media and VFP Feature Register 0 + and r3, r3, #ARM_VFP_SYSTEM_REG_MVFR0_A_SIMD_MASK @ A_SIMD field + cmp r3, #2 @ 32 x 64bit registers? + bne 2000f + vldmia r0!, {d16-d31} +2000: + + /* + * Now that VFP registers are all loaded, we put the restored values + * back in the registers, possibly disabling the VFP. + */ + fmxr fpexc, r1 @ with original FPEXC.EX, FPEXC.FP2V + @ and FPEXC.EN values + + /* + * Load Coproc Access Control CP10/CP11 enable bits, possibly disabling + * VFP access. 
+ */ + mrc_p15 COPROC_ACCESS_CONTROL, r0 + bic r0, r0, #CPACR_CP10_CP11_MASK + and r5, r5, #CPACR_CP10_CP11_MASK + orr r0, r0, r5 + mcr_p15 COPROC_ACCESS_CONTROL, r0 + isb + + /* + * Restore scratch registers and return. + */ + ldmia sp!, {r4-r6} + mov pc, lr + + .align 4 + .global GetFPEXC +GetFPEXC: + fmrx r0, fpexc @ get existing FPEXC system register + mov pc, lr diff --git a/arch/arm/mvp/mvpkm/vmid.h b/arch/arm/mvp/mvpkm/vmid.h new file mode 100644 index 0000000..dd89965 --- /dev/null +++ b/arch/arm/mvp/mvpkm/vmid.h @@ -0,0 +1,44 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +#ifndef _VMID_H +#define _VMID_H + +/** + * @file + * + * @brief The vmid definition + */ + + + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_HOSTUSER +#define INCLUDE_ALLOW_GUESTUSER +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define VMID_UNDEF (uint16)0xffff +typedef uint16 VmId; + +#endif diff --git a/arch/arm/mvp/mvpkm/worldswitch.h b/arch/arm/mvp/mvpkm/worldswitch.h new file mode 100644 index 0000000..785f2cd --- /dev/null +++ b/arch/arm/mvp/mvpkm/worldswitch.h @@ -0,0 +1,381 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Definition of the world switch page + * + * Two pages are maintained to facilitate switching from the vmx to + * the monitor - a data and code page. The data page contains: + * - the necessary information about itself (its MPN, KVA, ...) 
+ * - the saved register file of the other world (including some cp15 regs) + * - some information about the monitor's address space (the monVA member) + * that needed right after the w.s before any communication channels + * could have been established + * - a world switch related L2 table of the monitor -- this could be + * elsewhere. + * + * The code page contains: + * - the actual switching code that saves/restores the registers + * + * The world switch data page is mapped into the user, kernel, and the monitor + * address spaces. In case of the user and monitor spaces the global variable + * wsp points to the world switch page (in the vmx and the monitor + * respectively). The kernel address of the world switch page is saved on + * the page itself: wspHKVA. + * + * The kernel virtual address for both code and data pages is mapped into + * the monitor's space temporarily at the time of the actual switch. This is + * needed to provide a stable code and data page while the L1 page table + * base is changing. As the monitor does not need the world switch data page + * at its KVA for its internal operation, that map is severed right after the + * switching to the monitor and re-established before switching back. + */ +#ifndef _WORLDSWITCH_H +#define _WORLDSWITCH_H + +#define INCLUDE_ALLOW_MVPD +#define INCLUDE_ALLOW_VMX +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/** + * @brief Area for saving the monitor/kernel register files. + * + * The order of the registers in this structure was designed to + * facilitate the organization of the switching code. For example + * all Supervisor Mode registers are grouped together allowing the + * @code + * switch to svc, + * stm old svc regs + * ldm new svc regs + * @endcode + * code to work using a single base register for both the store and + * load area. 
+ */ +#define MAX_REGISTER_SAVE_SIZE 464 + +#ifndef __ASSEMBLER__ +typedef struct { + uint32 kSPSR_svc; + uint32 kr13_svc; + uint32 kr14_svc; + uint32 mSPSR_svc; + uint32 mR13_svc; + uint32 mR14_svc; + + uint32 kSPSR_abt; + uint32 kr13_abt; + uint32 kr14_abt; + uint32 mSPSR_abt; + uint32 mR13_abt; + uint32 mR14_abt; + + uint32 kSPSR_und; + uint32 kr13_und; + uint32 kr14_und; + uint32 mSPSR_und; + uint32 mR13_und; + uint32 mR14_und; + + uint32 kSPSR_irq; + uint32 kr13_irq; + uint32 kr14_irq; + uint32 mSPSR_irq; + uint32 mR13_irq; + uint32 mR14_irq; + + uint32 kSPSR_fiq; + uint32 kr8_fiq; + uint32 kr9_fiq; + uint32 kr10_fiq; + uint32 kr11_fiq; + uint32 kr12_fiq; + uint32 kr13_fiq; + uint32 kr14_fiq; + uint32 mSPSR_fiq; + uint32 mR8_fiq; + uint32 mR9_fiq; + uint32 mR10_fiq; + uint32 mR11_fiq; + uint32 mR12_fiq; + uint32 mR13_fiq; + uint32 mR14_fiq; +} BankedRegisterSave; + +/** + * @brief Registers for monitor execution context. + */ +typedef struct { + uint32 mCPSR; + uint32 mR1; + uint32 mR4; + uint32 mR5; + uint32 mR6; + uint32 mR7; + uint32 mR8; + uint32 mR9; + uint32 mR10; + uint32 mR11; + uint32 mSP; + uint32 mLR; // =mPC +} MonitorRegisterSave; + +/** + * @brief LPV monitor register save/restore. + */ +typedef struct { + uint32 kR2; // =kCPSR + uint32 kR4; + uint32 kR5; + uint32 kR6; + uint32 kR7; + uint32 kR8; + uint32 kR9; + uint32 kR10; + uint32 kR11; + uint32 kR13; + uint32 kR14; // =kPC + + BankedRegisterSave bankedRegs; + + uint32 kCtrlReg; + uint32 kTTBR0; + uint32 kDACR; + uint32 kASID; + uint32 kTIDUserRW; + uint32 kTIDUserRO; + uint32 kTIDPrivRW; + uint32 kCSSELR; + uint32 kPMNCIntEn; + uint32 kPMNCCCCNT; + uint32 kPMNCOvFlag; + uint32 kOpEnabled; + uint32 mCtrlReg; + uint32 mTTBR0; + uint32 mASID; + uint32 mTIDUserRW; + uint32 mTIDUserRO; + uint32 mTIDPrivRW; + uint32 mCSSELR; + + MonitorRegisterSave monRegs; +} RegisterSaveLPV; + +/** + * @brief VE monitor register save/restore. 
+ */ +typedef struct { + uint32 mHTTBR; + + uint32 kR3; + uint32 kR4; + uint32 kR5; + uint32 kR6; + uint32 kR7; + uint32 kR8; + uint32 kR9; + uint32 kR10; + uint32 kR11; + uint32 kR12; + uint32 kCPSR; + uint32 kRet; + + BankedRegisterSave bankedRegs; + + uint32 kCSSELR; + uint32 kCtrlReg; + uint32 kTTBR0[2]; + uint32 kTTBR1[2]; + uint32 kTTBRC; + uint32 kDACR; + uint32 kDFSR; + uint32 kIFSR; + uint32 kAuxDFSR; + uint32 kAuxIFSR; + uint32 kDFAR; + uint32 kIFAR; + uint32 kPAR[2]; + uint32 kPRRR; + uint32 kNMRR; + uint32 kASID; + uint32 kTIDUserRW; + uint32 kTIDUserRO; + uint32 kTIDPrivRW; + uint32 mCSSELR; + uint32 mCtrlReg; + uint32 mTTBR0[2]; + uint32 mTTBR1[2]; + uint32 mTTBRC; + uint32 mDACR; + uint32 mDFSR; + uint32 mIFSR; + uint32 mAuxDFSR; + uint32 mAuxIFSR; + uint32 mDFAR; + uint32 mIFAR; + uint32 mPAR[2]; + uint32 mPRRR; + uint32 mNMRR; + uint32 mASID; + uint32 mTIDUserRW; + uint32 mTIDUserRO; + uint32 mTIDPrivRW; + + uint32 mHCR; + uint32 mHDCR; + uint32 mHCPTR; + uint32 mHSTR; + uint32 mVTTBR[2]; + uint32 mVTCR; + + MonitorRegisterSave monRegs; +} RegisterSaveVE; + +typedef union { + unsigned char reserve_space[MAX_REGISTER_SAVE_SIZE]; + RegisterSaveLPV lpv; + RegisterSaveVE ve; +} RegisterSave; + +MY_ASSERTS(REGSAVE, + ASSERT_ON_COMPILE(sizeof(RegisterSave) == MAX_REGISTER_SAVE_SIZE); +) + +/** + * @brief Area for saving the monitor/kernel VFP state. + */ +typedef struct VFPSave { + uint32 fpexc, fpscr, fpinst, fpinst2, cpacr, fpexc_; + + uint64 fpregs[32]; // Hardware requires that this must be 8-byte (64-bit) + // aligned, however the SaveVFP/LoadVFP code does not + // align its pointer before accessing so we don't have + // an 'aligned(8)' attribute here. However, the + // alignment is checked via asserts in SetupMonitor() + // where it initializes the contents. + + // So if the preceding uint32's are changed and fpregs[] + // is no longer 8-byte aligned, the assert will fire. 
+ // Then the uint32's will have to be fixed AND THE CODE + // in SaveVFP/LoadVFP will have to be CHANGED EQUALLY to + // compensate, as simply padding the uint32's (or + // sticking an aligned(8) attribute here) will leave the + // this structure mismatched with the code. + +} VFPSave __attribute__((aligned(8))); + // Keep the aligned(8) attribute here though so the + // VFPSave structures begin on an 8-byte boundary. + +typedef struct WorldSwitchPage WorldSwitchPage; +typedef void (SwitchToMonitor)(RegisterSave *regSave); +typedef void (SwitchToUser)(RegisterSave *regSaveEnd); + +#include "atomic.h" +#include "monva_common.h" +#include "mksck_shared.h" + +struct WorldSwitchPage { + uint32 mvpkmVersion; ///< The version number of mvpkm + + HKVA wspHKVA; ///< host kernel virtual address of this page + ARM_L1D wspKVAL1D; ///< The l1D entry at the above location + + SwitchToMonitor*switchToMonitor;///< entrypoint of the switching function + SwitchToUser *switchToUser; ///< ditto + + MonVA monVA; ///< monitor virtual address space description + union { + ARM_L2D monAttribL2D; ///< {S,TEX,CB} attributes for monitor mappings (LPV) + ARM_MemAttrNormal memAttr; ///< Normal memory attributes for monitor (VE) + }; + + MonitorType monType; ///< the type of the monitor. Used by mvpkm + _Bool allowInts; ///< true: monitor runs with ints enabled as much as possible (normal) + ///< false: monitor runs with ints blocked as much as possible (debug) + + struct { + uint64 switchedAt64; ///< approx time CP15 TSC was set to... 
+ uint32 switchedAtTSC; ///< CP15 TSC value on entry from monitor + uint32 tscToRate64Mult; ///< multiplier to convert TSC_READ()s to our RATE64s + uint32 tscToRate64Shift; ///< shift to convert TSC_READ()s to our RATE64s + }; + + struct { + AtmUInt32 hostActions; ///< actions for monitor on instruction boundary + Mksck_VmId guestId; ///< vmId of the monitor page + }; + + struct { ///< Mksck attributes needed by Mksck_WspRelease() + uint32 critSecCount; ///< if >0 the monitor is in critical section + ///< and expects to regain control + _Bool isPageMapped[MKSCK_MAX_SHARES]; ///< host mksckPages known to the monitor + _Bool guestPageMapped;///< the guest Mksck page has been mapped in MVA space + uint32 isOpened; ///< bitfield indicating which mkscks + ///< are open on the guest's mksckPage. + /* Note that isOpened is per VM not per VCPU. Also note + * that this and other bitfields in the MksckPage structure + * limit the number of sockets to 32. + */ + }; + +#define WSP_PARAMS_SIZE 512 + uint8 params_[WSP_PARAMS_SIZE]; ///< opaque worldswitch call parameters + + RegisterSave regSave; ///< Save area for the worldswitch code below + VFPSave hostVFP; ///< Save areas for monitor/kernel VFP state + VFPSave monVFP; + +__attribute__((aligned(ARM_L2PT_COARSE_SIZE))) + ARM_L2D wspDoubleMap[ARM_L2PT_COARSE_ENTRIES]; ///< maps worldswitch page at its HKVA + uint8 secondHalfPadding[ARM_L2PT_COARSE_SIZE]; +}; + +/* + * These asserts duplicate the assert at the beginning of SetL1L2esc. 
+ */ +MY_ASSERTS(WSP, + ASSERT_ON_COMPILE(offsetof(struct WorldSwitchPage, wspDoubleMap) % + ARM_L2PT_COARSE_SIZE == 0); +) + +extern void SaveVFP(VFPSave *); +extern void LoadVFP(VFPSave *); + +#define SWITCH_VFP_TO_MONITOR \ + do { \ + SaveVFP(&wsp->hostVFP); \ + LoadVFP(&wsp->monVFP); \ + } while(0) + +#define SWITCH_VFP_TO_HOST \ + do { \ + SaveVFP(&wsp->monVFP); \ + LoadVFP(&wsp->hostVFP); \ + } while(0) + +#endif /// __ASSEMBLER__ + +#define OFFSETOF_KR3_REGSAVE_VE_WSP 616 + +#endif /// _WORLDSWITCH_H diff --git a/arch/arm/mvp/mvpkm/wscalls.h b/arch/arm/mvp/mvpkm/wscalls.h new file mode 100644 index 0000000..4864f21 --- /dev/null +++ b/arch/arm/mvp/mvpkm/wscalls.h @@ -0,0 +1,165 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP Hypervisor Support + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +/** + * @file + * + * @brief Worldswitch call parameters + */ + +#ifndef _WSCALLS_H +#define _WSCALLS_H + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#define WSCALL_ACQUIRE_PAGE 1 +#define WSCALL_FLUSH_ALL_DCACHES 2 +#define WSCALL_IRQ 3 +#define WSCALL_ABORT 4 +#define WSCALL_LOG 5 +#define WSCALL_WAIT 6 +#define WSCALL_MUTEXLOCK 7 +#define WSCALL_MUTEXUNLOCK 8 +#define WSCALL_MUTEXUNLSLEEP 9 +#define WSCALL_MUTEXUNLWAKE 10 +#define WSCALL_GET_PAGE_FROM_VMID 11 +#define WSCALL_REMOVE_PAGE_FROM_VMID 12 +#define WSCALL_RELEASE_PAGE 13 +#define WSCALL_READTOD 14 +#define WSCALL_QP_GUEST_ATTACH 15 +#define WSCALL_MONITOR_TIMER 16 +#define WSCALL_COMM_SIGNAL 17 +#define WSCALL_QP_NOTIFY 18 +/* + * MVPKM V0.5.2.0 supports all the calls above. If new API calls are + * introduced then make sure that the calling function (probably in + * mkhost.c) checks the mvpkm's version stored in wsp->mvpkmVersion + * and invokes the wscall only when it is supported. + */ + +#define WSCALL_MAX_CALLNO 20 + +#define WSCALL_LOG_MAX 256 + +#define WSCALL_MAX_MPNS 16 + +#include "exitstatus.h" +#include "mutex.h" +#include "mksck_shared.h" +#include "qp.h" +#include "comm_transp.h" +#include "comm_transp_impl.h" + +typedef struct WSParams { + uint32 callno; + union { + /** + * @brief Used for both WSCALL_ACQUIRE_PAGE and WSCALL_RELEASE_PAGE. + */ + struct { + uint16 pages; ///< IN Number of pages + uint16 order; /**< IN Size of each page - + 2^(12+order) sized and aligned + in machine space. + (WSCALL_ACQUIRE_PAGE only) */ + PhysMem_RegionType forRegion; /**< IN Region identifier for pages + (WSCALL_ACQUIRE_PAGE only) */ + MPN mpns[WSCALL_MAX_MPNS]; /**< OUT (on WSCALL_ACQUIRE_PAGE) + IN (on WSCALL_RELEASE_PAGE) + Vector of page base MPNs. */ + } pages; + + union { + MPN mpn; ///< IN MPN to query refcount. + _Bool referenced; ///< OUT Do host page tables contain the MPN? 
+ } refCount; + + struct { + ExitStatus status; ///< IN the final status of the monitor + } abort; + + struct { + int level; + char messg[WSCALL_LOG_MAX]; + } log; + + struct { + HKVA mtxHKVA; ///< IN mutex's host kernel virt addr + MutexMode mode; ///< IN shared or exclusive + uint32 cvi; ///< IN condition variable index + _Bool all; ///< IN wake all waiting threads? + _Bool ok; ///< OUT Mutex_Lock completed + } mutex; + + struct { + Mksck_VmId vmId; ///< IN translate and lock this vmID + _Bool found; /**< OUT true if the lookup was successful, + page is found, and refc incremented */ + MPN mpn[MKSCKPAGE_TOTAL]; ///< OUT array of MPNs of the requested vmId + } pageMgmnt; + + struct { + unsigned int now; ///< OUT current time-of-day seconds + unsigned int nowusec; ///< OUT current time-of-day microseconds + } tod; + + struct { + QPId id; ///< IN/OUT shared memory id + uint32 capacity; ///< IN size of shared region requested + uint32 type; ///< IN type of queue pair + uint32 base; ///< IN base MPN of PA vector page + uint32 nrPages; ///< IN number of pages to map + int32 rc; ///< OUT return code + } qp; + + struct { + CommTranspID transpID; + CommTranspIOEvent event; + } commEvent; + + struct { + uint64 when64; ///< IN timer request + } timer; + + struct { + _Bool suspendMode; ///< Is the guest in suspend mode? 
+ } wait; + + }; ///< anonymous union +} WSParams; + + +/** + * @brief Cast the opaque param_ member of the wsp to WSParams type + * @param wsp_ the world switch page structure pointer + * @return the cast pointer + */ +static inline WSParams* UNUSED +WSP_Params(WorldSwitchPage *wsp_) { + return (WSParams*)(wsp_->params_); +} + +MY_ASSERTS(WSParFn, + ASSERT_ON_COMPILE(sizeof(WSParams) <= WSP_PARAMS_SIZE); +) +#endif diff --git a/arch/arm/mvp/pvtcpkm/COPYING b/arch/arm/mvp/pvtcpkm/COPYING new file mode 100644 index 0000000..10828e0 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/COPYING @@ -0,0 +1,341 @@ + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/arch/arm/mvp/pvtcpkm/Kbuild b/arch/arm/mvp/pvtcpkm/Kbuild new file mode 100644 index 0000000..d2ec844 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/Kbuild @@ -0,0 +1,9 @@ +# Warning: autogenerated +obj-m := pvtcpkm.o +pvtcpkm-objs := check_kconfig.o pvtcp_off_io_linux.o pvtcp_off_linux.o comm_os_linux.o comm_os_mod_linux.o pvtcp.o pvtcp_off.o pvtcp_off_linux_shim.o + +ccflags-y += -fno-pic -fno-dwarf2-cfi-asm -march=armv7-a -D__linux__ +ccflags-y += -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast -DPVTCP_BUILDING_SERVER +ccflags-y += -mfpu=neon -DIN_MODULE -DGPLED_CODE +ccflags-y += --std=gnu89 -O2 -g2 -ggdb -mapcs -fno-optimize-sibling-calls -mno-sched-prolog +ccflags-$(CONFIG_VMWARE_MVP_DEBUG) += -DMVP_DEBUG diff --git a/arch/arm/mvp/pvtcpkm/Makefile b/arch/arm/mvp/pvtcpkm/Makefile new file mode 100644 index 0000000..16eb389 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/Makefile @@ -0,0 +1 @@ +# Warning: autogenerated diff --git a/arch/arm/mvp/pvtcpkm/check_kconfig.c b/arch/arm/mvp/pvtcpkm/check_kconfig.c new file mode 100644 index 0000000..6fc27a1 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/check_kconfig.c @@ -0,0 +1,91 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ +#line 5 + +/** + * @file + * @brief Check for required kernel configuration + * + * Check to make sure that the kernel options that the MVP hypervisor requires + * have been enabled in the kernel that this kernel module is being built + * against. + */ +#include + +/* + * Minimum kernel version + * - network namespace support is only really functional starting in 2.6.29 + * - Android Gingerbread requires 2.6.35 + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) +#error "MVP requires a host kernel newer than 2.6.35" +#endif + +/* module loading ability */ +#ifndef CONFIG_MODULES +#error "MVP requires kernel loadable module support be enabled (CONFIG_MODULES)" +#endif +#ifndef CONFIG_MODULE_UNLOAD +#error "MVP requires kernel module unload support be enabled (CONFIG_MODULE_UNLOAD)" +#endif + +/* sysfs */ +#ifndef CONFIG_SYSFS +#error "MVP requires sysfs support (CONFIG_SYSFS)" +#endif + +/* network traffic isolation */ +#ifndef CONFIG_NAMESPACES +#error "MVP networking support requires namespace support (CONFIG_NAMESPACES)" +#endif +#ifndef CONFIG_NET_NS +#error "MVP networking support requires Network Namespace support to be enabled (CONFIG_NET_NS)" +#endif + +/* TCP/IP networking */ +#ifndef CONFIG_INET +#error "MVP networking requires IPv4 support (CONFIG_INET)" +#endif +#ifndef CONFIG_IPV6 +#error "MVP networking requires IPv6 support (CONFIG_IPV6)" +#endif + +/* VPN support */ +#if !defined(CONFIG_TUN) && !defined(CONFIG_TUN_MODULE) +#error "MVP VPN support requires TUN device support (CONFIG_TUN)" +#endif + +#if !defined(CONFIG_NETFILTER) && !defined(PVTCP_DISABLE_NETFILTER) +#error "MVP networking support requires netfilter support (CONFIG_NETFILTER)" +#endif + +/* Force /proc/config.gz support for eng/userdebug builds */ +#ifdef MVP_DEBUG +#if !defined(CONFIG_IKCONFIG) || !defined(CONFIG_IKCONFIG_PROC) +#error "MVP kernel /proc/config.gz support required for debuggability (CONFIG_IKCONFIG_PROC)" +#endif +#endif + +/* Sanity check we're only dealing 
with the memory hotplug + migrate and/or + * compaction combo */ +#ifdef CONFIG_MIGRATION +#if defined(CONFIG_NUMA) || defined(CONFIG_CPUSETS) || defined(CONFIG_MEMORY_FAILURE) +#error "MVP not tested with migration features other than CONFIG_MEMORY_HOTPLUG and CONFIG_COMPACTION" +#endif +#endif diff --git a/arch/arm/mvp/pvtcpkm/comm.h b/arch/arm/mvp/pvtcpkm/comm.h new file mode 100644 index 0000000..877731d --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/comm.h @@ -0,0 +1,171 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Communication functions based on queue pair transport APIs. + * + * Comm is a shared memory-based mechanism that facilitates the implementation + * of kernel components that require host-to-guest, or guest-to-guest + * communication. + * This facility assumes the availability of a minimal shared memory queue pair + * implementation, such as MVP queue pairs or VMCI queue pairs. The latter must + * provide primitives for queue pair creation and destruction, and reading and + * writing from/to queue pairs. 
+ * Comm assumes that the queue pair (transport) layer is not concerned with + * multi-threading, locking or flow control, and does not require such features. + */ + +#ifndef _COMM_H_ +#define _COMM_H_ + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +#include "comm_os.h" +#include "comm_transp.h" + + +/* Default/maximum Comm timeouts (in milliseconds). */ +#define COMM_MAX_TO 60000ULL +#define COMM_MAX_TO_UNINT (COMM_MAX_TO + 1) + +#define COMM_OPF_SET_ERR(flags) ((flags) |= 128) +#define COMM_OPF_CLEAR_ERR(flags) ((flags) &= 127) +#define COMM_OPF_TEST_ERR(flags) ((flags) & 128) + +#define COMM_OPF_SET_VAL(flags, val) ((flags) |= ((val) & 127)) +#define COMM_OPF_GET_VAL(flags) ((flags) & 127) + +/** + * Packet (header) structure. + * NB: Do not change this structure, especially the first three fields; there + * will be consequences. It may be extended, but it's not recommended: all + * operations carry this header, so it's better kept in its minimal form. + */ + +typedef struct CommPacket { + unsigned int len; // Total length + unsigned char flags; // Operation flags + unsigned char opCode; // Operation to call + unsigned short data16; // Auxiliary data + unsigned long long data64; + unsigned long long data64ex; + union { + struct { + unsigned int data32; + unsigned int data32ex; + }; + unsigned long long data64ex2; + }; +} CommPacket; + + +/* Opaque structure representing a communication channel. */ + +struct CommChannelPriv; +typedef struct CommChannelPriv *CommChannel; + + +/* Input operations associated with a comm channel. */ + +typedef void (*CommOperationFunc)(CommChannel channel, + void *state, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen); + + +/* Helper macros */ + +#define COMM_DEFINE_OP(funcName) \ +void \ +funcName(CommChannel channel, \ + void *state, \ + CommPacket *packet, \ + struct kvec *vec, \ + unsigned int vecLen) + + +/* Comm-based implementations. 
*/ + +typedef struct CommImpl { + struct module *owner; + int (*checkArgs)(CommTranspInitArgs *transpArgs); + void *(*stateCtor)(CommChannel channel); + void (*stateDtor)(void *state); + void *(*dataAlloc)(unsigned int dataLen); + void (*dataFree)(void *data); + const CommOperationFunc *operations; + void (*closeNtf)(void *closeNtfData, + const CommTranspInitArgs *transpArgs, + int inBH); + void *closeNtfData; + void (*activateNtf)(void *activateNtfData, + CommChannel channel); + void *activateNtfData; + unsigned long long openAtMillis; + unsigned long long openTimeoutAtMillis; + CommTranspID ntfCenterID; +} CommImpl; + + +int Comm_Init(unsigned int maxChannels); +int Comm_Finish(unsigned long long *timeoutMillis); +int Comm_RegisterImpl(const CommImpl *impl); +void Comm_UnregisterImpl(const CommImpl *impl); +int Comm_IsActive(CommChannel channel); +CommTranspInitArgs Comm_GetTranspInitArgs(CommChannel channel); +void *Comm_GetState(CommChannel channel); +int Comm_Dispatch(CommChannel channel); +unsigned int Comm_DispatchAll(void); +void Comm_Put(CommChannel channel); +void Comm_DispatchUnlock(CommChannel channel); +int Comm_Lock(CommChannel channel); +void Comm_Unlock(CommChannel channel); +int Comm_Zombify(CommChannel channel, int inBH); + +int +Comm_Alloc(const CommTranspInitArgs *transpArgs, + const CommImpl *impl, + int inBH, + CommChannel *newChannel); + + +int +Comm_Write(CommChannel channel, + const CommPacket *packet, + unsigned long long *timeoutMillis); + +int +Comm_WriteVec(CommChannel channel, + const CommPacket *packet, + struct kvec **vec, + unsigned int *vecLen, + unsigned long long *timeoutMillis, + unsigned int *iovOffset); + +unsigned int Comm_RequestInlineEvents(CommChannel channel); +unsigned int Comm_ReleaseInlineEvents(CommChannel channel); + +#endif // _COMM_H_ diff --git a/arch/arm/mvp/pvtcpkm/comm_os.h b/arch/arm/mvp/pvtcpkm/comm_os.h new file mode 100644 index 0000000..91305f1 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/comm_os.h @@ -0,0 
+1,150 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Cross-platform base type definitions and function declarations. + * Includes OS-specific base type definitions and function declarations. + */ + +#ifndef _COMM_OS_H_ +#define _COMM_OS_H_ + +/* For-ever timeout constant (in milliseconds). */ +#define COMM_OS_4EVER_TO ((unsigned long long)(~0UL >> 1)) + +/* Condition function prototype. Returns 1: true, 0: false, < 0: error code. */ +typedef int (*CommOSWaitConditionFunc)(void *arg1, void *arg2); + +/* Dispatch function prototype. Called by input (dispatch) kernel threads. */ +typedef unsigned int (*CommOSDispatchFunc)(void); + +/* Module initialization and exit callback functions. */ +extern int (*commOSModInit)(void *args); +extern void (*commOSModExit)(void); + +/* Macro to assign Init and Exit callbacks. */ +#define COMM_OS_MOD_INIT(init, exit) \ + int (*commOSModInit)(void *args) = init; \ + void (*commOSModExit)(void) = exit + + +/* + * OS-specific implementations must provide the following: + * 1. 
Types: + * CommOSAtomic + * CommOSSpinlock + * CommOSMutex + * CommOSWaitQueue + * CommOSWork + * CommOSWorkFunc + * CommOSList + * CommOSModule + * struct kvec + * + * 2. Definition, initializers: + * CommOSSpinlock_Define() + * + * 3. Functions: + * void CommOS_Debug(const char *format, ...); + * void CommOS_Log(const char *format, ...); + * void CommOS_WriteAtomic(CommOSAtomic *atomic, int val); + * int CommOS_ReadAtomic(CommOSAtomic *atomic); + * int CommOS_AddReturnAtomic(CommOSAtomic *atomic, int val); + * int CommOS_SubReturnAtomic(CommOSAtomic *atomic, int val); + * void CommOS_SpinlockInit(CommOSSpinlock *lock); + * void CommOS_SpinLockBH(CommOSSpinlock *lock); + * int CommOS_SpinTrylockBH(CommOSSpinlock *lock); + * void CommOS_SpinUnlockBH(CommOSSpinlock *lock); + * void CommOS_SpinLock(CommOSSpinlock *lock); + * int CommOS_SpinTrylock(CommOSSpinlock *lock); + * void CommOS_SpinUnlock(CommOSSpinlock *lock); + * void CommOS_MutexInit(CommOSMutex *mutex); + * void CommOS_MutexLock(CommOSMutex *mutex); + * int CommOS_MutexLockUninterruptible(CommOSMutex *mutex); + * int CommOS_MutexTrylock(CommOSMutex *mutex); + * void CommOS_MutexUnlock(CommOSMutex *mutex); + * void CommOS_WaitQueueInit(CommOSWaitQueue *wq); + * CommOS_DoWait(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc cond, + * void *condArg1, + * void *condArg2, + * unsigned long long *timeoutMillis, + * int interruptible); + * int CommOS_Wait(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc func, + * void *funcArg1, + * void *funcArg2, + * unsigned long long *timeoutMillis); + * int CommOS_WaitUninterruptible(CommOSWaitQueue *wq, + * CommOSWaitConditionFunc func, + * void *funcArg1, + * void *funcArg2, + * unsigned long long *timeoutMillis); + * void CommOS_WakeUp(CommOSWaitQueue *wq); + * void *CommOS_KmallocNoSleep(unsigned int size); + * void *CommOS_Kmalloc(unsigned int size); + * void CommOS_Kfree(void *arg); + * void CommOS_Yield(void); + * unsigned long long CommOS_GetCurrentMillis(void); + * 
void CommOS_ListInit(CommOSList *list);
 *      int CommOS_ListEmpty(CommOSList *list);
 *      void CommOS_ListAdd(CommOSList *list, CommOSList *listElem);
 *      void CommOS_ListAddTail(CommOSList *list, CommOSList *listElem);
 *      void CommOS_ListDel(CommOSList *listElem);
 *    Macros:
 *      CommOS_ListForEach(*list, *item, itemListFieldName);
 *      CommOS_ListForEachSafe(*list, *item, *tmp, itemListFieldName);
 *      void CommOS_ListSplice(CommOSList *list, CommOSList *listToAdd);
 *      void CommOS_ListSpliceTail(CommOSList *list, CommOSList *listToAdd);
 *      CommOSModule CommOS_ModuleSelf(void);
 *      int CommOS_ModuleGet(CommOSModule module);
 *      void CommOS_ModulePut(CommOSModule module);
 *      void CommOS_MemBarrier(void);
 *
 * These cannot be defined here: a) non-pointer type definitions need size
 * information, and b) functions may or may not be inlined, or macros may
 * be used instead.
 */


#ifdef __linux__
#include "comm_os_linux.h"
#else
#error "Unsupported OS"
#endif

/* Functions to start and stop the dispatch and aio kernel threads. */
void CommOS_StopIO(void);
void CommOS_ScheduleDisp(void);
void CommOS_InitWork(CommOSWork *work, CommOSWorkFunc func);
int CommOS_ScheduleAIOWork(CommOSWork *work);
void CommOS_FlushAIOWork(CommOSWork *work);

int
CommOS_StartIO(const char *dispatchTaskName,
               CommOSDispatchFunc dispatchHandler,
               unsigned int interval,
               unsigned int maxCycles,
               const char *aioTaskName);


#endif /* _COMM_OS_H_ */
diff --git a/arch/arm/mvp/pvtcpkm/comm_os_linux.c b/arch/arm/mvp/pvtcpkm/comm_os_linux.c
new file mode 100644
index 0000000..61ce929
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_os_linux.c
@@ -0,0 +1,371 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Linux-specific functions/types.
 */

#include "comm_os.h"

/* Hard upper bound for the adaptive-polling budget accepted by CommOS_StartIO(). */
#define DISPATCH_MAX_CYCLES 8192

/* Type definitions */

typedef struct workqueue_struct CommOSWorkQueue;


/* Static data */

static volatile int running;                  /* Non-zero while the I/O system is started. */
static int numCpus;                           /* Number of present CPUs, set in CommOS_StartIO(). */
static CommOSWorkQueue *dispatchWQ;           /* Workqueue running the dispatch work items. */
static CommOSDispatchFunc dispatch;           /* Upper-layer dispatch callback. */
static CommOSWork dispatchWorksNow[NR_CPUS];  /* Per-cpu work items for on-demand dispatch. */
static CommOSWork dispatchWorks[NR_CPUS];     /* Per-cpu work items for periodic dispatch. */
static unsigned int dispatchInterval = 1;     /* Periodic re-arm interval, in jiffies. */
static unsigned int dispatchMaxCycles = 2048; /* No-op cycles before a dispatch run ends. */
static CommOSWorkQueue *aioWQ;                /* Optional AIO workqueue; NULL if not started. */


/**
 * @brief Initializes a workqueue consisting of per-cpu kernel threads.
 * @param name workqueue name
 * @return workqueue handle if successful, NULL otherwise
 */

static inline CommOSWorkQueue *
CreateWorkqueue(const char *name)
{
   return create_workqueue(name);
}


/**
 * @brief Destroys a workqueue and stops its threads.
 * @param[in,out] wq workqueue to destroy.
 */

static inline void
DestroyWorkqueue(CommOSWorkQueue *wq)
{
   destroy_workqueue(wq);
}


/**
 * @brief Force execution of a work item.
 * @param[in,out] work work item to dequeue.
 */

static inline void
FlushDelayedWork(CommOSWork *work)
{
   flush_delayed_work(work);
}


/**
 * @brief Enqueue a work item to a workqueue for execution on a given cpu
 *    and after the specified interval.
 * @param cpu cpu number. If negative, work item is enqueued on current cpu.
 * @param[in,out] wq target work queue.
 * @param[in,out] work work item to enqueue.
 * @param jif delay interval, in jiffies.
 * @return zero if successful, non-zero otherwise.
 */

static inline int
QueueDelayedWorkOn(int cpu,
                   CommOSWorkQueue *wq,
                   CommOSWork *work,
                   unsigned long jif)
{
   /* queue_delayed_work*() returns 0 when the item was already queued. */
   if (cpu < 0) {
      return !queue_delayed_work(wq, work, jif) ? -1 : 0;
   } else {
      return !queue_delayed_work_on(cpu, wq, work, jif) ? -1 : 0;
   }
}


/**
 * @brief Enqueues a work item to a workqueue for execution on the current cpu
 *    and after the specified interval.
 * @param[in,out] wq target work queue.
 * @param[in,out] work work item to enqueue.
 * @param jif delay interval, in jiffies.
 * @return zero if successful, non-zero otherwise.
 */

static inline int
QueueDelayedWork(CommOSWorkQueue *wq,
                 CommOSWork *work,
                 unsigned long jif)
{
   return QueueDelayedWorkOn(-1, wq, work, jif);
}


/**
 * @brief Cancels a queued delayed work item and synchronizes with its
 *    completion.
 * @param[in,out] work work item to cancel
 */

static inline void
WaitForDelayedWork(CommOSWork *work)
{
   cancel_delayed_work_sync(work);
}


/**
 * @brief Discards work items queued to the specified workqueue.
 * @param[in,out] wq work queue to flush.
 */

static inline void
FlushWorkqueue(CommOSWorkQueue *wq)
{
   flush_workqueue(wq);
}


/**
 * @brief Schedules dispatcher threads for immediate execution.
 */

void
CommOS_ScheduleDisp(void)
{
   /* get_cpu() disables preemption while the current cpu's on-demand item
    * is selected; put_cpu() re-enables it before queueing. */
   CommOSWork *work = &dispatchWorksNow[get_cpu()];

   put_cpu();
   if (running) {
      QueueDelayedWork(dispatchWQ, work, 0);
   }
}


/**
 * @brief Default delayed work callback function implementation.
 *    Calls the input function specified at initialization.
 * @param[in,out] work work item.
 */

static void
DispatchWrapper(CommOSWork *work)
{
   unsigned int misses;

   for (misses = 0; running && (misses < dispatchMaxCycles); ) {
      /* We run for at most dispatchMaxCycles worth of channel no-ops. */

      if (!dispatch()) {
         /* No useful work was done, on any of the channels. */

         misses++;
         if ((misses % 32) == 0) {
            /* Yield the cpu every 32 consecutive empty polls. */
            CommOS_Yield();
         }
      } else {
         misses = 0;
      }
   }

   if (running &&
       (work >= &dispatchWorks[0]) &&
       (work <= &dispatchWorks[NR_CPUS - 1])) {
      /*
       * If still running _and_ this was a regular, time-based run (the item
       * lies inside dispatchWorks[], as opposed to the on-demand
       * dispatchWorksNow[] items), then re-arm the timer.
       */

      QueueDelayedWork(dispatchWQ, work, dispatchInterval);
   }
}


/**
 * @brief Initializes work item with specified callback function.
 * @param[in,out] work work item to initialize.
 * @param func callback function to attach to the work item.
 */

void
CommOS_InitWork(CommOSWork *work,
                CommOSWorkFunc func)
{
   /* NOTE(review): the (work_func_t) cast assumes the delayed_work callback
    * receives a pointer compatible with CommOSWork on this kernel
    * generation -- confirm against the target kernel's INIT_DELAYED_WORK. */
   INIT_DELAYED_WORK(work, (work_func_t)func);
}


/**
 * @brief Flush execution of a work item
 * @param[in,out] work work item to dequeue
 */
void
CommOS_FlushAIOWork(CommOSWork *work)
{
   if (aioWQ && work) {
      FlushDelayedWork(work);
   }
}


/**
 * @brief Queue a work item to the AIO workqueue.
 * @param[in,out] work work item to enqueue.
 * @return zero if work enqueued, non-zero otherwise.
 */

int
CommOS_ScheduleAIOWork(CommOSWork *work)
{
   if (running && aioWQ && work) {
      return QueueDelayedWork(aioWQ, work, 0);
   }
   return -1;
}


/**
 * @brief Initializes the base IO system.
 * @param dispatchTaskName dispatch thread(s) name.
 * @param dispatchFunc dispatch function.
 * @param intervalMillis periodic interval in milliseconds to call dispatch.
 *    The floor is 1 jiffy, regardless of how small intervalMillis is
 * @param maxCycles number of cycles to do adaptive polling before scheduling.
 *    The maximum number of cycles is DISPATCH_MAX_CYCLES.
 * @param aioTaskName AIO thread(s) name. If NULL, AIO threads aren't started.
 * @return zero if successful, -1 otherwise.
 * @sideeffects Dispatch threads, and if applicable, AIO threads are started.
 */

int
CommOS_StartIO(const char *dispatchTaskName,    // IN
               CommOSDispatchFunc dispatchFunc, // IN
               unsigned int intervalMillis,     // IN
               unsigned int maxCycles,          // IN
               const char *aioTaskName)         // IN
{
   int rc;
   int cpu;

   if (running) {
      CommOS_Debug(("%s: I/O tasks already running.\n", __FUNCTION__));
      return 0;
   }

   /*
    * OK, let's test the handler against NULL. Though, the whole concept
    * of checking for NULL pointers, outside cases where NULL is meaningful
    * to the implementation, is relatively useless: garbage, random pointers
    * rarely happen to be all-zeros.
    */

   if (!dispatchFunc) {
      CommOS_Log(("%s: a NULL Dispatch handler was passed.\n", __FUNCTION__));
      return -1;
   }
   dispatch = dispatchFunc;

   /* Clamp the polling interval: default 4ms, floor of one jiffy. */
   if (intervalMillis == 0) {
      intervalMillis = 4;
   }
   if ((dispatchInterval = msecs_to_jiffies(intervalMillis)) < 1) {
      dispatchInterval = 1;
   }
   /* Clamp maxCycles to DISPATCH_MAX_CYCLES; zero keeps the current value. */
   if (maxCycles > DISPATCH_MAX_CYCLES) {
      dispatchMaxCycles = DISPATCH_MAX_CYCLES;
   } else if (maxCycles > 0) {
      dispatchMaxCycles = maxCycles;
   }
   CommOS_Debug(("%s: Interval millis %u (jif:%u).\n", __FUNCTION__,
                 intervalMillis, dispatchInterval));
   CommOS_Debug(("%s: Max cycles %u.\n", __FUNCTION__, dispatchMaxCycles));

   numCpus = num_present_cpus();
   dispatchWQ = CreateWorkqueue(dispatchTaskName);
   if (!dispatchWQ) {
      CommOS_Log(("%s: Couldn't create %s task(s).\n", __FUNCTION__,
                  dispatchTaskName));
      return -1;
   }

   if (aioTaskName) {
      aioWQ = CreateWorkqueue(aioTaskName);
      if (!aioWQ) {
         CommOS_Log(("%s: Couldn't create %s task(s).\n", __FUNCTION__,
                     aioTaskName));
         DestroyWorkqueue(dispatchWQ);
         return -1;
      }
   } else {
      aioWQ = NULL;
   }

   running = 1;
   for (cpu = 0; cpu < numCpus; cpu++) {
      CommOS_InitWork(&dispatchWorksNow[cpu], DispatchWrapper);
      CommOS_InitWork(&dispatchWorks[cpu], DispatchWrapper);
      rc = QueueDelayedWorkOn(cpu, dispatchWQ,
                              &dispatchWorks[cpu],
                              dispatchInterval);
      if (rc != 0) {
         /* Roll back everything started so far. */
         CommOS_StopIO();
         return -1;
      }
   }
   CommOS_Log(("%s: Created I/O task(s) successfully.\n", __FUNCTION__));
   return 0;
}


/**
 * @brief Stops the base IO system.
 * @sideeffects Dispatch threads, and if applicable, AIO threads are stopped.
 */

void
CommOS_StopIO(void)
{
   int cpu;

   if (running) {
      running = 0;
      if (aioWQ) {
         FlushWorkqueue(aioWQ);
         DestroyWorkqueue(aioWQ);
         aioWQ = NULL;
      }
      FlushWorkqueue(dispatchWQ);
      for (cpu = 0; cpu < numCpus; cpu++) {
         WaitForDelayedWork(&dispatchWorksNow[cpu]);
         WaitForDelayedWork(&dispatchWorks[cpu]);
      }
      DestroyWorkqueue(dispatchWQ);
      dispatchWQ = NULL;
      CommOS_Log(("%s: I/O tasks stopped.\n", __FUNCTION__));
   }
}
diff --git a/arch/arm/mvp/pvtcpkm/comm_os_linux.h b/arch/arm/mvp/pvtcpkm/comm_os_linux.h
new file mode 100644
index 0000000..81ee9d1
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_os_linux.h
@@ -0,0 +1,699 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Contains linux-specific type definitions and function declarations
 */

#ifndef _COMM_OS_LINUX_H_
#define _COMM_OS_LINUX_H_

/* NOTE(review): the header names of the following #include directives were
 * lost in extraction (angle-bracketed text stripped) -- recover them from
 * the upstream patch; likely <linux/version.h> and similar kernel headers. */
#include
#include

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
#error "Kernel versions lower than 2.6.20 are not supported"
#endif

/* NOTE(review): six more #include targets stripped in extraction -- likely
 * kernel headers (module, sched, wait, spinlock, slab, list); confirm. */
#include
#include
#include
#include
#include
#include


/*
 * Type definitions.
 */

typedef atomic_t CommOSAtomic;
typedef spinlock_t CommOSSpinlock;
typedef struct mutex CommOSMutex;
typedef wait_queue_head_t CommOSWaitQueue;
typedef struct delayed_work CommOSWork;
typedef void (*CommOSWorkFunc)(CommOSWork *work);
typedef struct list_head CommOSList;
typedef struct module *CommOSModule;


/*
 * Initializers.
 */

#define CommOSSpinlock_Define DEFINE_SPINLOCK


#define COMM_OS_DOLOG(...) printk(KERN_INFO __VA_ARGS__)


/**
 * @brief Logs given arguments in debug builds.
 */

#if defined(COMM_OS_DEBUG)
   #define CommOS_Debug(args) COMM_OS_DOLOG args
#else
   #define CommOS_Debug(args)
#endif


/**
 * @brief Logs given arguments.
 */

#define CommOS_Log(args) COMM_OS_DOLOG args


/**
 * @brief Logs function name and location.
 */

#if defined(COMM_OS_TRACE)
#define TRACE(ptr) \
   do { \
      CommOS_Debug(("%p:%s: at [%s:%d] with arg ptr [0x%p].\n", current, \
                    __FUNCTION__, __FILE__, __LINE__, (ptr))); \
   } while (0)
#else
#define TRACE(ptr)
#endif


/**
 * @brief Write atomic variable
 * @param[in,out] atomic variable to write
 * @param val new value
 */

static inline void
CommOS_WriteAtomic(CommOSAtomic *atomic,
                   int val)
{
   atomic_set(atomic, val);
}


/**
 * @brief Reads atomic variable
 * @param atomic variable to read
 * @return value
 */

static inline int
CommOS_ReadAtomic(CommOSAtomic *atomic)
{
   return atomic_read(atomic);
}


/**
 * @brief Atomically add value to atomic variable, return new value.
 * @param[in,out] atomic variable
 * @param val value to add
 * @return new value
 */

static inline int
CommOS_AddReturnAtomic(CommOSAtomic *atomic,
                       int val)
{
   return atomic_add_return(val, atomic);
}


/**
 * @brief Atomically subtract value from atomic variable, return new value.
 * @param[in,out] atomic variable
 * @param val value to subtract
 * @return new value
 */

static inline int
CommOS_SubReturnAtomic(CommOSAtomic *atomic,
                       int val)
{
   return atomic_sub_return(val, atomic);
}


/**
 * @brief Initializes a given lock.
 * @param[in,out] lock lock to initialize
 */

static inline void
CommOS_SpinlockInit(CommOSSpinlock *lock)
{
   spin_lock_init(lock);
}


/**
 * @brief Locks given lock and disables bottom half processing.
 * @param[in,out] lock lock to lock
 */

static inline void
CommOS_SpinLockBH(CommOSSpinlock *lock)
{
   spin_lock_bh(lock);
}


/**
 * @brief Attempts to lock the given lock and disable BH processing.
 * @param[in,out] lock lock to lock
 * @return zero if successful, non-zero otherwise
 */

static inline int
CommOS_SpinTrylockBH(CommOSSpinlock *lock)
{
   /* spin_trylock_bh() returns non-zero on success; inverted to match the
    * zero-on-success convention used throughout this layer. */
   return !spin_trylock_bh(lock);
}


/**
 * @brief Unlocks given lock and re-enables BH processing.
 * @param[in,out] lock lock to unlock
 */

static inline void
CommOS_SpinUnlockBH(CommOSSpinlock *lock)
{
   spin_unlock_bh(lock);
}


/**
 * @brief Locks the given lock.
 * @param[in,out] lock lock to lock
 */

static inline void
CommOS_SpinLock(CommOSSpinlock *lock)
{
   spin_lock(lock);
}


/**
 * @brief Attempts to lock the given lock.
 * @param[in,out] lock lock to try-lock
 * @return zero if successful, non-zero otherwise
 */

static inline int
CommOS_SpinTrylock(CommOSSpinlock *lock)
{
   return !spin_trylock(lock);
}


/**
 * @brief Unlocks given lock.
 * @param[in,out] lock lock to unlock
 */

static inline void
CommOS_SpinUnlock(CommOSSpinlock *lock)
{
   spin_unlock(lock);
}


/**
 * @brief Initializes given mutex.
 * @param[in,out] mutex mutex to initialize
 */

static inline void
CommOS_MutexInit(CommOSMutex *mutex)
{
   mutex_init(mutex);
}


/**
 * @brief Acquires mutex.
 * @param[in,out] mutex mutex to lock
 * @return zero if successful, non-zero otherwise (interrupted)
 */

static inline int
CommOS_MutexLock(CommOSMutex *mutex)
{
   return mutex_lock_interruptible(mutex);
}


/**
 * @brief Acquires mutex in uninterruptible mode.
 * @param[in,out] mutex mutex to lock
 * NOTE(review): the overview in comm_os.h lists this function as returning
 * int; the implementation returns void -- reconcile the comm_os.h comment.
 */

static inline void
CommOS_MutexLockUninterruptible(CommOSMutex *mutex)
{
   mutex_lock(mutex);
}


/**
 * @brief Attempts to acquire given mutex.
 * @param[in,out] mutex mutex to try-lock
 * @return zero if successful, non-zero otherwise
 */

static inline int
CommOS_MutexTrylock(CommOSMutex *mutex)
{
   return !mutex_trylock(mutex);
}


/**
 * @brief Releases a given mutex.
 * @param[in,out] mutex mutex to unlock
 */

static inline void
CommOS_MutexUnlock(CommOSMutex *mutex)
{
   mutex_unlock(mutex);
}


/**
 * @brief Initializes a wait queue.
 * @param[in,out] wq workqueue to initialize
 */

static inline void
CommOS_WaitQueueInit(CommOSWaitQueue *wq)
{
   init_waitqueue_head(wq);
}


/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *    - the condition function (predicate) evaluates to TRUE
 *    - the specified timeout interval elapsed
 *    - a signal is pending
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @param interruptible enable/disable signal pending check
 * @return 1 if condition was met
 *    0 if the timeout interval elapsed
 *    <0, if a signal is pending or other error set by condition
 * @sideeffect timeoutMillis is updated to time remaining
 */

static inline int
CommOS_DoWait(CommOSWaitQueue *wq,
              CommOSWaitConditionFunc cond,
              void *condArg1,
              void *condArg2,
              unsigned long long *timeoutMillis,
              int interruptible)
{
   int rc;
   DEFINE_WAIT(wait);
   long timeout;
#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   long tmpTimeout;
   long retTimeout;
   const unsigned int interval = 50;   /* Sleep slice, in milliseconds. */
#endif

   if (!timeoutMillis) {
      return -1;
   }
   /* Fast path: condition already satisfied, no need to sleep. */
   if ((rc = cond(condArg1, condArg2)) != 0) {
      return rc;
   }

#if defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   /* Workaround variant: sleep in bounded slices of at most 'interval' ms,
    * re-testing the condition after each slice; retTimeout tracks the total
    * budget remaining (in jiffies). */
   timeout = msecs_to_jiffies(interval < *timeoutMillis ?
                              interval : (unsigned int)*timeoutMillis);
   retTimeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));

   for (; retTimeout >= 0; ) {
      prepare_to_wait(wq, &wait,
                      (interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE));
      if ((rc = cond(condArg1, condArg2))) {
         break;
      }
      if (interruptible && signal_pending(current)) {
         rc = -EINTR;
         break;
      }
      if ((tmpTimeout = schedule_timeout(timeout))) {
         /* Woken early: charge only the part of the slice actually slept. */
         retTimeout -= (timeout - tmpTimeout);
      } else {
         retTimeout -= timeout;
      }
      if (retTimeout < 0) {
         retTimeout = 0;
      }
   }
   finish_wait(wq, &wait);
   if (rc == 0) {
      /* Final re-test; if it succeeds exactly at expiry, report at least
       * one tick remaining so callers can tell success from timeout. */
      rc = cond(condArg1, condArg2);
      if (rc && (retTimeout == 0)) {
         retTimeout = 1;
      }
   }
   *timeoutMillis = (unsigned long long)jiffies_to_msecs(retTimeout);
#else // !defined(COMM_OS_LINUX_WAIT_WORKAROUND)
   timeout = msecs_to_jiffies((unsigned int)(*timeoutMillis));

   for (;;) {
      prepare_to_wait(wq, &wait,
                      (interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE));
      if ((rc = cond(condArg1, condArg2)) != 0) {
         break;
      }
      if (interruptible && signal_pending(current)) {
         rc = -EINTR;
         break;
      }
      if ((timeout = schedule_timeout(timeout)) == 0) {
         rc = 0;
         break;
      }
   }
   finish_wait(wq, &wait);
   if (rc == 0) {
      rc = cond(condArg1, condArg2);
      if (rc && (timeout == 0)) {
         timeout = 1;
      }
   }
   *timeoutMillis = (unsigned long long)jiffies_to_msecs(timeout);
#endif

   return rc;
}


/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *    - the condition function (predicate) evaluates to TRUE
 *    - the specified timeout interval elapsed
 *    - a signal is pending
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @return 1 if condition was met
 *    0 if the timeout interval elapsed
 *    <0, if a signal is pending or other error set by condition
 * @sideeffect
 *    timeoutMillis is updated to time remaining
 */

static inline int
CommOS_Wait(CommOSWaitQueue *wq,
            CommOSWaitConditionFunc cond,
            void *condArg1,
            void *condArg2,
            unsigned long long *timeoutMillis)
{
   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 1);
}


/**
 * @brief Puts the caller on a wait queue until either of the following occurs:
 *    - the condition function (predicate) evaluates to TRUE
 *    - the specified timeout interval elapsed
 * @param[in,out] wq wait queue to put item on
 * @param cond predicate to test
 * @param condArg1 argument 1 for cond
 * @param condArg2 argument 2 for cond
 * @param[in,out] timeoutMillis timeout interval in milliseconds
 * @return 1 if condition was met
 *    0 if the timeout interval elapsed
 *    <0, error set by condition
 * @sideeffect timeoutMillis is updated to time remaining
 */

static inline int
CommOS_WaitUninterruptible(CommOSWaitQueue *wq,
                           CommOSWaitConditionFunc cond,
                           void *condArg1,
                           void *condArg2,
                           unsigned long long *timeoutMillis)
{
   return CommOS_DoWait(wq, cond, condArg1, condArg2, timeoutMillis, 0);
}


/**
 * @brief Wakes up task(s) waiting on the given wait queue.
 * @param[in,out] wq wait queue.
 */

static inline void
CommOS_WakeUp(CommOSWaitQueue *wq)
{
   wake_up(wq);
}


/**
 * @brief Allocates kernel memory of specified size; does not sleep.
 * @param size size to allocate.
 * @return Address of allocated memory or NULL if the allocation fails.
 */

static inline void *
CommOS_KmallocNoSleep(unsigned int size)
{
   return kmalloc(size, GFP_ATOMIC);
}


/**
 * @brief Allocates kernel memory of specified size; may sleep.
 * @param size size to allocate.
 * @return Address of allocated memory or NULL if the allocation fails.
 */

static inline void *
CommOS_Kmalloc(unsigned int size)
{
   return kmalloc(size, GFP_KERNEL);
}


/**
 * @brief Frees previously allocated kernel memory.
 * @param obj object to free.
 */

static inline void
CommOS_Kfree(void *obj)
{
   /* NOTE(review): kfree(NULL) is already a no-op; the guard is redundant
    * but harmless. */
   if (obj) {
      kfree(obj);
   }
}


/**
 * @brief Yields the current cpu to other runnable tasks.
 */

static inline void
CommOS_Yield(void)
{
   cond_resched();
}


/**
 * @brief Gets the current time in milliseconds.
 * @return Current time in milliseconds, with precision of at most one tick.
 */

static inline unsigned long long
CommOS_GetCurrentMillis(void)
{
   return (unsigned long long)jiffies_to_msecs(jiffies);
}


/**
 * @brief Initializes given list.
 * @param list list to initialize.
 */

static inline void
CommOS_ListInit(CommOSList *list)
{
   INIT_LIST_HEAD(list);
}


/**
 * @brief Tests if list is empty.
 * @param list list to test.
 * @return non-zero if empty, zero otherwise.
 */

#define CommOS_ListEmpty(list) list_empty((list))


/**
 * @brief Adds given element to beginning of list.
 * @param list list to add to.
 * @param elem element to add.
 */

#define CommOS_ListAdd(list, elem) list_add((elem), (list))


/**
 * @brief Adds given element to end of list.
 * @param list list to add to.
 * @param elem element to add.
 */

#define CommOS_ListAddTail(list, elem) list_add_tail((elem), (list))


/**
 * @brief Deletes given element from its list.
 *    The element is re-initialized, so a subsequent delete is safe.
 * @param elem element to delete.
 */

#define CommOS_ListDel(elem) \
   do { \
      list_del((elem)); \
      INIT_LIST_HEAD((elem)); \
   } while (0)


/**
 * @brief Iterates over a list.
 * @param list list to iterate over.
 * @param[out] item stores next element.
 * @param itemListFieldName name in the item structure storing the list head.
 */

#define CommOS_ListForEach(list, item, itemListFieldName) \
   list_for_each_entry((item), (list), itemListFieldName)


/**
 * @brief Iterates safely over a list.
 * @param list list to iterate over.
 * @param[out] item stores next element. May be deleted in the loop.
 * @param[out] tmpItem saves iteration element.
 * @param itemListFieldName name in the item structure storing the list head.
 */

#define CommOS_ListForEachSafe(list, item, tmpItem, itemListFieldName) \
   list_for_each_entry_safe((item), (tmpItem), (list), itemListFieldName)


/**
 * @brief Combines two lists, adds second list to beginning of first one.
 * @param list list to add to.
 * @param list2 list to add.
 */

#define CommOS_ListSplice(list, list2) list_splice((list2), (list))


/**
 * @brief Combines two lists, adds second list to end of first one.
 * @param list list to add to.
 * @param list2 list to add.
 */

#define CommOS_ListSpliceTail(list, list2) list_splice_tail((list2), (list))


/**
 * @brief Gets current module handle.
 * @return module handle.
 */

static inline CommOSModule
CommOS_ModuleSelf(void)
{
   return THIS_MODULE;
}


/**
 * @brief Retains module.
 * @param[in,out] module to retain.
 * @return zero if successful, non-zero otherwise.
 */

static inline int
CommOS_ModuleGet(CommOSModule module)
{
   int rc = 0;

   /* NULL is tolerated so callers need not special-case a missing module. */
   if (!module) {
      goto out;
   }
   if (!try_module_get(module)) {
      rc = -1;
   }

out:
   return rc;
}


/**
 * @brief Releases module.
 * @param[in,out] module to release.
 */

static inline void
CommOS_ModulePut(CommOSModule module)
{
   if (module) {
      module_put(module);
   }
}


/**
 * @brief Inserts r/w memory barrier.
 */

#define CommOS_MemBarrier smp_mb

#endif /* _COMM_OS_LINUX_H_ */
diff --git a/arch/arm/mvp/pvtcpkm/comm_os_mod_linux.c b/arch/arm/mvp/pvtcpkm/comm_os_mod_linux.c
new file mode 100644
index 0000000..e196108
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_os_mod_linux.c
@@ -0,0 +1,105 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Linux-specific module loading, unloading functions.
 */

#include "comm_os.h"
#include "comm_os_mod_ver.h"

/* NOTE(review): #include target lost in extraction -- likely
 * <linux/moduleparam.h>, given the module_param_string() use below. */
#include


/* Module parameters -- passed as one 'name=value'-list string. */

static char modParams[256];
module_param_string(COMM_OS_MOD_SHORT_NAME, modParams, sizeof modParams, 0644);


/**
 * @brief Module initialization entry point. Calls the commOSModInit
 *    function pointer to perform upper layer initialization.
 * @return zero if successful, non-zero otherwise.
 */

static int __init
ModInit(void)
{
   int rc;

   if (!commOSModInit) {
      CommOS_Log(("%s: Can't find \'init\' function for module \'" \
                  COMM_OS_MOD_SHORT_NAME_STRING "\'.\n", __FUNCTION__));
      return -1;
   }

   CommOS_Debug(("%s: Module parameters: [%s].\n", __FUNCTION__, modParams));

   rc = (*commOSModInit)(modParams);
   if (rc == 0) {
      CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
                  "\' has been successfully initialized.\n", __FUNCTION__));
   } else {
      CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
                  "\' could not be initialized [%d].\n", __FUNCTION__, rc));
   }

   /* Kernel init callbacks must not return positive values; fold them
    * over to negative error codes. */
   return rc > 0 ? -rc : rc;
}


/**
 * @brief Module exit function. Calls the commOSModExit function pointer
 *    to perform upper layer cleanup.
 */

static void __exit
ModExit(void)
{
   if (!commOSModExit) {
      CommOS_Log(("%s: Can't find \'fini\' function for module \'" \
                  COMM_OS_MOD_SHORT_NAME_STRING "\'.\n", __FUNCTION__));
      return;
   }

   (*commOSModExit)();
   CommOS_Log(("%s: Module \'" COMM_OS_MOD_SHORT_NAME_STRING \
               "\' has been stopped.\n", __FUNCTION__));
}


module_init(ModInit);
module_exit(ModExit);

/* Module information. */
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION(COMM_OS_MOD_NAME_STRING);
MODULE_VERSION(COMM_OS_MOD_VERSION_STRING);
MODULE_LICENSE("GPL v2");
/*
 * Starting with SLE10sp2, Novell requires that IHVs sign a support agreement
 * with them and mark their kernel modules as externally supported via a
 * change to the module header. If this isn't done, the module will not load
 * by default (i.e., neither mkinitrd nor modprobe will accept it).
 */
MODULE_INFO(supported, "external");
diff --git a/arch/arm/mvp/pvtcpkm/comm_os_mod_ver.h b/arch/arm/mvp/pvtcpkm/comm_os_mod_ver.h
new file mode 100644
index 0000000..5e14c62
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_os_mod_ver.h
@@ -0,0 +1,38 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Version definitions for the pvTCP module.
 */

#ifndef _COMM_OS_MOD_VER_H_
#define _COMM_OS_MOD_VER_H_

/* Human-readable module identity, used by the MODULE_* macros. */
#define COMM_OS_MOD_NAME_STRING "VMware paravirtualized tcp/ip module"
#define COMM_OS_MOD_SHORT_NAME pvtcp
#define COMM_OS_MOD_SHORT_NAME_STRING "pvtcp"

/* Version in dotted, comma-separated and string forms. Keep in sync. */
#define COMM_OS_MOD_VERSION 1.0.0.0
#define COMM_OS_MOD_VERSION_COMMAS 1,0,0,0
#define COMM_OS_MOD_VERSION_STRING "1.0.0.0"

#endif /* _COMM_OS_MOD_VER_H_ */
diff --git a/arch/arm/mvp/pvtcpkm/comm_svc.h b/arch/arm/mvp/pvtcpkm/comm_svc.h
new file mode 100644
index 0000000..784ec76
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_svc.h
@@ -0,0 +1,71 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Communication functions exported by the comm_rt module.
 */

#ifndef _COMM_SVC_H_
#define _COMM_SVC_H_

#define INCLUDE_ALLOW_MODULE
#define INCLUDE_ALLOW_PV
#define INCLUDE_ALLOW_GPL
#include "include_check.h"

#include "comm.h"

/* Implementation registration and channel lifecycle. */
int CommSvc_RegisterImpl(const CommImpl *impl);
void CommSvc_UnregisterImpl(const CommImpl *impl);
int CommSvc_Zombify(CommChannel channel, int inBH);
int CommSvc_IsActive(CommChannel channel);
CommTranspInitArgs CommSvc_GetTranspInitArgs(CommChannel channel);
void *CommSvc_GetState(CommChannel channel);
void CommSvc_Put(CommChannel channel);
void CommSvc_DispatchUnlock(CommChannel channel);
int CommSvc_Lock(CommChannel channel);
void CommSvc_Unlock(CommChannel channel);
int CommSvc_ScheduleAIOWork(CommOSWork *work);

/* Channel allocation over a given transport. */
int
CommSvc_Alloc(const CommTranspInitArgs *transpArgs,
              const CommImpl *impl,
              int inBH,
              CommChannel *newChannel);

/* Packet write, with an optional scatter/gather variant. */
int
CommSvc_Write(CommChannel channel,
              const CommPacket *packet,
              unsigned long long *timeoutMillis);

int
CommSvc_WriteVec(CommChannel channel,
                 const CommPacket *packet,
                 struct kvec **vec,
                 unsigned int *vecLen,
                 unsigned long long *timeoutMillis,
                 unsigned int *iovOffset);

/* Inline-event accounting. */
unsigned int CommSvc_RequestInlineEvents(CommChannel channel);
unsigned int CommSvc_ReleaseInlineEvents(CommChannel channel);

#endif // _COMM_SVC_H_
diff --git a/arch/arm/mvp/pvtcpkm/comm_transp.h b/arch/arm/mvp/pvtcpkm/comm_transp.h
new file mode 100644
index 0000000..c46f849
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/comm_transp.h
@@ -0,0 +1,90 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Generic shared memory transport API. + */ + +#ifndef _COMM_TRANSP_H_ +#define _COMM_TRANSP_H_ + +#define INCLUDE_ALLOW_PV +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_MONITOR +#define INCLUDE_ALLOW_GPL +#include "include_check.h" + +/* + * Common shared memory identifier. + * External handle that makes sense to both hypervisor and guest. + */ + +#define COMM_TRANSP_ID_8_ANY ((unsigned char)-1) +#define COMM_TRANSP_ID_32_ANY ((unsigned int)-1) +#define COMM_TRANSP_ID_64_ANY ((unsigned long long)-1) + + +typedef struct CommTranspID { + union { + unsigned char d8[8]; + unsigned int d32[2]; + unsigned long long d64; + }; +} CommTranspID; + + +/* Basic initialization arguments. */ + +typedef enum CommTranspInitMode { + COMM_TRANSP_INIT_CREATE = 0x0, + COMM_TRANSP_INIT_ATTACH = 0x1 +} CommTranspInitMode; + +typedef struct CommTranspInitArgs { + unsigned int capacity; // Shared memory capacity. + unsigned int type; // Type / implementation using this area. + CommTranspID id; // ID (name) of shared memory area. + CommTranspInitMode mode; // Init mode (above). +} CommTranspInitArgs; + + +/** + * @brief Generate a type id from description (protocol) string. This function + * uses djb2, a string hashing algorithm by Dan Bernstein. 
+ * (see http://www.cse.yorku.ca/~oz/hash.html) + * @param str string to hash + * @return 32-bit hash value + */ + +static inline unsigned int +CommTransp_GetType(const char *str) +{ + unsigned int hash = 5381; + int c; + + while ((c = *str++)) { + hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ + } + return hash; +} + +#endif // _COMM_TRANSP_H_ diff --git a/arch/arm/mvp/pvtcpkm/include_check.h b/arch/arm/mvp/pvtcpkm/include_check.h new file mode 100644 index 0000000..2eeafe7 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/include_check.h @@ -0,0 +1,18 @@ +/* + * Linux 2.6.32 and later Kernel module for Empty File Placeholder + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ diff --git a/arch/arm/mvp/pvtcpkm/pvtcp.c b/arch/arm/mvp/pvtcpkm/pvtcp.c new file mode 100644 index 0000000..fdfb0d2 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp.c @@ -0,0 +1,587 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Pvtcp common code. + */ + +#include "pvtcp.h" + + +/* + * Operation table. + */ + +CommOperationFunc pvtcpOperations[] = { + [PVTCP_OP_FLOW] = PvtcpFlowOp, + [PVTCP_OP_IO] = PvtcpIoOp, + [PVTCP_OP_CREATE] = PvtcpCreateOp, + [PVTCP_OP_RELEASE] = PvtcpReleaseOp, + [PVTCP_OP_BIND] = PvtcpBindOp, + [PVTCP_OP_LISTEN] = PvtcpListenOp, + [PVTCP_OP_ACCEPT] = PvtcpAcceptOp, + [PVTCP_OP_CONNECT] = PvtcpConnectOp, + [PVTCP_OP_SHUTDOWN] = PvtcpShutdownOp, + [PVTCP_OP_SETSOCKOPT] = PvtcpSetSockOptOp, + [PVTCP_OP_GETSOCKOPT] = PvtcpGetSockOptOp, + [PVTCP_OP_IOCTL] = PvtcpIoctlOp, + [PVTCP_OP_INVALID] = NULL +}; + + +/* + * Implementation block. + */ + +CommImpl pvtcpImpl = { + .owner = NULL, + .checkArgs = PvtcpCheckArgs, + .stateCtor = PvtcpStateAlloc, + .stateDtor = PvtcpStateFree, + .dataAlloc = PvtcpBufAlloc, + .dataFree = PvtcpBufFree, + .operations = pvtcpOperations, + .closeNtf = PvtcpCloseNtf, + .closeNtfData = &pvtcpImpl, + .ntfCenterID = {{ + .d32[0] = 2U /* x86 host context (vmci, only). */, + .d32[1] = 10000 /* Default, not yet reserved, resource (vmci, only). */ + }} +}; + + +/* + * Version array. + */ + +const char *pvtcpVersions[] = { + [PVTCP_VERS_1_1] = PVTCP_COMM_IMPL_VERS_1_1, + [PVTCP_VERS_1_0] = PVTCP_COMM_IMPL_VERS_1_0 +}; + +const unsigned int pvtcpVersionsSize = + (sizeof pvtcpVersions / sizeof pvtcpVersions[0]); + + +/* + * Client (pv) channel to offload side. 
We choose to define it here, although + * it's only applicable to the pv implementation. The reason is that we can + * share a common close notification function which does the right thing + * depending on the channel configuration. + */ + +CommChannel pvtcpClientChannel; + + +/* + * Built-in state interfaces. + */ + +static PvtcpIfConf ifUnbound = { + .family = PVTCP_PF_UNBOUND +}; +const PvtcpIfConf *pvtcpIfUnbound = &ifUnbound; + +static PvtcpIfConf ifDeathRow = { + .family = PVTCP_PF_DEATH_ROW +}; +const PvtcpIfConf *pvtcpIfDeathRow = &ifDeathRow; + +static PvtcpIfConf ifLoopbackInet4 = { + .family = PVTCP_PF_LOOPBACK_INET4 +}; +const PvtcpIfConf *pvtcpIfLoopbackInet4 = &ifLoopbackInet4; + + +/* Functions */ + +/** + * @brief Checks if the IF configuration has reasonable values. + * @param conf configuration to check + * @return zero if successful, -1 otherwise + */ + +static int +IfCheck(const PvtcpIfConf *conf) +{ + if (!conf || + ((conf->family != PF_INET) && + (conf->family != PF_INET6) && + (conf->family != PVTCP_PF_UNBOUND) && + (conf->family != PVTCP_PF_DEATH_ROW) && + (conf->family != PVTCP_PF_LOOPBACK_INET4))) { + return -1; + } + + /** @todo Need more checks for IP/netmask format validity. */ + return 0; +} + + +/** + * @brief Checks if the IF has reasonable values, but restricts types to + * AF_INET and AF_INET6 + * @param conf IF to check + * @return zero if successful, -1 otherwise + */ + +static int +IfRestrictedCheck(const PvtcpIfConf *conf) +{ + if (IfCheck(conf) || + ((conf->family != PF_INET) && + (conf->family != PF_INET6))) { + return -1; + } + return 0; +} + + +/** + * @brief Finds a netif given a state and a configuration. The configuration + * must have already been checked. This function doesn't lock, so it + * should not be called when the state, or the netif for the passed + * configuration may be deleted. + * @param state state to look for. + * @param conf configuration to look for. + * @return netif matching configuration, or NULL. 
+ */ + +PvtcpIf * +PvtcpStateFindIf(PvtcpState *state, + const PvtcpIfConf *conf) +{ + PvtcpIf *netif; + + if (!state) { + return NULL; + } + + if (conf->family == PVTCP_PF_UNBOUND) { + return &state->ifUnbound; + } + + if (conf->family == PVTCP_PF_DEATH_ROW) { + return &state->ifDeathRow; + } + + if (conf->family == PVTCP_PF_LOOPBACK_INET4) { + return &state->ifLoopbackInet4; + } + + CommOS_ListForEach(&state->ifList, netif, stateLink) { + if (netif->conf.family == conf->family) { + if ((conf->family == PF_INET && + !memcmp(&netif->conf.addr.in, &conf->addr.in, + sizeof conf->addr.in)) || + (conf->family == PF_INET6 && + !memcmp(&netif->conf.addr.in6, &conf->addr.in6, + sizeof conf->addr.in6))) { + return netif; + } + } + } + return NULL; +} + + +/** + * @brief Creates and initializes a new netif for a given channel and with + * the specified configuration. Death row and unbound netifs may not + * be added using this function. + * @param[in,out] channel channel to make a new netif in + * @param conf configuration to set netif to + * @return 0 if successful, -1 otherwise + * @sideeffect May allocate memory + */ + +int +PvtcpStateAddIf(CommChannel channel, + const PvtcpIfConf *conf) +{ + int rc = -1; + PvtcpState *state; + PvtcpIf *netif; + + if (!channel || IfRestrictedCheck(conf)) { + return rc; + } + + if (CommSvc_Lock(channel)) { + return rc; /* channel isn't active. */ + } + + state = CommSvc_GetState(channel); + if (!state) { + goto out; + } + + if (PvtcpStateFindIf(state, conf)) { + goto out; /* Already configured. 
*/ + } + + netif = CommOS_Kmalloc(sizeof *netif); + if (!netif) { + goto out; + } + + INIT_LIST_HEAD(&netif->stateLink); + INIT_LIST_HEAD(&netif->sockList); + netif->state = state; + netif->conf = *conf; + CommOS_ListAddTail(&state->ifList, &netif->stateLink); + rc = 0; + +out: + CommSvc_Unlock(channel); + return rc; +} + + +/** + * @brief Removes and potentially deallocates all sockets associated with the + * given netif and deallocates the latter. + * @param[in,out] netif netif to deallocate + * @sideeffect Closes sockets, deallocates memory + */ + +static void +IfFree(PvtcpIf *netif) +{ + PvtcpSock *pvsk; + PvtcpSock *tmp; + + if (netif) { + CommOS_ListForEachSafe(&netif->sockList, pvsk, tmp, ifLink) { + CommOS_ListDel(&pvsk->ifLink); + PvtcpReleaseSocket(pvsk); + } + if ((netif->conf.family != PVTCP_PF_UNBOUND) && + (netif->conf.family != PVTCP_PF_DEATH_ROW) && + (netif->conf.family != PVTCP_PF_LOOPBACK_INET4)) { + CommOS_ListDel(&netif->stateLink); + CommOS_Kfree(netif); + } + } +} + + +/** + * @brief Closes all sockets associated with, and deallocates the netif + * in the given channel and with the specified configuration. + * Death row and unbound netifs may not be removed using this function. + * @param[in,out] channel channel to remove from + * @param conf configuration specified + * @return zero if successful, error code otherwise + * @sideeffect Closes sockets, deallocates memory + */ + +void +PvtcpStateRemoveIf(CommChannel channel, + const PvtcpIfConf *conf) +{ + PvtcpState *state; + PvtcpIf *netif; + + if (!channel || IfRestrictedCheck(conf)) { + return; + } + + if (CommSvc_Lock(channel)) { + return; /* channel isn't active. */ + } + + state = CommSvc_GetState(channel); + if (state && (netif = PvtcpStateFindIf(state, conf))) { + if (netif->state == state) { + IfFree(netif); + } + } + + CommSvc_Unlock(channel); +} + + +/** + * @brief Adds a socket to an existing netif. If the socket is already on a + * different netif, it is removed from that netif. 
+ * It locks the must-be-active channel. We use that lock to guard + * against concurrent removal of the netif. + * @param[in,out] channel channel to add to + * @param conf specified configuration + * @param[in,out] sock socket to add + * @return zero if successful, -1 otherwise + */ + +int +PvtcpStateAddSocket(CommChannel channel, + const PvtcpIfConf *conf, + PvtcpSock *sock) +{ + int rc = -1; + PvtcpState *state; + PvtcpIf *netif; + + if (!channel || !sock || (sock->channel != channel) || IfCheck(conf)) { + return rc; + } + + if (CommSvc_Lock(channel)) { + return rc; /* channel isn't active. */ + } + + state = CommSvc_GetState(channel); + if (!state) { + goto out; + } + + netif = PvtcpStateFindIf(state, conf); + if (!netif) { + goto out; + } + + CommOS_ListDel(&sock->ifLink); + sock->netif = netif; + CommOS_ListAddTail(&netif->sockList, &sock->ifLink); + rc = 0; + +out: + CommSvc_Unlock(channel); + return rc; +} + + +/** + * @brief Removes a socket from its netif. + * It locks the must-be-active channel. We use that lock to guard + * against concurrent removal of the netif. + * @param[in,out] channel channel to remove from + * @param[in,out] sock socket to remove + * @return zero if successful, -1 otherwise + */ + +int +PvtcpStateRemoveSocket(CommChannel channel, + PvtcpSock *sock) +{ + if (!channel || !sock || + (sock->channel && (sock->channel != channel))) { + return -1; + } + + if (CommSvc_Lock(channel)) { + return -1; /* channel isn't active. */ + } + + CommOS_ListDel(&sock->ifLink); + sock->channel = NULL; + sock->state = NULL; + sock->netif = NULL; + + CommSvc_Unlock(channel); + return 0; +} + + +/** + * @brief State constructor called when a channel is created. The netifs + * 'death row' and 'unbound' are always initialized. 
+ * @param[in,out] channel channel to initialize + * @return pointer to a new state structure or NULL + * @sideeffect Allocates memory + */ + +void * +PvtcpStateAlloc(CommChannel channel) +{ + PvtcpState *state; + + state = CommOS_Kmalloc(sizeof *state); + if (state) { + state->channel = channel; + INIT_LIST_HEAD(&state->ifList); + + /* Initialize always-present netifs. */ + INIT_LIST_HEAD(&state->ifDeathRow.stateLink); /* Irrelevant */ + INIT_LIST_HEAD(&state->ifDeathRow.sockList); + state->ifDeathRow.state = state; + state->ifDeathRow.conf.family = PVTCP_PF_DEATH_ROW; + + INIT_LIST_HEAD(&state->ifUnbound.stateLink); /* Irrelevant */ + INIT_LIST_HEAD(&state->ifUnbound.sockList); + state->ifUnbound.state = state; + state->ifUnbound.conf.family = PVTCP_PF_UNBOUND; + + INIT_LIST_HEAD(&state->ifLoopbackInet4.stateLink); /* Irrelevant */ + INIT_LIST_HEAD(&state->ifLoopbackInet4.sockList); + state->ifLoopbackInet4.state = state; + state->ifLoopbackInet4.conf.family = PVTCP_PF_LOOPBACK_INET4; + + state->namespace = NULL; + state->mask = ((unsigned int)channel << 4) ^ (unsigned int)state; +#if defined(__linux__) + state->id = ((unsigned long long)random32() << 32) | + (unsigned long long)random32(); +#else + state->id = (unsigned long long)state; +#endif + } + return state; +} + + +/** + * @brief State destructor called when a channel is closed. + * The caller (Comm) guarantees proper locking. + * @param arg pointer to state structure + * @sideeffect Destroys all netifs and their sockets, deallocates memory + */ + +void +PvtcpStateFree(void *arg) +{ + PvtcpState *state = arg; + PvtcpIf *netif; + PvtcpIf *tmp; + + if (state) { + CommOS_ListForEachSafe(&state->ifList, netif, tmp, stateLink) { + IfFree(netif); + } + /* coverity[address_free] */ + IfFree(&state->ifLoopbackInet4); + /* coverity[address_free] */ + IfFree(&state->ifUnbound); + /* coverity[address_free] */ + IfFree(&state->ifDeathRow); + CommOS_Kfree(state); + } +} + + +/** + * @brief Checks transport arguments. 
+ * @param transpArgs transport arguments. + * @return zero if successful, < 0 otherwise. + */ + +int +PvtcpCheckArgs(CommTranspInitArgs *transpArgs) +{ + int rc = -1; + const unsigned int minCapacity = + (PVTCP_SOCK_BUF_SIZE + sizeof(CommPacket)) * 2; + unsigned int versionIndex = pvtcpVersionsSize; + + if (transpArgs->capacity < minCapacity) { + return rc; + } + + while (versionIndex--) { + if (transpArgs->type == CommTransp_GetType(pvtcpVersions[versionIndex])) { + /* If a match, overwrite the hash with the actual version (index). */ + + transpArgs->type = versionIndex; + rc = 0; + break; + } + } + + return rc; +} + + +/** + * @brief Called after a channel is freed. + * @param ntfData callback data from implementation block. + * @param transpArgs transport arguments of closed channel. + * @param inBH whether called in bottom half. + */ + +void +PvtcpCloseNtf(void *ntfData, + const CommTranspInitArgs *transpArgs, + int inBH) +{ + CommImpl *impl = (CommImpl *)ntfData; + + pvtcpClientChannel = NULL; + CommOS_Log(("%s: Channel was reset!\n", __FUNCTION__)); + + /* + * If the impl. block owner is NULL, we're pv client: we attempt to + * reopen the channel in a few seconds. + */ + + if (impl && !impl->owner && !inBH) { + CommOS_Log(("%s: Attempting to re-initialize channel.\n", __FUNCTION__)); + impl->openAtMillis = CommOS_GetCurrentMillis(); + impl->openTimeoutAtMillis = + CommOS_GetCurrentMillis() + PVTCP_CHANNEL_OPEN_TIMEOUT; + if (CommSvc_Alloc(transpArgs, impl, inBH, &pvtcpClientChannel)) { + CommOS_Log(("%s: Failed to initialize channel!\n", __FUNCTION__)); + } + } +} + + +/** + * @brief Initializes the Pvtcp socket common fields. + * @param pvsk pvtcp socket. + * @param channel Comm channel this socket is associated with. + * @return 0 if successful, -1 otherwise. 
+ */ + +int +PvtcpSockInit(PvtcpSock *pvsk, + CommChannel channel) +{ + PvtcpState *state; + int rc = -1; + + if (pvsk && channel && (state = CommSvc_GetState(channel))) { + /* Must _not_ zero out pvsk! */ + + CommOS_MutexInit(&pvsk->inLock); + CommOS_MutexInit(&pvsk->outLock); + CommOS_SpinlockInit(&pvsk->stateLock); + CommOS_ListInit(&pvsk->ifLink); + CommOS_InitWork(&pvsk->work, PvtcpProcessAIO); + pvsk->netif = NULL; + pvsk->state = state; + pvsk->stateID = state->id; + pvsk->channel = channel; + pvsk->peerSock = PVTCP_PEER_SOCK_NULL; + pvsk->peerSockSet = 0; + CommOS_WriteAtomic(&pvsk->deltaAckSize, + (1 << PVTCP_SOCK_SMALL_ACK_ORDER)); + CommOS_WriteAtomic(&pvsk->rcvdSize, 0); + CommOS_WriteAtomic(&pvsk->sentSize, 0); + CommOS_WriteAtomic(&pvsk->queueSize, 0); + CommOS_ListInit(&pvsk->queue); + pvsk->rpcReply = NULL; + pvsk->rpcStatus = 0; + pvsk->err = 0; + rc = 0; + } + return rc; +} diff --git a/arch/arm/mvp/pvtcpkm/pvtcp.h b/arch/arm/mvp/pvtcpkm/pvtcp.h new file mode 100644 index 0000000..7f4f2f5 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp.h @@ -0,0 +1,458 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Pvtcp common APIs. 
+ */ + +#ifndef _PVTCP_H_ +#define _PVTCP_H_ + +/* + * Pvtcp state store ipv4 and ipv6 address structures. + * Platform-specific headers where these are defined, must be included here. + * Implementation-related header files should not be included in this file. + * + * NOTE: Pvtcp is not an API and none of its functions are exported. + */ + +#if defined(__linux__) +#include +#include +#else +#error "Unsupported OS." +#endif + +#include "comm_svc.h" + +/* Max time to wait for a channel to be created. */ +#define PVTCP_CHANNEL_OPEN_TIMEOUT 2000 + +/* Max payload size. Used to allocate offload per-cpu bounce buffers. */ +#define PVTCP_SOCK_BUF_SIZE (8 << 10) /* 8K */ + +#define PVTCP_SOCK_DGRAM_BUF_SIZE PVTCP_SOCK_BUF_SIZE +#define PVTCP_SOCK_STREAM_BUF_SIZE PVTCP_SOCK_BUF_SIZE + +/* Dgram payloads include a pseudo (udp/ip) header. */ +typedef struct PvtcpDgramPseudoHeader { + unsigned long long d0; + unsigned long long d1; + unsigned long long d2; + unsigned long long d3; +} PvtcpDgramPseudoHeader; + + +/* + * Flow control constants for pv/offload sockets. + * We are defining a receive size model: 1) small, 2) medium, 3)large. + * This seems sufficient in addressing most target environments, but more + * models may be defined. A smaller minimum model (1) cannot be defined. + * + * Short description of socket-level flow control. This applies to both + * dgram and stream sockets, in both directions. It follows that, with regard + * to 'comm' writes, dgram and stream writes are: a) lossless and b) ordered. + * + * 0. Both sides (offload, pv) of a socket maintain (almost) mirror values + * of input/output queue sizes. We say 'almost', because they're allowed + * to conservatively converge in time. + * 1. Senders never write out to the shmem channel, and destined to a socket + * (be it offload or pv), more bytes than that socket can hold/enqueue. + * This is based on socket fields storing information mentioned above. 
+ * The upper limit is PVTCP_SOCK_RCVSIZE and cannot be exceeded under + * any circumstances. + * 2. There is a 'safe' limit value (per socket) which can be tested prior + * to writing one more max-sized packet to that socket. + * This value is PVTCP_SOCK_SAFE_RCVSIZE. + * 3. There is also a notion of 'large' acks, which controls the frequency of + * reporting socket queue size changes when bytes are consumed from it. + * When a sender is about to write out (to the channel, for a given socket) + * in excess of PVTCP_SOCK_LARGE_ACK_WM bytes, it sets, in the packet + * header flag field, the PVTCP_SOCK_LARGE_ACK_ORDER value. The other end + * updates its 'delta ack' value accordingly (1 << flag value). + * 4. As bytes are consumed (again, at either end), the operation or function, + * will send a size ack packet with the consumed size since the last ack, + * _iff_ that size is larger than, or equal to the 'delta ack' value. + * If an ack was sent, the 'delta ack' is decreased by half, to a minimum + * indicated by PVTCP_SOCK_SMALL_ACK_ORDER. + * Note that concurrently setting the 'delta ack' to its high value + * because of condition 3) above, is fine since the sender already has, + * or is about to put pressure on the socket. 
+ */ + +#if !defined(PVTCP_SOCK_RCVSIZE_MODEL) + #define PVTCP_SOCK_RCVSIZE_MODEL 1 +#endif + +#if PVTCP_SOCK_RCVSIZE_MODEL == 1 + #define PVTCP_SOCK_LARGE_ACK_WM (64 << 10) /* 64K */ + #define PVTCP_SOCK_LARGE_ACK_ORDER 15 + #define PVTCP_SOCK_SMALL_ACK_ORDER 11 + #define PVTCP_SOCK_SAFE_RCVSIZE (128 << 10) /* 128K */ +#elif PVTCP_SOCK_RCVSIZE_MODEL == 2 + #define PVTCP_SOCK_LARGE_ACK_WM (128 << 10) /* 128K */ + #define PVTCP_SOCK_LARGE_ACK_ORDER 16 + #define PVTCP_SOCK_SMALL_ACK_ORDER 12 + #define PVTCP_SOCK_SAFE_RCVSIZE (256 << 10) /* 256K */ +#elif PVTCP_SOCK_RCVSIZE_MODEL == 3 + #define PVTCP_SOCK_LARGE_ACK_WM (128 << 10) /* 128K */ + #define PVTCP_SOCK_LARGE_ACK_ORDER 16 + #define PVTCP_SOCK_SMALL_ACK_ORDER 12 + #define PVTCP_SOCK_SAFE_RCVSIZE (512 << 10) /* 512K */ +#else + #error "Invalid PVTCP_SOCK_RCVSIZE_MODEL (one of 1, 2, 3)" +#endif + +#define PVTCP_SOCK_RCVSIZE \ + (PVTCP_SOCK_SAFE_RCVSIZE + \ + PVTCP_SOCK_BUF_SIZE + sizeof (PvtcpDgramPseudoHeader)) + + +/* + * Operation codes + */ + +enum PvtcpOpCodes { + PVTCP_OP_FLOW = 0, + PVTCP_OP_IO, + PVTCP_OP_CREATE, + PVTCP_OP_RELEASE, + PVTCP_OP_BIND, + PVTCP_OP_LISTEN, + PVTCP_OP_ACCEPT, + PVTCP_OP_CONNECT, + PVTCP_OP_SHUTDOWN, + PVTCP_OP_SETSOCKOPT, + PVTCP_OP_GETSOCKOPT, + PVTCP_OP_IOCTL, + PVTCP_OP_INVALID +}; + +#define PVTCP_FLOW_OP_INVALID_SIZE 0xffffffff + + +/* + * Operation functions + */ + +COMM_DEFINE_OP(PvtcpFlowOp); +COMM_DEFINE_OP(PvtcpIoOp); +COMM_DEFINE_OP(PvtcpCreateOp); +COMM_DEFINE_OP(PvtcpReleaseOp); +COMM_DEFINE_OP(PvtcpBindOp); +COMM_DEFINE_OP(PvtcpListenOp); +COMM_DEFINE_OP(PvtcpAcceptOp); +COMM_DEFINE_OP(PvtcpConnectOp); +COMM_DEFINE_OP(PvtcpShutdownOp); +COMM_DEFINE_OP(PvtcpSetSockOptOp); +COMM_DEFINE_OP(PvtcpGetSockOptOp); +COMM_DEFINE_OP(PvtcpIoctlOp); + + +/* + * Pvtcp/Comm type and supported versions. 
+ */ + +#define PVTCP_COMM_IMPL_TYPE "com.vmware.comm.protocol.pvTCP@" + +#define PVTCP_COMM_IMPL_VERS_1_0 (PVTCP_COMM_IMPL_TYPE "1.0") +#define PVTCP_COMM_IMPL_VERS_1_1 (PVTCP_COMM_IMPL_TYPE "1.1") + +typedef enum { + PVTCP_VERS_1_0 = 0, + PVTCP_VERS_1_1 +} PvtcpVersion; + +extern const char *pvtcpVersions[]; +extern const unsigned int pvtcpVersionsSize; + + +/* + * State interface markers + */ + +#define PVTCP_PF_UNBOUND 0x0 +#define PVTCP_PF_DEATH_ROW 0xffffffff +#define PVTCP_PF_LOOPBACK_INET4 (PVTCP_PF_DEATH_ROW - 1) + + +/* + * Interface and interface configuration structures. + */ + +typedef struct PvtcpIfConf { + int family; // Values: + // unbound (PVTCP_PF_UNBOUND) + // deathRow (PVTCP_PF_DEATH_ROW) + // loopback (PVTCP_PF_LOOPBACK_INET4) + // inet4 (PF_INET) + // inet6 (PF_INET6) + union { + struct in_addr in; + struct in6_addr in6; + } addr; // inet4 or inet6 address. + union { + struct in_addr in; + struct in6_addr in6; + } mask; // inet4 or inet6 netmask. +} PvtcpIfConf; + + +struct PvtcpState; + +typedef struct PvtcpIf { + CommOSList sockList; // List of sockets. + CommOSList stateLink; // Link in PvtcpState.ifList. + struct PvtcpState *state; // Back reference to state. + PvtcpIfConf conf; // Interface configuration. +} PvtcpIf; + + +/* + * General pvtcp state associated with a channel. + */ + +typedef struct PvtcpState { + unsigned long long id; // Randomly generated state ID. + CommOSList ifList; // List of active interfaces. + CommChannel channel; // Comm channel back reference. + PvtcpIf ifDeathRow; // Always-present netif. + PvtcpIf ifUnbound; // Ditto. + PvtcpIf ifLoopbackInet4; // Ditto. + void *namespace; // Name space, where supported. + void *extra; // Used by upper layer to extend state as needed. + unsigned int mask; // Mask used to obfuscate socket pointers. +} PvtcpState; + + +/* + * Define pvtcp socket common fields and include the pv or offload header + * to get the right PvtcpSock definition. 
+ */ + +#define PVTCP_SOCK_COMMON_FIELDS \ + CommOSMutex inLock; /* Input lock. */ \ + CommOSMutex outLock; /* Output lock. */ \ + CommOSSpinlock stateLock; /* State update lock. */ \ + CommOSList ifLink; /* Link in PvtcpIf.sockList. */ \ + CommOSWork work; /* Work item for AIO processing. */ \ + PvtcpIf *netif; /* Netif reference. */ \ + PvtcpState *state; /* State reference. */ \ + unsigned long long stateID; /* State ID. */ \ + CommChannel channel; /* Comm channel reference. */ \ + unsigned long long peerSock; /* Peer socket, opaque. */ \ + volatile int peerSockSet; /* Peer socket valid. */ \ + CommOSAtomic deltaAckSize; /* Recv size updates required by peer. */ \ + CommOSAtomic rcvdSize; /* Bytes received since last ack. */ \ + CommOSAtomic sentSize; /* Bytes sent; also updated by peer. */ \ + CommOSAtomic queueSize; /* Queue size. */ \ + CommOSList queue; /* Send queue (off) or recv queue (pv). */ \ + void *rpcReply; /* RPC reply. */ \ + int rpcStatus; /* RPC completion status. */ \ + int err /* Socket error. 
*/ + +#define PVTCP_PEER_SOCK_NULL ((unsigned long long)0) + + +/* + * Helper macros + */ + +#define SOCK_STATE_LOCK(pvsk) CommOS_SpinLock(&(pvsk)->stateLock) +#define SOCK_STATE_UNLOCK(pvsk) CommOS_SpinUnlock(&(pvsk)->stateLock) + +#define SOCK_IN_TRYLOCK(pvsk) CommOS_MutexTrylock(&(pvsk)->inLock) +#define SOCK_IN_LOCK(pvsk) CommOS_MutexLock(&(pvsk)->inLock) +#define SOCK_IN_UNLOCK(pvsk) CommOS_MutexUnlock(&(pvsk)->inLock) + +#define SOCK_OUT_TRYLOCK(pvsk) CommOS_MutexTrylock(&(pvsk)->outLock) +#define SOCK_OUT_LOCK(pvsk) CommOS_MutexLock(&(pvsk)->outLock) +#define SOCK_OUT_LOCK_UNINT(pvsk) \ + CommOS_MutexLockUninterruptible(&(pvsk)->outLock) +#define SOCK_OUT_UNLOCK(pvsk) CommOS_MutexUnlock(&(pvsk)->outLock) + +#define PVTCP_UNLOCK_DISP_DISCARD_VEC() \ + CommSvc_DispatchUnlock(channel); \ + while (vecLen) { \ + PvtcpBufFree(vec[--vecLen].iov_base); \ + } + + +#if defined(PVTCP_BUILDING_SERVER) +#include "pvtcp_off.h" +#else +#include "pvtcp_pv.h" +#endif // defined(PVTCP_BUILDING_SERVER) + + +/* + * Data declarations + */ + +extern const PvtcpIfConf *pvtcpIfUnbound; +extern const PvtcpIfConf *pvtcpIfDeathRow; +extern const PvtcpIfConf *pvtcpIfLoopbackInet4; + +extern CommImpl pvtcpImpl; +extern CommOperationFunc pvtcpOperations[]; + +extern CommChannel pvtcpClientChannel; + + +/* + * Common state manipulation functions. + */ + +void *PvtcpStateAlloc(CommChannel channel); +void PvtcpStateFree(void *arg); + +int PvtcpStateAddIf(CommChannel channel, const PvtcpIfConf *conf); +void PvtcpStateRemoveIf(CommChannel channel, const PvtcpIfConf *conf); +PvtcpIf *PvtcpStateFindIf(PvtcpState *state, const PvtcpIfConf *conf); + +int +PvtcpStateAddSocket(CommChannel channel, + const PvtcpIfConf *conf, + PvtcpSock *sock); +int PvtcpStateRemoveSocket(CommChannel channel, PvtcpSock *sock); + + +/* + * Common Pvtcp functions. 
+ */ + +int PvtcpCheckArgs(CommTranspInitArgs *transpArgs); + +void +PvtcpCloseNtf(void *ntfData, + const CommTranspInitArgs *transpArgs, + int inBH); + +void *PvtcpBufAlloc(unsigned int size); +void PvtcpBufFree(void *buf); + +void PvtcpReleaseSocket(PvtcpSock *pvsk); +int PvtcpSockInit(PvtcpSock *pvsk, CommChannel channel); + +void PvtcpProcessAIO(CommOSWork *work); + + +/** + * @brief Packs an IPV6 address stored in an array of four 32-bit elements, + * into two 64-bit variables. + * @param addr IPV6 address as an array of 32-bit elements. + * @param[out] d64_0 pointer to 64-bit variable. + * @param[out] d64_1 pointer to 64-bit variable. + */ + +static inline void +PvtcpI6AddrPack(const unsigned int addr[4], + unsigned long long *d64_0, + unsigned long long *d64_1) +{ + *d64_0 = *(unsigned long long *)&addr[0]; + *d64_1 = *(unsigned long long *)&addr[2]; +} + + +/** + * @brief Unpacks two 64-bit values into an IPV6 address-storing array of + * four 32-bit elements, + * @param[out] addr IPV6 address as an array of 32-bit elements. + * @param d64_0 64-bit value. + * @param d64_1 64-bit value. + */ + +static inline void +PvtcpI6AddrUnpack(unsigned int addr[4], + unsigned long long d64_0, + unsigned long long d64_1) +{ + *(unsigned long long *)&addr[0] = d64_0; + *(unsigned long long *)&addr[2] = d64_1; +} + + +/** + * @brief Verifies whether the argument is a valid socket. If yes, it returns + * the actual pointer. Otherwise, it returns from the calling function. + * WARNING: This macro must ONLY be used in operation functions, as its + * implementation assumes. + * @param handle socket handle to verify. + * @param container state supposed to contain the socket handle. + * @return 32-bit or 64-bit PvtcpSock*, depending on __LP64__ or __LLP64__. 
+ */ + +#if defined(__LP64__) || defined(__LLP64__) + +#define PvtcpGetPvskOrReturn(handle, container) \ + ({ \ + PvtcpState *__state = (PvtcpState *)(container); \ + PvtcpSock *__pvsk = \ + (PvtcpSock *)((handle) ^ (unsigned long long)__state->mask); \ + \ + if (__pvsk->stateID != __state->id) { \ + PVTCP_UNLOCK_DISP_DISCARD_VEC(); \ + CommSvc_Zombify(__state->channel, 0); \ + return; \ + } \ + (__pvsk); \ + }) + +#else // __LP64__ || __LLP64__ + +#define PvtcpGetPvskOrReturn(handle, container) \ + ({ \ + PvtcpState *__state = (PvtcpState *)(container); \ + PvtcpSock *__pvsk = \ + (PvtcpSock *)((unsigned int)(handle) ^ __state->mask); \ + \ + if (__pvsk->stateID != __state->id) { \ + PVTCP_UNLOCK_DISP_DISCARD_VEC(); \ + CommSvc_Zombify(__state->channel, 0); \ + return; \ + } \ + (__pvsk); \ + }) + +#endif // __LP64__ || __LLP64__ + + +/** + * @brief Masks a socket pointer to be passed to the peer module. + * @param pvsk socket pointer to mask. + * @return 64-bit pvtcp socket handle. + */ + +#if defined(__LP64__) || defined(__LLP64__) + +#define PvtcpGetHandle(pvsk) \ + ((unsigned long long)(pvsk) ^ (unsigned long long)(pvsk)->state->mask) + +#else // __LP64__ || __LLP64__ + +#define PvtcpGetHandle(pvsk) \ + ((unsigned int)(pvsk) ^ (pvsk)->state->mask) + +#endif // __LP64__ || __LLP64__ + +#endif // _PVTCP_H_ diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off.c b/arch/arm/mvp/pvtcpkm/pvtcp_off.c new file mode 100644 index 0000000..053d9c2 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp_off.c @@ -0,0 +1,81 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Server (offload) side code. + */ + +#include "pvtcp.h" + +/** + * @brief Allocates the net buffer. + * @param size buffer size + * @return address of buffer or NULL + */ +void * +PvtcpBufAlloc(unsigned int size) +{ + PvtcpOffBuf *buf; + + /* coverity[alloc_fn] */ + /* coverity[var_assign] */ + buf = CommOS_Kmalloc(size + sizeof *buf - sizeof buf->data); + if (buf) { + CommOS_ListInit(&buf->link); + buf->len = (unsigned short)size; + buf->off = 0; + return PvtcpOffBufFromInternal(buf); + } + return NULL; +} + + +/** + * @brief Deallocates given net buffer. + * @param buf buffer to deallocate + * @sideeffect Frees memory + */ + +void +PvtcpBufFree(void *buf) +{ + CommOS_Kfree(PvtcpOffInternalFromBuf(buf)); +} + + +/** + * @brief Initializes the Pvtcp socket offload common fields. + * @param pvsk pvtcp socket. + * @param channel Comm channel this socket is associated with. + * @return 0 if successful, -1 otherwise. + */ + +int +PvtcpOffSockInit(PvtcpSock *pvsk, + CommChannel channel) +{ + int rc = PvtcpSockInit(pvsk, channel); + + pvsk->opFlags = 0; + pvsk->flags = 0; + return rc; +} diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off.h b/arch/arm/mvp/pvtcpkm/pvtcp_off.h new file mode 100644 index 0000000..f183968 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp_off.h @@ -0,0 +1,219 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Offload common definitions.
 *    This file is meant to only be included via pvtcp.h.
 */

#ifndef _PVTCP_OFF_H_
#define _PVTCP_OFF_H_


/* Fields spliced into the OS-specific PvtcpSock structure. */
#define PVTCP_OFF_SOCK_COMMON_FIELDS \
   volatile unsigned int opFlags; /* Saves op codes as bit mask. */ \
   volatile unsigned int flags    /* General purpose flags. */


/* General purpose socket flags (bit indices into PvtcpSock.flags). */

enum PvtcpOffPvskFlags {
   PVTCP_OFF_PVSKF_IPV6_LOOP = 0, /* Used for IPV6 loopback morphing/reset. */
   PVTCP_OFF_PVSKF_SHUT_RD,       /* Set to initiate socket recv shutdown.  */
   PVTCP_OFF_PVSKF_SHUT_WR,       /* Set to initiate socket send shutdown.  */
   PVTCP_OFF_PVSKF_TCP_NODELAY,   /* Caches the TCP_NODELAY socket option.  */
   PVTCP_OFF_PVSKF_TCP_CORK,      /* Caches the TCP_CORK socket option.     */
   PVTCP_OFF_PVSKF_DISCONNECT,    /* Set to indicate connect()/AF_UNSPEC.   */
   PVTCP_OFF_PVSKF_INVALID = 32   /* First invalid bit index (flags is u32). */
};


/*
 * Include OS-dependent PvtcpSock structure and functions.
 */

#if defined(__linux__)
#include "pvtcp_off_linux.h"
#else
#error "Unsupported OS."
#endif


/*
 * Offload packet payload data structure.
 */

typedef struct PvtcpOffBuf {
   CommOSList link;     // Link in socket queue.
+ unsigned short len; + unsigned short off; + char data[1]; +} PvtcpOffBuf; + + +/** + * @brief Returns net buffer given private data structure pointer and based + * on the internal offset pointer + * @param arg pointer to PvtcpOffBuf wrapper structure + * @return address of buffer or NULL + */ + +static inline void * +PvtcpOffBufFromInternalOff(PvtcpOffBuf *arg) +{ + return arg ? + &arg->data[arg->off] : + NULL; +} + + +/** + * @brief Returns net buffer given private data structure pointer + * @param arg pointer to PvtcpOffBuf wrapper structure + * @return address of buffer or NULL + */ + +static inline void * +PvtcpOffBufFromInternal(PvtcpOffBuf *arg) +{ + return arg ? + &arg->data[0] : + NULL; +} + + +/** + * @brief Returns internal data structure given net buffer pointer + * @param arg pointer to PvtcpOffBuf wrapper structure + * @return address of internal data structure or NULL + */ + +static inline PvtcpOffBuf * +PvtcpOffInternalFromBuf(void *arg) +{ + return arg ? + (PvtcpOffBuf *)((char *)arg - offsetof(PvtcpOffBuf, data)) : + NULL; +} + + +/** + * @brief Tests operation flag for AIO processing. + * @param pvsk socket to test operation on. + * @param op operation to test if set. + * @return non-zero if operation set, zero otherwise. + * @sideeffect socket processing by AIO threads affected according to operation. + */ + +static inline int +PvskTestOpFlag(struct PvtcpSock *pvsk, + int op) +{ + return pvsk->opFlags & (1 << op); +} + + +/** + * @brief Sets operation flag for AIO processing; acquires the state lock. + * @param[in,out] pvsk socket to set operation on. + * @param op operation to set. + * @sideeffect socket processing by AIO threads affected according to operation. 
+ */ + +static inline void +PvskSetOpFlag(struct PvtcpSock *pvsk, + int op) +{ + unsigned int ops; + + SOCK_STATE_LOCK(pvsk); + ops = pvsk->opFlags | (1 << op); + pvsk->opFlags = ops; + SOCK_STATE_UNLOCK(pvsk); +} + + +/** + * @brief Resets operation flag for AIO processing; acquires the state lock. + * @param[in,out] pvsk socket to reset operation on. + * @param op operation to reset. + * @sideeffect socket processing by AIO threads affected according to operation. + */ + +static inline void +PvskResetOpFlag(struct PvtcpSock *pvsk, + int op) +{ + unsigned int ops; + + SOCK_STATE_LOCK(pvsk); + ops = pvsk->opFlags & ~(1 << op); + pvsk->opFlags = ops; + SOCK_STATE_UNLOCK(pvsk); +} + + +/** + * @brief Tests general purpose socket flags. + * @param pvsk socket. + * @param flag flag to test. + * @return non-zero if flag set, zero otherwise. + */ + +static inline int +PvskTestFlag(struct PvtcpSock *pvsk, + int flag) +{ + return (flag < PVTCP_OFF_PVSKF_INVALID) && (pvsk->flags & (1 << flag)); +} + + +/** + * @brief Sets general purpose socket flags; acquires the state lock. + * @param[in,out] pvsk socket. + * @param flag flag to set or clear. + * @param onOff whether to set or clear the flag. + */ + +static inline void +PvskSetFlag(struct PvtcpSock *pvsk, + int flag, + int onOff) +{ + unsigned int flags; + + SOCK_STATE_LOCK(pvsk); + if (flag < PVTCP_OFF_PVSKF_INVALID) { + if (onOff) { + flags = pvsk->flags | (1 << flag); + } else { + flags = pvsk->flags & ~(1 << flag); + } + pvsk->flags = flags; + } + SOCK_STATE_UNLOCK(pvsk); +} + + +int PvtcpOffSockInit(PvtcpSock *pvsk, CommChannel channel); + +#endif // _PVTCP_OFF_H_ diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_io_linux.c b/arch/arm/mvp/pvtcpkm/pvtcp_off_io_linux.c new file mode 100644 index 0000000..9958c39 --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp_off_io_linux.c @@ -0,0 +1,831 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. 
All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; see the file COPYING. If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#line 5

/**
 * @file
 *
 * @brief Server (offload) side Linux-specific socket I/O functions.
 */

#include "pvtcp.h"

/*
 * Data.
 */

/* Used to check if OutputAIO()-ing is likely in progress. */

CommOSAtomic PvtcpOutputAIOSection;


/*
 * Large datagram bounce buffer (PVTCP_SOCK_BUF_SIZE < size <= 64K).
 * Only one such buffer is available, shared across cpus via get/put.
 * A preallocated, smaller buffer is used for most over-size 'allocs'.
 * A larger, 64K-buffer may need to be __vmalloc()-ed.
 */

typedef struct LargeDgramBuf {
   unsigned char buf[PVTCP_SOCK_BUF_SIZE << 1];  /* Fast buffer. */
   void *spareBuf;                               /* Dynamically allocated. */
   CommOSMutex lock;   /* Serializes get/put; held while a buffer is out. */
} LargeDgramBuf;

static LargeDgramBuf largeDgramBuf;


/**
 * @brief One time initialization of large datagram buffer.
 */

void
PvtcpOffLargeDgramBufInit(void)
{
   /* spareBuf is allocated lazily by LargeDgramBufGet() on first demand. */
   largeDgramBuf.spareBuf = NULL;
   CommOS_MutexInit(&largeDgramBuf.lock);
}


/**
 * @brief Reserves/holds the large datagram buffer.
 * @param size size of buffer.
 * @sideeffect may sleep until the buffer is available. On success, the
 *    buffer is returned with largeDgramBuf.lock HELD; it must be released
 *    via LargeDgramBufPut().
 * @return address of buffer, or NULL if size too large or allocation failed.
 */

static inline void *
LargeDgramBufGet(int size)
{
   static const unsigned int maxSize = 64 * 1024;

   /* coverity[alloc_fn] */
   /* coverity[var_assign] */

   CommOS_MutexLockUninterruptible(&largeDgramBuf.lock);

   /*
    * Success paths below return WITHOUT unlocking: the caller keeps the
    * mutex until LargeDgramBufPut().  Only the failure path unlocks here.
    * NOTE(review): 'size' is a signed int compared against unsigned
    * operands; a negative size promotes to a huge unsigned value and
    * falls through to the NULL return — presumably intended, confirm.
    */
   if (size <= sizeof largeDgramBuf.buf) {
      return largeDgramBuf.buf;
   }

   if (size <= maxSize) {
      if (!largeDgramBuf.spareBuf) {
         /* Lazily allocate the 64K spare; freed periodically by Put(). */
         largeDgramBuf.spareBuf = __vmalloc(maxSize,
                                            (GFP_ATOMIC | __GFP_HIGHMEM),
                                            PAGE_KERNEL);
      }
      if (largeDgramBuf.spareBuf) {
         return largeDgramBuf.spareBuf;
      }
   }

   CommOS_MutexUnlock(&largeDgramBuf.lock);
   return NULL;
}


/**
 * @brief Releases hold on the large datagram buffer.
 * @param buf buffer to put back; must be the address previously returned
 *    by LargeDgramBufGet() (the lock is still held at entry).
 */

static inline void
LargeDgramBufPut(void *buf)
{
   /* Protected by largeDgramBuf.lock, still held by the caller. */
   static unsigned int spareBufPuts = 0;

   BUG_ON((buf != largeDgramBuf.buf) && (buf != largeDgramBuf.spareBuf));

   if (largeDgramBuf.spareBuf && (++spareBufPuts % 2) == 0) {
      /* Deallocate the spare buffer every now and then. */

      vfree(largeDgramBuf.spareBuf);
      largeDgramBuf.spareBuf = NULL;
   }

   CommOS_MutexUnlock(&largeDgramBuf.lock);
}


/*
 * I/O offload operations.
 */

/**
 * @brief Flow control notification received when more (enough) data was
 *    consumed from a PV socket.
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled
 */

void
PvtcpFlowOp(CommChannel channel,
            void *upperLayerState,
            CommPacket *packet,
            struct kvec *vec,
            unsigned int vecLen)
{
   /* May 'return;' from here if the handle fails validation. */
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);

   PvtcpHoldSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();
   /* Peer consumed data32 bytes: shrink the outstanding received count. */
   CommOS_SubReturnAtomic(&pvsk->rcvdSize, (int)packet->data32);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Outputs bytes to socket.
 * @param channel communication channel with offloader.
 * @param upperLayerState state associated with this channel.
 * @param packet received packet header.
 * @param vec payload buffer descriptors.
 * @param vecLen payload buffer descriptor count.
 * @sideeffect Changes send size/capacity ratio. May schedule AIO processing
 *    for enqueued bytes, if applicable.
 */

void
PvtcpIoOp(CommChannel channel,
          void *upperLayerState,
          CommPacket *packet,
          struct kvec *vec,
          unsigned int vecLen)
{
   int rc;
   unsigned int vecOff;
   PvtcpOffBuf *internalBuf;
   /* May 'return;' from here if the handle fails validation. */
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   unsigned int dataLen = packet->len - sizeof *packet;
   struct msghdr msg = {
      .msg_controllen = 0,
      .msg_control = NULL
   };
   int tmpSize;
   int needSched = 0;

   PvtcpHoldSock(pvsk);
   rc = 0;

   if (!pvsk->peerSockSet || PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR)) {
      PVTCP_UNLOCK_DISP_DISCARD_VEC();
      goto out;
   }

   tmpSize = (int)COMM_OPF_GET_VAL(packet->flags);
   if (tmpSize) {
      /* It was requested that we update deltaAckSize. */

      tmpSize = 1 << tmpSize;
      CommOS_WriteAtomic(&pvsk->deltaAckSize, tmpSize);
   }

   if (sk->sk_type == SOCK_STREAM) {
      unsigned int queueSize = 0;

      /* Fast path: only when the 'out' lock is uncontended (AIO idle). */
      if (!SOCK_OUT_TRYLOCK(pvsk)) {
         if (pvsk->peerSockSet &&
             (sk->sk_state == TCP_ESTABLISHED) &&
             (CommOS_ReadAtomic(&pvsk->queueSize) == 0)) {
            /* Attempt to write directly as many bytes as we can. */

            msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
            rc = kernel_sendmsg(sock, &msg, vec, vecLen, dataLen);

            if (rc == -EAGAIN) {
               rc = 0;
            }
            if (rc >= 0) {
               dataLen = rc;
               for (vecOff = 0; vecOff < vecLen; vecOff++) {
                  if (rc >= vec[vecOff].iov_len) {
                     /* Dispose of all fully consumed buffers. */

                     PvtcpBufFree(vec[vecOff].iov_base);
                     rc -= vec[vecOff].iov_len;
                  } else {
                     /* Place partly consumed / unconsumed buffers in queue. */

                     internalBuf =
                        PvtcpOffInternalFromBuf(vec[vecOff].iov_base);
                     BUG_ON(internalBuf == NULL);
                     if (rc > 0) {
                        internalBuf->len -= rc;
                        internalBuf->off += rc;
                        rc = 0;
                     }
                     CommOS_ListAddTail(&pvsk->queue, &internalBuf->link);
                     queueSize += internalBuf->len;
                  }
               }
               if (queueSize > 0) {
                  CommOS_AddReturnAtomic(&pvsk->queueSize, queueSize);
                  needSched = 1;
               }
            } else {
               /*
                * We never close offload sockets unless told by the PV side,
                * or when the comm goes down. Getting out of sync with PV
                * sockets is a dangerously bad idea.
                * This is very likely an EPIPE/ECONNRESET.
                */

               dataLen = 0;
               for ( vecOff = 0; vecOff < vecLen; vecOff++) {
                  PvtcpBufFree(vec[vecOff].iov_base);
               }
            }
            SOCK_OUT_UNLOCK(pvsk);
         } else {
            /* Lock acquired but direct write not possible: fall through
             * to the enqueue path below. */
            SOCK_OUT_UNLOCK(pvsk);
            goto enqueueBytes;
         }
      } else {
         /*
          * We enqueue the bytes for aio processing. Note that request
          * level ordering is preserved since we're still under the dispatch
          * lock. However, accessing 'queue' must be protected via
          * the state lock to serialize with aio changes.
          * Note that the struct socket *sock may have been released, but here
          * we only access sk which is held (albeit potentially orphaned).
          */

         CommOSList bufList;

         /* The goto above jumps past bufList's declaration; bufList is
          * (re)initialized below before use, so this is safe. */
enqueueBytes:
         dataLen = 0;
         if (pvsk->peerSockSet && (sk->sk_state == TCP_ESTABLISHED)) {
            queueSize = 0;
            CommOS_ListInit(&bufList);
            for (vecOff = 0; vecOff < vecLen; vecOff++) {
               internalBuf = PvtcpOffInternalFromBuf(vec[vecOff].iov_base);
               BUG_ON(internalBuf == NULL);
               CommOS_ListAddTail(&bufList, &internalBuf->link);
               queueSize += internalBuf->len;
            }

            if (queueSize > 0) {
               SOCK_STATE_LOCK(pvsk);
               CommOS_ListSpliceTail(&pvsk->queue, &bufList);
               SOCK_STATE_UNLOCK(pvsk);
               CommOS_AddReturnAtomic(&pvsk->queueSize, queueSize);
               needSched = 1;
            }
         } else {
            for ( vecOff = 0; vecOff < vecLen; vecOff++) {
               PvtcpBufFree(vec[vecOff].iov_base);
            }
         }
      }
   } else { /* SOCK_DGRAM || SOCK_RAW */
      struct sockaddr *addr;
      struct sockaddr_in sin;
      struct sockaddr_in6 sin6;
      int addrLen;

      /*
       * Non-stream sockets don't use the send queue, packets are sent
       * directly and they must _not_ be merged.
       */

      if (sk->sk_family == AF_INET) {
         sin.sin_family = AF_INET;
         sin.sin_port = packet->data16;
         addr = (struct sockaddr *)&sin;
         addrLen = sizeof sin;
         sin.sin_addr.s_addr = (unsigned int)packet->data64ex;
         PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, 0);
      } else { /* AF_INET6 */
         sin6.sin6_family = AF_INET6;
         sin6.sin6_port = packet->data16;
         addr = (struct sockaddr *)&sin6;
         addrLen = sizeof sin6;
         PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex,
                                       &packet->data64ex2, 0);
         PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0],
                           packet->data64ex, packet->data64ex2);
      }
      msg.msg_flags = packet->data32 | MSG_DONTWAIT | MSG_NOSIGNAL;
      msg.msg_name = addr;
      msg.msg_namelen = addrLen;

      if (pvsk->peerSockSet) {
         /*
          * Flow-control already done, based on PVTCP_SOCK_SAFE_RCVSIZE, just
          * as with stream sockets. Meaning that we block the senders in the
          * guest (if applicable).
          *
          * The send buffer size was set high enough, at socket creation time,
          * to avoid dropping datagrams during the (non-blocking) write.
          */

         if (vecLen == 0) {
            /*
             * Allow zero-sized datagram sending.
             */

            struct kvec dummy = { .iov_base = NULL, .iov_len = 0 };

            rc = kernel_sendmsg(sock, &msg, &dummy, 0, 0);
            if (rc != dummy.iov_len) {
#if defined(PVTCP_FULL_DEBUG)
               CommOS_Debug(("%s: Dgram [0x%p] sent [%d], expected [%d]\n",
                             __FUNCTION__, sk, rc, dummy.iov_len));
#endif
               if (rc == -EAGAIN) { /* As if lost on the wire. */
                  rc = 0;
               }
            }
         }

         for (vecOff = 0; vecOff < vecLen; vecOff++) {
            /* One sendmsg per kvec: datagrams must not be merged. */
            rc = kernel_sendmsg(sock, &msg, &vec[vecOff], 1,
                                vec[vecOff].iov_len);
            PvtcpBufFree(vec[vecOff].iov_base);
            if (rc != vec[vecOff].iov_len) {
#if defined(PVTCP_FULL_DEBUG)
               CommOS_Debug(("%s: Dgram [0x%p] sent [%d], expected [%d]\n",
                             __FUNCTION__, sk, rc, vec[vecOff].iov_len));
#endif
               if (rc == -EAGAIN) { /* As if lost on the wire. */
                  rc = 0;
               }
            }
         }

         if (COMM_OPF_TEST_ERR(packet->flags)) {
            /* PV client wants an automatic bind. */

            PvskSetOpFlag(pvsk, PVTCP_OP_BIND);
            PvtcpSchedSock(pvsk);
         }
      } else {
         for ( vecOff = 0; vecOff < vecLen; vecOff++) {
            PvtcpBufFree(vec[vecOff].iov_base);
         }
      }
   }
   CommSvc_DispatchUnlock(channel);

out:
   if (rc < 0) {
      pvsk->err = -rc;
   }
   tmpSize = CommOS_AddReturnAtomic(&pvsk->sentSize, dataLen);
   if ((tmpSize >= CommOS_ReadAtomic(&pvsk->deltaAckSize)) ||
       pvsk->err || needSched) {
      /* Only schedule if OutputAIO() does not appear to be running. */
      if (CommOS_AddReturnAtomic(&PvtcpOutputAIOSection, 1) == 1) {
         /* OutputAIO() (likely) not running. */

         PvtcpSchedSock(pvsk);
      }
      CommOS_SubReturnAtomic(&PvtcpOutputAIOSection, 1);
   }

   PvtcpPutSock(pvsk);
}


/*
 * AI/O functions called from the main AIO processing function.
 */

/**
 * @brief Processes socket flow control acks and error notifications in an
 *    AIO thread. This function is called with the socket 'in' lock taken.
 * @param[in,out] pvsk socket to process.
 * @param err non-zero if offload was closed, zero otherwise.
 * @sideeffect May resume PV socket sending or raise errors.
 */

void
PvtcpFlowAIO(PvtcpSock *pvsk,
             int err)
{
   CommPacket packet = { .flags = 0 };
   unsigned long long timeout;
   int tmpSize;

   COMM_OPF_CLEAR_ERR(packet.flags);
   packet.data32 = PVTCP_FLOW_OP_INVALID_SIZE;
   if (pvsk->err || err) {
      /* Error path: report (and atomically clear) the socket error. */
      COMM_OPF_SET_ERR(packet.flags);
      packet.data32ex = !pvsk->err ? 0 : xchg(&pvsk->err, 0);
      if (!packet.data32ex) {
         packet.data32ex = -err;
      }
#if defined(PVTCP_FULL_DEBUG)
      CommOS_Debug(("%s: Sending socket error [%u] on [0x%p -> 0x%0x].\n",
                    __FUNCTION__, packet.data32ex, pvsk,
                    (unsigned)(pvsk->peerSock)));
#endif
   } else {
      /* Ack path: report consumed bytes once they reach deltaAckSize. */
      SOCK_STATE_LOCK(pvsk);
      tmpSize = CommOS_ReadAtomic(&pvsk->deltaAckSize);
      if (CommOS_ReadAtomic(&pvsk->sentSize) >= tmpSize) {
         if ((SkFromPvsk(pvsk)->sk_type != SOCK_STREAM) &&
             !sock_writeable(SkFromPvsk(pvsk))) {
            /* Don't send dgram flow op until WriteSpaceCB tells us to do so. */

            packet.data32 = PVTCP_FLOW_OP_INVALID_SIZE;
         } else {
            packet.data32 = CommOS_ReadAtomic(&pvsk->sentSize);
            CommOS_WriteAtomic(&pvsk->sentSize, 0);
            /* Halve deltaAckSize down to the small-ack floor over time. */
            if (tmpSize > (1 << (PVTCP_SOCK_SMALL_ACK_ORDER + 1))) {
               tmpSize >>= 1;
               CommOS_WriteAtomic(&pvsk->deltaAckSize, tmpSize);
            }
         }
      }
      SOCK_STATE_UNLOCK(pvsk);
      packet.data32ex = 0;
   }

   /* Write the flow packet only if there is something to report. */
   if (((packet.data32 != PVTCP_FLOW_OP_INVALID_SIZE) ||
        COMM_OPF_TEST_ERR(packet.flags)) &&
       pvsk->peerSockSet) {
      packet.len = sizeof packet;
      packet.opCode = PVTCP_OP_FLOW;
      packet.data64 = pvsk->peerSock;
      timeout = COMM_MAX_TO;
      CommSvc_Write(pvsk->channel, &packet, &timeout);
   }
}


/**
 * @brief Processes queued socket output in an AIO thread. This function is
 *    called with the socket 'out' lock taken.
 * @param[in,out] pvsk socket to process.
 * @sideeffect Changes send size/capacity ratio.
+ */ + +void +PvtcpOutputAIO(PvtcpSock *pvsk) +{ + struct sock *sk; + struct socket *sock; + PvtcpOffBuf *internalBuf; + PvtcpOffBuf *tmp; + CommOSList queue; +#define VEC_SIZE 32 + struct kvec vec[VEC_SIZE]; + unsigned int vecLen; + unsigned int dataLen; + struct msghdr msg = { + .msg_controllen = 0, + .msg_control = NULL, + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL + }; + int queueDelta = 0; + int done = 0; + int rc; + + sk = SkFromPvsk(pvsk); + if (!sk) { + /* This is an error socket, we don't process it. */ + + return; + } + + sock = sk->sk_socket; + +again: + CommOS_AddReturnAtomic(&PvtcpOutputAIOSection, 1); + while (!done && CommOS_ReadAtomic(&pvsk->queueSize) > 0) { + /* Note: only stream sockets can have a positive send queue size. + * Similar to PvtcpIoOp: we must check if sock (struct socket *) is + * still valid. + */ + + /* Take the current queue private. */ + + SOCK_STATE_LOCK(pvsk); + queue = pvsk->queue; + if (CommOS_ListEmpty(&queue)) { + SOCK_STATE_UNLOCK(pvsk); + return; + } + queue.next->prev = &queue; + queue.prev->next = &queue; + CommOS_ListInit(&pvsk->queue); + SOCK_STATE_UNLOCK(pvsk); + + vecLen = 0; + dataLen = 0; + + if (sk->sk_state == TCP_ESTABLISHED) { + CommOS_ListForEach(&queue, internalBuf, link) { + if (vecLen == VEC_SIZE) { + break; + } + vec[vecLen].iov_base = PvtcpOffBufFromInternalOff(internalBuf); + vec[vecLen].iov_len = internalBuf->len; + dataLen += internalBuf->len; + vecLen++; + } + + rc = kernel_sendmsg(sock, &msg, vec, vecLen, dataLen); + + if (rc == -EAGAIN) { + rc = 0; + } + if (rc >= 0) { + /* If we wrote anything, dispose of the buffers in question. 
*/ + + queueDelta = rc; + if (queueDelta > 0) { + CommOS_ListForEachSafe(&queue, internalBuf, tmp, link) { + if (rc >= internalBuf->len) { + rc -= internalBuf->len; + CommOS_ListDel(&internalBuf->link); + PvtcpBufFree(PvtcpOffBufFromInternal(internalBuf)); + } else { + internalBuf->len -= rc; + internalBuf->off += rc; + break; + } + } + } + if (!CommOS_ListEmpty(&queue)) { + /* Add the remaining bytes to the beginning of the queue. */ + + SOCK_STATE_LOCK(pvsk); + CommOS_ListSplice(&pvsk->queue, &queue); + SOCK_STATE_UNLOCK(pvsk); + } + if (queueDelta == 0) { + /* Bail out if no bytes written, WriteSpaceCB() will resched. */ + + done = 1; + break; + } + CommOS_AddReturnAtomic(&pvsk->sentSize, queueDelta); + CommOS_SubReturnAtomic(&pvsk->queueSize, queueDelta); + } else { + /* + * Very likely, this is due to the socket being closed, so fine. + */ + + goto discardOutput; + } + } else { + /* Dispose of all buffers in the queue and mark it empty. */ + +discardOutput: + if (!CommOS_ListEmpty(&queue)) { + CommOS_ListForEachSafe(&queue, internalBuf, tmp, link) { + CommOS_ListDel(&internalBuf->link); + PvtcpBufFree(PvtcpOffBufFromInternal(internalBuf)); + } + } + CommOS_WriteAtomic(&pvsk->queueSize, 0); + break; + } + } + if (CommOS_SubReturnAtomic(&PvtcpOutputAIOSection, 1) > 0) { + if (!done) { + goto again; + } + } + + if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR)) { + kernel_sock_shutdown(sock, SHUT_WR); + PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR, 0); + } +#undef VEC_SIZE +} + + +/** + * @brief Processes socket input in an AIO thread. This function is + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket to process. + * @param[in,out] perCpuBuf per-cpu socket read buffer. + * @return zero if eof was not detected, non-zero otherwise. + * @sideeffect Changes receive size/capacity ratio. 
 */

int
PvtcpInputAIO(PvtcpSock *pvsk,
              void *perCpuBuf)
{
   struct sock *sk;
   struct socket *sock;
   int err = 0;
   CommPacket packet = {
      .opCode = PVTCP_OP_IO
   };
   unsigned long long timeout;

   sk = SkFromPvsk(pvsk);
   if (!sk) {
      /* IO processing is skipped on socket create-error sockets. */

      return -1;
   }
   if (!perCpuBuf) {
      /* No read buffer. */

      return -1;
   }

   sock = sk->sk_socket;
   packet.data64 = pvsk->peerSock;
   COMM_OPF_CLEAR_ERR(packet.flags);

   if (sk->sk_state == TCP_LISTEN) {
      /* Process stream listen 'input'. */

      packet.len = sizeof packet;
      packet.data16 = sk->sk_ack_backlog;
      timeout = COMM_MAX_TO;
      if (pvsk->peerSockSet) {
         CommSvc_Write(pvsk->channel, &packet, &timeout);
         CommOS_Debug(("%s: Listen sock [0x%p] 'ack_backlog' [%hu].\n",
                       __FUNCTION__, sk, packet.data16));
      }
   } else {
      /* Common path for both stream and datagram sockets. */

      int rc;
      int tmpSize;
      struct kvec vec[2];
      void *ioBuf = perCpuBuf;
      struct kvec *inVec;
      unsigned int inVecLen;
      unsigned int iovOffset = 0;
      unsigned int inputSize = 0;
      unsigned int coalescingSize = PVTCP_SOCK_RCVSIZE >> 2;
      struct sockaddr_in sin = { .sin_family = AF_INET };
      struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6 };
      struct msghdr msg = {
         .msg_controllen = 0,
         .msg_control = NULL,
         .msg_flags = MSG_DONTWAIT
      };
      int tmpFlags = msg.msg_flags;
      PvtcpDgramPseudoHeader dgramHeader;

      /* Read until the peer's outstanding (unacked) bytes hit the cap. */
      tmpSize = CommOS_ReadAtomic(&pvsk->rcvdSize);
      while ((tmpSize < PVTCP_SOCK_SAFE_RCVSIZE) && pvsk->peerSockSet) {
         if (ioBuf != perCpuBuf) {
            /* Previous iteration used the shared large buffer; release it. */
            LargeDgramBufPut(ioBuf);
            ioBuf = perCpuBuf;
         }
         vec[0].iov_base = (char *)ioBuf;

         if (sk->sk_type == SOCK_STREAM) {
            if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_RD)) {
               break;
            }

            msg.msg_name = NULL;
            msg.msg_namelen = 0;
            vec[0].iov_len = PVTCP_SOCK_STREAM_BUF_SIZE;
         } else { /* SOCK_DGRAM || SOCK_RAW */
            if (sk->sk_family == AF_INET) {
               msg.msg_name = &sin;
               msg.msg_namelen = sizeof sin;
            } else {
               msg.msg_name = &sin6;
               msg.msg_namelen = sizeof sin6;
            }

            /*
             * Check if datagram larger than the per cpu buffer; if so,
             * allocate a large enough buffer. This should happen quite
             * rarely, as well-behaved applications don't rely on IP
             * fragmentation to accommodate large sizes.
             */

            /* MSG_PEEK|MSG_TRUNC probe: returns the full datagram size. */
            vec[0].iov_len = 1;
            msg.msg_flags |= (MSG_PEEK | MSG_TRUNC);
            rc = kernel_recvmsg(sock, &msg, vec, 1, 1, msg.msg_flags);
            if (rc < 0) {
               break;
            }
            msg.msg_flags = tmpFlags;
            if (rc > PVTCP_SOCK_DGRAM_BUF_SIZE) {
               /*
                * Track large datagram allocations, whether allocation succeeds
                * or not. No need for atomic overhead, approximating is OK.
                */

               pvtcpOffDgramAllocations++;
               ioBuf = LargeDgramBufGet(rc);
               if (!ioBuf) {
                  /*
                   * We reset it to the per-cpu buffer such that we can still
                   * consume the datagram in the next recvmsg, which will set
                   * MSG_TRUNC so we won't put it on the channel.
                   */

                  CommOS_Debug(("%s: Dropping datagram (alloc failure)!\n",
                                __FUNCTION__));
                  ioBuf = perCpuBuf;
                  vec[0].iov_len = PVTCP_SOCK_DGRAM_BUF_SIZE;
               } else {
                  vec[0].iov_len = rc;
               }
            } else {
               vec[0].iov_len = PVTCP_SOCK_DGRAM_BUF_SIZE;
            }
            vec[0].iov_base = (char *)ioBuf;
         }

         rc = kernel_recvmsg(sock, &msg, vec, 1, vec[0].iov_len, msg.msg_flags);
         if (rc < 0) {
            break;
         }

         if ((rc == 0) && (sk->sk_type == SOCK_STREAM)) {
            /* EOF on a stream socket: remember it and report reset. */
            PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_RD, 1);
            err = -ECONNRESET;
            break;
         }

         if (msg.msg_flags & MSG_TRUNC) {
            /* Oversize datagram we could not buffer: drop it silently. */
            continue;
         }

         inputSize += rc;
         tmpSize = CommOS_AddReturnAtomic(&pvsk->rcvdSize, rc);
         if (tmpSize >= PVTCP_SOCK_LARGE_ACK_WM) {
            /* Ask the peer to switch to larger ack granularity. */
            COMM_OPF_SET_VAL(packet.flags, PVTCP_SOCK_LARGE_ACK_ORDER);
         } else {
            COMM_OPF_SET_VAL(packet.flags, 0);
         }

         if (sk->sk_type == SOCK_STREAM) {
            vec[0].iov_base = ioBuf;
            vec[0].iov_len = rc;
            inVecLen = 1;
            packet.len = sizeof packet + rc;
         } else { /* SOCK_DGRAM || SOCK_RAW */
            /* Prepend sender address/port as a pseudo-header. */
            if (sk->sk_family == AF_INET) {
               dgramHeader.d0 = (unsigned long long)sin.sin_port;
               PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr);
               dgramHeader.d1 = (unsigned long long)sin.sin_addr.s_addr;
            } else { /* AF_INET6 */
               dgramHeader.d0 = (unsigned long long)sin6.sin6_port;
               PvtcpResetLoopbackInet6(pvsk, &sin6.sin6_addr);
               PvtcpI6AddrPack(&sin6.sin6_addr.s6_addr32[0],
                               &dgramHeader.d1, &dgramHeader.d2);
            }
            vec[0].iov_base = &dgramHeader;
            vec[0].iov_len = sizeof dgramHeader;
            vec[1].iov_base = ioBuf;
            vec[1].iov_len = rc;
            inVecLen = 2;
            packet.len = sizeof packet + sizeof dgramHeader + rc;
         }

         inVec = vec;
         timeout = COMM_MAX_TO;
         rc = CommSvc_WriteVec(pvsk->channel, &packet,
                               &inVec, &inVecLen, &timeout, &iovOffset);
         if (rc != packet.len) {
            CommOS_Log(("%s: BOOG -- WROTE INCOMPLETE PACKET [%u->%d]!\n",
                        __FUNCTION__, packet.len, rc));
            break;
         }

         /*
          * If the write failed, we could print a warning. But if this
          * happened, the comm channel went down.
          */
         if (inputSize >= coalescingSize) {
            PvtcpSchedSock(pvsk); /* We must schedule ourselves back in. */
            break;
         }
      }
      if (ioBuf != perCpuBuf) {
         LargeDgramBufPut(ioBuf);
      }
   }
   return err;
}
diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c
new file mode 100644
index 0000000..047547f
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.c
@@ -0,0 +1,2858 @@
/*
 * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
 *
 * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
+ * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Server (offload) side Linux-specific functions and callbacks. + */ + + +#include "pvtcp.h" + +#if defined(CONFIG_NET_NS) +#include +#include +#endif + +#include +#include +#include +#include +#include + + +/* The PVSock address (127.238.0.1) in binary form, host byte order. */ +#define PVTCP_PVSOCK_ADDR 0x7fee0001 +#define PVTCP_PVSOCK_NET 0x7fee0000 +#define PVTCP_PVSOCK_MASK 0x000000ff + +/* From mvpkm */ +extern uid_t Mvpkm_vmwareUid; + +/* + * Credentials to back socket file pointer. Used in Android ICS network + * data usage accounting to bill guest data to MVP. + */ +static struct cred _cred; +static struct file _file = { + .f_cred = &_cred, +}; + +/* From pvtcp_off_io_linux.c */ +extern CommOSAtomic PvtcpOutputAIOSection; +extern void PvtcpOffLargeDgramBufInit(void); + +static const unsigned short portRangeBase = 7000; +static const unsigned int portRangeSize = 31; +static int hooksRegistered = 0; + +static inline int PvtcpTestPortIndexBit(unsigned int addr, + unsigned int portIdx); +/** + * @note + * Netfilter hooks: + * + * We decide to drop each packet based on the following criteria: + * 1) Destination address is to a pvsock address AND + * 3) (NOT(uid == 0 OR uid == vmwareUid)) OR + * 4) (type == UDP AND NOT(port-in-pvsock-range))) + */ + +/** + * @brief Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. + * @param skb skbuff + * @param inet6 is this socket ipv4 or ipv6? + * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise + */ +static inline unsigned int +PvsockNfHook(struct sk_buff *skb, int inet6) +{ + uid_t uid; + unsigned int port; + struct socket *sock; + unsigned int addr = inet6 ? 
+ ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]) : + ntohl(ip_hdr(skb)->daddr); + + if (likely((addr ^ PVTCP_PVSOCK_NET) & ~PVTCP_PVSOCK_MASK)) { + /* Not a pvsock address. */ + return NF_ACCEPT; + } + + sock = skb->sk->sk_socket; + if (unlikely(!sock)) { + return NF_ACCEPT; + } + + /* + * Guest (kernel) sockets can send to other guest sockets, + * Root can send to whoever it wants, no checks. + */ + uid = (sock->file ? sock->file->f_cred->uid : 0); + if (uid == 0 || (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM)) { + return NF_ACCEPT; + } + + /* + * Only vmware can send to guest. + */ + if (likely(uid == Mvpkm_vmwareUid)) { + if (sock->type == SOCK_DGRAM) { + /* + * Deny sending to UDP port in pvsock range, if receiving socket was + * not created by the guest with this pvsock address. Drop all other + * UDP packets. + */ + port = ntohs(udp_hdr(skb)->dest) - portRangeBase; + if ((port < portRangeSize) && + PvtcpTestPortIndexBit(htonl(addr), port)) { + return NF_ACCEPT; + } + return NF_DROP; + } + /* + * TCP is all-good. + */ + return NF_ACCEPT; + } + + return NF_DROP; +} + + +/** + * @brief AF_INET4 Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. + * @param hooknum netfilter hook number + * @param skb skbuff + * @param in rx net_device + * @param out out net_device + * @param okfn ignored + * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise + */ +static unsigned int +Inet4NfHook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return PvsockNfHook(skb, 0); +} + +/** + * @brief AF_INET6 Netfilter hook. Restricts LOCAL_OUT packets. + * See note above to filter policy. 
+ * @param hooknum netfilter hook number
+ * @param skb skbuff
+ * @param in rx net_device
+ * @param out out net_device
+ * @param okfn ignored
+ * @return NF_ACCEPT if the packet is allowed through, NF_DROP otherwise
+ */
+static unsigned int
+Inet6NfHook(unsigned int hooknum,
+            struct sk_buff *skb,
+            const struct net_device *in,
+            const struct net_device *out,
+            int (*okfn)(struct sk_buff *))
+{
+   /*
+    * pvsock addresses are IPv4-only; an IPv6 packet can only target one
+    * through an IPv4-mapped destination (::ffff:a.b.c.d).
+    */
+   if (!ipv6_addr_v4mapped(&ipv6_hdr(skb)->daddr)) {
+      /* Not ipv4-mapped, so not a pvsock address. */
+      return NF_ACCEPT;
+   }
+
+   return PvsockNfHook(skb, 1);
+}
+
+
+/* Both hooks filter locally generated (LOCAL_OUT) traffic at security priority. */
+static struct nf_hook_ops netfilterHooks[] = {
+   {
+      .hook = Inet4NfHook,
+      .owner = THIS_MODULE,
+      .pf = PF_INET,
+      .hooknum = NF_INET_LOCAL_OUT,
+      .priority = NF_IP_PRI_SECURITY
+   },
+   {
+      .hook = Inet6NfHook,
+      .owner = THIS_MODULE,
+      .pf = PF_INET6,
+      .hooknum = NF_INET_LOCAL_OUT,
+      .priority = NF_IP6_PRI_SECURITY
+   }
+};
+
+
+#if !defined(CONFIG_SYSFS)
+#error "The pvTCP offload module requires sysfs!"
+#endif
+
+/*
+ * State kobject, attributes and type.
+ */
+
+typedef struct PvtcpStateKObj {
+   struct kobject kobj;           /* Embedded kobject; owns this struct's lifetime. */
+   CommTranspInitArgs transpArgs; /* Transport args of the channel (id, capacity, type). */
+   unsigned int pvsockAddr;       /* Channel's 127.238.0.N address (see StateAlloc). */
+   int useNS;                     /* Use the per-VM net namespace (rw via sysfs 'use_ns'). */
+   int haveNS;                    /* A non-initial namespace was found for this VM. */
+} PvtcpStateKObj;
+
+
+typedef struct PvtcpStateKObjAttr {
+   struct attribute attr;
+   ssize_t (*show)(PvtcpStateKObj *stateKObj, char *buf);
+   ssize_t (*store)(PvtcpStateKObj *stateKObj, const char *buf, size_t count);
+} PvtcpStateKObjAttr;
+
+
+/**
+ * @brief Releases a state kobject (kobject release callback; frees the
+ *        containing PvtcpStateKObj).
+ * @param kobj (embedded) state kobject.
+ */
+
+static void
+StateKObjRelease(struct kobject *kobj)
+{
+   kfree(container_of(kobj, PvtcpStateKObj, kobj));
+}
+
+
+/**
+ * @brief Sysfs show function for all pvtcp attributes.
+ * @param kobj (embedded) state kobject.
+ * @param attr pvtcp attribute to show.
+ * @param buf output buffer.
+ * @return number of bytes written or negative error code.
+ */
+
+static ssize_t
+StateKObjShow(struct kobject *kobj,
+              struct attribute *attr,
+              char *buf)
+{
+   PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr);
+   PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj);
+
+   if (stateAttr->show) {
+      return stateAttr->show(stateKObj, buf);
+   }
+
+   /* No show handler: report I/O error per sysfs convention. */
+   return -EIO;
+}
+
+
+/**
+ * @brief Sysfs store function for all pvtcp attributes.
+ * @param kobj (embedded) state kobject.
+ * @param attr pvtcp attribute to show.
+ * @param buf input buffer.
+ * @param count input buffer length.
+ * @return number of bytes consumed or negative error code.
+ */
+
+static ssize_t
+StateKObjStore(struct kobject *kobj,
+               struct attribute *attr,
+               const char *buf,
+               size_t count)
+{
+   PvtcpStateKObjAttr *stateAttr = container_of(attr, PvtcpStateKObjAttr, attr);
+   PvtcpStateKObj *stateKObj = container_of(kobj, PvtcpStateKObj, kobj);
+
+   if (stateAttr->store) {
+      return stateAttr->store(stateKObj, buf, count);
+   }
+
+   /* No store handler: report I/O error per sysfs convention. */
+   return -EIO;
+}
+
+
+static struct sysfs_ops StateKObjSysfsOps = {
+   .show = StateKObjShow,
+   .store = StateKObjStore
+};
+
+
+/**
+ * @brief Show function for the comm_info pvtcp attribute.
+ * @param stateKObj state kobject.
+ * @param buf output buffer.
+ * @return number of bytes written or negative error code.
+ */
+
+static ssize_t
+StateKObjCommInfoShow(PvtcpStateKObj *stateKObj,
+                      char *buf)
+{
+   unsigned int typeHash;
+
+   /*
+    * In the offload module, the transport arguments' type field has been
+    * assigned the matching index in the versions array at probe time.
+    * Recover and print out the type hash.
+    */
+
+   typeHash = CommTransp_GetType(pvtcpVersions[stateKObj->transpArgs.type]);
+
+   return snprintf(buf, PAGE_SIZE, "ID=%u,%u\nCAPACITY=%u\nTYPE=0x%0x\n",
+                   stateKObj->transpArgs.id.d32[0],
+                   stateKObj->transpArgs.id.d32[1],
+                   stateKObj->transpArgs.capacity,
+                   typeHash);
+}
+
+
+/**
+ * @brief Show function for the pvsock_addr pvtcp attribute.
+ * @param stateKObj state kobject.
+ * @param buf output buffer.
+ * @return number of bytes written or negative error code.
+ */
+
+static ssize_t
+StateKObjPvsockAddrShow(PvtcpStateKObj *stateKObj,
+                        char *buf)
+{
+   /* Pun the 32-bit address into bytes to print dotted-quad notation. */
+   union {
+      unsigned int raw;
+      unsigned char bytes[4];
+   } addr;
+
+   addr.raw = stateKObj->pvsockAddr;
+   return snprintf(buf, PAGE_SIZE, "%u.%u.%u.%u\n",
+                   (unsigned int)addr.bytes[0], (unsigned int)addr.bytes[1],
+                   (unsigned int)addr.bytes[2], (unsigned int)addr.bytes[3]);
+}
+
+
+/**
+ * @brief Show function for the use_ns pvtcp attribute.
+ * @param stateKObj state kobject.
+ * @param buf output buffer.
+ * @return number of bytes written or negative error code.
+ */
+
+static ssize_t
+StateKObjUseNSShow(PvtcpStateKObj *stateKObj,
+                   char *buf)
+{
+   return snprintf(buf, PAGE_SIZE, "%d\n", stateKObj->useNS);
+}
+
+
+/**
+ * @brief Store function for the use_ns pvtcp attribute.
+ * @param stateKObj state kobject.
+ * @param buf input buffer.
+ * @param count input buffer length.
+ * @return number of bytes consumed or negative error code.
+ */ + +static ssize_t +StateKObjUseNSStore(PvtcpStateKObj *stateKObj, + const char *buf, + size_t count) +{ + int rc = -EINVAL; + + /* coverity[secure_coding] */ + if (stateKObj->haveNS && (sscanf(buf, "%d", &stateKObj->useNS) == 1)) { + stateKObj->useNS = !!stateKObj->useNS; + rc = count; + } + + return rc; +} + + +static PvtcpStateKObjAttr stateKObjCommInfoAttr = + __ATTR(comm_info, 0444, StateKObjCommInfoShow, NULL); + +static PvtcpStateKObjAttr stateKObjPvsockAddrAttr = + __ATTR(pvsock_addr, 0444, StateKObjPvsockAddrShow, NULL); + +static PvtcpStateKObjAttr stateKObjUseNSAttr = + __ATTR(use_ns, 0644, StateKObjUseNSShow, StateKObjUseNSStore); + + +static struct attribute *stateKObjDefaultAttrs[] = { + &stateKObjCommInfoAttr.attr, + &stateKObjPvsockAddrAttr.attr, + &stateKObjUseNSAttr.attr, + NULL +}; + + +static struct kobj_type stateKType = { + .sysfs_ops = &StateKObjSysfsOps, + .release = StateKObjRelease, + .default_attrs = stateKObjDefaultAttrs +}; + + +/* + * Initialization of module entry and exit callbacks. + */ + +static int Init(void *args); +static void Exit(void); + +COMM_OS_MOD_INIT(Init, Exit); + + +/* + * AIO socket read buffers, stats and other global state. + */ + +static CommOSMutex globalLock; +static char perCpuBuf[NR_CPUS][PVTCP_SOCK_BUF_SIZE]; + +#define PVTCP_OFF_MAX_LB_ADDRS 255 +static unsigned int loopbackAddrs[PVTCP_OFF_MAX_LB_ADDRS] = { + 0xffffffff, // Network address always on, all ports allowed. + 0x7fffffff // Host address not yet on, all ports allowed. + // All the rest zeroed out. 
+}; + +static const unsigned int loopbackReserved = 0x00000001 << 31; + + +#define PvtcpTestLoopbackBit(entry, mask) \ + ((entry) & (mask)) + +#define PvtcpSetLoopbackBit(entry, mask) \ + ((entry) |= (mask)) + +#define PvtcpResetLoopbackBit(entry, mask) \ + ((entry) &= ~(mask)) + + +static inline int +PvtcpTestPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + return PvtcpTestLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +static inline void +PvtcpSetPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + PvtcpSetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +static inline void +PvtcpResetPortIndexBit(unsigned int addr, + unsigned int portIdx) +{ + PvtcpResetLoopbackBit(loopbackAddrs[*((unsigned char *)&addr + 3)], + BIT(portIdx)); +} + + +unsigned int pvtcpLoopbackOffAddr; + +unsigned long long pvtcpOffDgramAllocations = 0; + +/* + * Destructor shim addresses and function pointer + */ + +extern void asmDestructorShim(struct sock*); + + +/* + * Functions. + */ + +/** + * @brief Release a socket, NULLing out the fake file field to avoid confusing + * Linux on the release path + * @param sock socket to release + */ +static void +SockReleaseWrapper(struct socket *sock) +{ + sock->file = NULL; + sock_release(sock); +} + +/** + * @brief Gets a new loopback address in the 127.238.0.255 network. + * Note that the first address, 127.238.0.1, is always the host's. + * @return new address or -1U if none is available. 
+ */
+
+static unsigned int
+GetLoopbackAddr(void)
+{
+   /* Static template is mutated below; safe because callers hold globalLock. */
+   static unsigned char addrTempl[4] = { 127, 238, 0, 0 };
+   unsigned int rc = -1U;
+   unsigned int idx;
+   struct socket *sock;
+
+   CommOS_MutexLock(&globalLock);
+   /* Index 0 is the network address; start scanning at 1. */
+   for (idx = 1; idx < PVTCP_OFF_MAX_LB_ADDRS; idx++) {
+      if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) {
+         addrTempl[3] = (unsigned char)idx;
+         /* Builds 127.238.0.idx in network byte order via the byte array. */
+         memcpy(&rc, addrTempl, sizeof rc);
+
+         /* Create a dgram socket to configure/bring-up the lo:N interface. */
+
+         if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) {
+            int err;
+            struct sockaddr_in sin = {
+               .sin_family = AF_INET,
+               .sin_addr = { .s_addr = rc }
+            };
+            struct ifreq ifr = {
+               .ifr_flags = IFF_UP
+            };
+
+            snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx);
+            memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr);
+            err = kernel_sock_ioctl(sock, SIOCSIFADDR, (unsigned long)&ifr);
+            sock_release(sock);
+            if (err) {
+               CommOS_Log(("%s: Could not set loopback address (ioctl)!\n",
+                           __FUNCTION__));
+               rc = -1U;
+               continue; /* Try next address. */
+            } else {
+               /* Mark the entry in-use only after the interface is up. */
+               PvtcpSetLoopbackBit(loopbackAddrs[idx], loopbackReserved);
+               CommOS_Debug(("%s: Allocated loopback address [%u.%u.%u.%u].\n",
+                             __FUNCTION__,
+                             addrTempl[0], addrTempl[1],
+                             addrTempl[2], addrTempl[3]));
+               break;
+            }
+         } else {
+            /* Socket creation failed; no point retrying other indices. */
+            CommOS_Log(("%s: Could not set loopback address (create)!\n",
+                        __FUNCTION__));
+            rc = -1U;
+            break;
+         }
+      }
+   }
+   if (idx == PVTCP_OFF_MAX_LB_ADDRS) {
+      CommOS_Log(("%s: loopback address range exceeded!\n", __FUNCTION__));
+   }
+
+   CommOS_MutexUnlock(&globalLock);
+   return rc;
+}
+
+
+/**
+ * @brief Puts back a loopback address in the 127.238.0.255 network.
+ * @param uaddr address to put back.
+ */
+
+static void
+PutLoopbackAddr(unsigned int uaddr)
+{
+   const unsigned char addrTempl[3] = { 127, 238, 0 };
+   unsigned char addr[4];
+   unsigned int idx;
+   struct socket *sock;
+
+   /* Ignore anything outside the 127.238.0.x range. */
+   memcpy(addr, &uaddr, sizeof uaddr);
+   if (memcmp(addrTempl, addr, sizeof addrTempl)) {
+      return;
+   }
+
+   /* Index 0 (network) is never handed out; range-check the rest. */
+   idx = addr[3];
+   if ((idx == 0) || (idx >= PVTCP_OFF_MAX_LB_ADDRS)) {
+      return;
+   }
+
+   CommOS_MutexLock(&globalLock);
+   if (!PvtcpTestLoopbackBit(loopbackAddrs[idx], loopbackReserved)) {
+      CommOS_Debug(("%s: loopback entry [%u] already freed.\n",
+                    __FUNCTION__, idx));
+      goto out;
+   }
+
+   /* Bring the lo:N alias down via a throwaway dgram socket, then free. */
+   if (!sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock)) {
+      struct sockaddr_in sin = {
+         .sin_family = AF_INET,
+         .sin_addr = { .s_addr = uaddr }
+      };
+      struct ifreq ifr = {
+         .ifr_flags = 0
+      };
+
+      snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "lo:%u", idx);
+      memcpy(&ifr.ifr_addr, &sin, sizeof ifr.ifr_addr);
+      kernel_sock_ioctl(sock, SIOCSIFFLAGS, (unsigned long)&ifr);
+      sock_release(sock);
+      loopbackAddrs[idx] = 0; // Zero everything out.
+      CommOS_Debug(("%s: Deallocated loopback address [%u.%u.%u.%u].\n",
+                    __FUNCTION__, addr[0], addr[1], addr[2], addr[3]));
+   } else {
+      CommOS_Log(("%s: Could not delete loopback address!\n",
+                  __FUNCTION__));
+   }
+
+out:
+   CommOS_MutexUnlock(&globalLock);
+}
+
+
+/**
+ * @brief Retrieves and retains the namespace associated with a channel.
+ *        A server must be listening for requests to retrieve the pid of the
+ *        process owning the net namespace for the passed context/vm id.
+ *        Communication takes place over a datagram socket in the AF_UNIX family,
+ *        bound to "/usr/lib/vmware/pvtcp/config/serv_addr".
+ * @param state channel state for which to retrieve the network namespace.
+ * @sideeffect If an associated namespace is found, it is retained and saved
+ *             in the state object.
+ */
+
+static void
+GetNetNamespace(PvtcpState *state)
+{
+#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE)
+   CommTranspInitArgs args;
+   pid_t pidn;
+   struct pid *pid;
+   struct task_struct *tsk;
+   struct nsproxy *nsproxy;
+   struct net *ns;
+   struct socket *sock;
+   struct sockaddr_un addr = {
+      .sun_family = AF_UNIX
+   };
+   /*
+    * NOTE(review): tv_sec = 3000 is a ~50-minute receive timeout;
+    * presumably 3 seconds (3000 ms) was intended -- confirm.
+    */
+   struct timeval timeout = {
+      .tv_sec = 3000,
+      .tv_usec = 0
+   };
+   const int passcred = 1;
+   char buf[64];
+   struct kvec vec;
+   const char *sockname = "pvtcp-vpn"; /* abstract namespace for AF_UNIX/LOCAL sockets */
+   const size_t socknamelen = strlen(sockname);
+
+   /* Abstract-namespace address: leading NUL byte + name, no terminator. */
+   struct msghdr msg = {
+      .msg_name = (struct sockaddr *)&addr,
+      .msg_namelen = 1 + offsetof(struct sockaddr_un, sun_path) + socknamelen
+   };
+
+
+   /*
+    * NOTE(review): this NULL guard is compiled out when CONFIG_NET_NS is
+    * off, yet 'state->namespace' is still written at the bottom -- a NULL
+    * 'state' would crash in that configuration. Verify callers never pass
+    * NULL.
+    */
+   if (!state) {
+      return;
+   }
+
+   args = CommSvc_GetTranspInitArgs(state->channel);
+   ns = NULL;
+   pidn = 0;
+
+   if (sock_create_kern(AF_UNIX, SOCK_DGRAM, 0, &sock)) {
+      CommOS_Debug(("%s: Can't create config socket!\n", __FUNCTION__));
+      goto out;
+   }
+   if (kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                         (char *)&timeout, sizeof timeout)) {
+      sock_release(sock);
+      CommOS_Debug(("%s: Can't set timeout on config socket!\n", __FUNCTION__));
+      goto out;
+   }
+   if (kernel_setsockopt(sock, SOL_SOCKET, SO_PASSCRED,
+                         (char *)&passcred, sizeof passcred)) {
+      sock_release(sock);
+      CommOS_Debug(("%s: Can't set passcred on config socket!\n",
+                    __FUNCTION__));
+      goto out;
+   }
+
+   /*
+    * Send the configuration request and receive the reply:
+    * - the request carries the VM/guest ID as used in the transport
+    *   arguments used to create the channel.
+    * - the reply is expected to contain the pid of the namespace owner.
+    */
+
+   memset(buf, 0, sizeof buf);
+   snprintf(buf, sizeof buf, "%u\n", args.id.d32[0]);
+   buf[sizeof buf - 1] = '\0';
+   vec.iov_base = buf;
+   vec.iov_len = strlen(buf);
+
+   /* use anonymous name */
+   addr.sun_path[0] = 0;
+   memcpy(addr.sun_path+1, sockname, socknamelen);
+
+   if (kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len) <= 0) {
+      sock_release(sock);
+      CommOS_Debug(("%s: Could not send config request for vm [%u]!\n",
+                    __FUNCTION__, args.id.d32[0]));
+      goto out;
+   }
+
+   memset(buf, 0, sizeof buf);
+   vec.iov_base = buf;
+   vec.iov_len = sizeof buf;
+   if (kernel_recvmsg(sock, &msg, &vec, 1, vec.iov_len, 0) <= 0) {
+      CommOS_Debug(("%s: Could not receive config reply for vm [%u]!\n",
+                    __FUNCTION__, args.id.d32[0]));
+   } else {
+      buf[sizeof buf - 1] = '\0';
+      /* coverity[secure_coding] */
+      sscanf(buf, "%d", &pidn);
+   }
+   sock_release(sock);
+
+   if (!pidn) {
+      goto out;
+   }
+
+   /* Resolve the owner pid to its net namespace and take a reference. */
+   pid = find_get_pid(pidn);
+   if (pid) {
+      tsk = pid_task(pid, PIDTYPE_PID);
+      if (tsk) {
+         rcu_read_lock();
+         nsproxy = task_nsproxy(tsk);
+         if (nsproxy && nsproxy->net_ns) {
+            ns = maybe_get_net(nsproxy->net_ns);
+         }
+         rcu_read_unlock();
+      }
+      put_pid(pid);
+   }
+
+out:
+   /* Fall back to the initial namespace when none could be resolved. */
+   if (!ns) {
+      CommOS_Debug(("%s: Not using a namespace for vm [%u].\n",
+                    __FUNCTION__, args.id.d32[0]));
+      ns = &init_net;
+   } else {
+      CommOS_Debug(("%s: Found the net namespace for vm [%u].\n",
+                    __FUNCTION__, args.id.d32[0]));
+   }
+#else
+   void *ns = NULL;
+#endif
+
+   state->namespace = ns;
+}
+
+
+/**
+ * @brief Releases the network namespace associated with a channel state.
+ * @param namespace namespace to be released.
+ * @sideeffect If the namespace is not the initial one, it is released.
+ */
+
+static void
+PutNetNamespace(void *namespace)
+{
+#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE)
+   /* The initial namespace is never reference-counted by us. */
+   if (namespace && (namespace != &init_net)) {
+      put_net((struct net *)namespace);
+   }
+#endif
+}
+
+
+/**
+ * @brief Offload state constructor called when a channel is created.
+ * The function first calls the default state allocator; it then retrieves + * the n/w namespace associated with this client, retains it and stores it + * in the state object. Finally, it creates a sysfs node. + * @param[in,out] channel channel to initialize. + * @return pointer to a new state structure or NULL. + * @sideeffect Allocates memory. + */ + +static void * +StateAlloc(CommChannel channel) +{ + extern struct kset *Mvpkm_FindVMNamedKSet(int, const char *); + PvtcpState *state = NULL; + PvtcpIf *loopbackNetif = NULL; + PvtcpStateKObj *stateKObj = NULL; + struct kset *kset = NULL; + int rc; + CommTranspInitArgs transpArgs; + + transpArgs = CommSvc_GetTranspInitArgs(channel); + + /* + * The transport ID is assigned in an implementation-dependent way. + * (see lib/comm/comm_transp.h for transport type definitions.) + * However, the first 32 bits are expected to denote the guest/VM ID, + * while the last 32 bits are a resource handle within that VM. On MVP, + * transports map to queue pairs, which follow this convention. 
+ */ + + kset = Mvpkm_FindVMNamedKSet((int)transpArgs.id.d32[0], "devices"); + if (!kset) { + CommOS_Debug(("%s: Could not find sysfs '.../vm/N/devices' kset!\n", + __FUNCTION__)); + goto error; + } + + state = PvtcpStateAlloc(channel); + if (!state) { + CommOS_Debug(("%s: Could not allocate state!\n", __FUNCTION__)); + goto error; + } + + /* coverity[leaked_storage] */ + stateKObj = kzalloc(sizeof *stateKObj, GFP_KERNEL); + if (!stateKObj) { + CommOS_Debug(("%s: Could not allocate state kobject!\n", __FUNCTION__)); + goto error; + } + + stateKObj->kobj.kset = kset; + /* coverity[leaked_storage] */ + rc = kobject_init_and_add(&stateKObj->kobj, &stateKType, NULL, "pvtcp"); + if (rc) { + CommOS_Debug(("%s: Could not add state kobject to parent kset [%d]!\n", + __FUNCTION__, rc)); + goto error; + } + + loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4); + BUG_ON(loopbackNetif == NULL); + loopbackNetif->conf.addr.in.s_addr = GetLoopbackAddr(); + if (loopbackNetif->conf.addr.in.s_addr == -1U) { + CommOS_Log(("%s: Could not allocate loopback address!\n", __FUNCTION__)); + goto error; + } + + GetNetNamespace(state); + + stateKObj->transpArgs = transpArgs; + stateKObj->pvsockAddr = loopbackNetif->conf.addr.in.s_addr; +#if defined(CONFIG_NET_NS) + stateKObj->haveNS = (state->namespace != &init_net); + stateKObj->useNS = stateKObj->haveNS; +#endif + state->extra = stateKObj; + + _cred.uid = _cred.gid = _cred.suid = _cred.sgid = + _cred.euid = _cred.egid = _cred.fsuid = _cred.fsgid = Mvpkm_vmwareUid; + + +out: + if (kset) { + kset_put(kset); + } + return state; + +error: + if (stateKObj) { + kobject_del(&stateKObj->kobj); + kobject_put(&stateKObj->kobj); + } + if (loopbackNetif && (loopbackNetif->conf.addr.in.s_addr != -1U)) { + PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr); + } + if (state) { + PvtcpStateFree(state); + state = NULL; + } + goto out; +} + + +/** + * @brief Offload state destructor called when a channel is closed. 
+ * The function releases this client's n/w namespace and then calls the
+ * default state deallocator.
+ * @param arg pointer to state structure.
+ * @sideeffect Destroys all netifs and their sockets, deallocates memory.
+ */
+
+static void
+StateFree(void *arg)
+{
+   PvtcpState *state = arg;
+   PvtcpIf *loopbackNetif;
+   void *namespace;
+
+   if (!state) {
+      return;
+   }
+
+   /* Tear down the sysfs node created by StateAlloc. */
+   if (state->extra) {
+      PvtcpStateKObj *stateKObj = state->extra;
+
+      kobject_del(&stateKObj->kobj);
+      kobject_put(&stateKObj->kobj);
+   }
+
+   /* Save the namespace: PvtcpStateFree() invalidates 'state'. */
+   namespace = state->namespace;
+   loopbackNetif = PvtcpStateFindIf(state, pvtcpIfLoopbackInet4);
+   BUG_ON(loopbackNetif == NULL);
+   PutLoopbackAddr(loopbackNetif->conf.addr.in.s_addr);
+   PvtcpStateFree(state);
+   PutNetNamespace(namespace);
+}
+
+
+/**
+ * @brief Releases socket. This function is called when the channel state
+ *        owning the socket is closed.
+ * @param[in,out] pvsk PV socket to release.
+ * @sideeffect the socket eventually gets deallocated.
+ */
+
+void
+PvtcpReleaseSocket(PvtcpSock *pvsk)
+{
+   struct socket *sock = SkFromPvsk(pvsk)->sk_socket;
+
+   /* Hold both I/O locks so no reader/writer races the release. */
+   SOCK_IN_LOCK(pvsk);
+   SOCK_OUT_LOCK(pvsk);
+   pvsk->peerSockSet = 0;
+   SockReleaseWrapper(sock);
+   SOCK_OUT_UNLOCK(pvsk);
+   SOCK_IN_UNLOCK(pvsk);
+   CommOS_Debug(("%s: [0x%p].\n", __FUNCTION__, pvsk));
+}
+
+
+/**
+ * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1.
+ * @param pvsk socket to test.
+ * @param addr inet4 address to test.
+ * @return > 1: morph and propagate new address to caller, 1: just morph,
+ *         0: don't morph, < 0 (-EADDRNOTAVAIL): bad loopback.
+ */
+
+static inline int
+TestLoopbackInet4(PvtcpSock *pvsk,
+                  unsigned int addr)
+{
+   if (!ipv4_is_loopback(addr)) {
+      return 0;
+   }
+
+   if (addr != htonl(PVTCP_PVSOCK_ADDR)) {
+      /* Only 127.0.0.1 is acceptable among the remaining loopbacks. */
+      if (addr != htonl(INADDR_LOOPBACK)) {
+         return -EADDRNOTAVAIL;
+      }
+      if (PvtcpHasSockNamespace(pvsk)) {
+         /* We don't morph normal 127.0.0.1 when NS present.
+          */
+
+         return 0;
+      }
+      return 2;
+   }
+
+   return 1;
+}
+
+
+/**
+ * @brief Tests if the passed address is 127.238.0.1 or 127.0.0.1 and the
+ *        socket has a namespace. If yes, the address will be morphed into
+ *        the actual loopback address, then a bind() is performed.
+ *        Note that the function returns EADDRNOTAVAIL for any other loopbacks.
+ * @param pvsk socket to test.
+ * @param[in,out] addr inet4 address to test.
+ * @param port port to bind, or zero for any port.
+ * @return 1 if bind should be performed by caller, bind return code otherwise.
+ */
+
+int
+PvtcpTestAndBindLoopbackInet4(PvtcpSock *pvsk,
+                              unsigned int *addr,
+                              unsigned short port)
+{
+   int rc;
+   struct sockaddr_in sin;
+   unsigned int morphedAddr;
+   int propagate = 0;
+
+   rc = TestLoopbackInet4(pvsk, *addr);
+   switch (rc) {
+   case 2:
+      propagate = 1; // Fall through.
+   case 1:
+      break; // Proceed with morphing.
+   case 0:
+      return 1; // Don't morph, let bind() be done by caller.
+   default:
+      return rc;
+   }
+
+   if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) {
+      /* The socket has already been morphed/bound. */
+
+      morphedAddr = pvsk->netif->conf.addr.in.s_addr;
+      rc = 0;
+      goto out;
+   }
+
+   /*
+    * Move the socket to the initial namespace before binding it
+    * such that the loopback address is accessible to the host.
+    */
+
+   PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL);
+   PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk);
+   morphedAddr = pvsk->netif->conf.addr.in.s_addr;
+   memset(&sin, 0, sizeof sin);
+   sin.sin_family = AF_INET;
+   sin.sin_port = port;
+   sin.sin_addr.s_addr = morphedAddr;
+
+   /* Bind to the channel loopback address. */
+
+   rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket,
+                    (struct sockaddr *)&sin, sizeof sin);
+   if (rc) {
+      /* Bind failed: move the socket back to its channel namespace. */
+      PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL);
+      PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk);
+   } else {
+      /*
+       * Bind succeeded on pvsock address.
+       * If this is a pvsock UDP reserved port, record it.
+       */
+
+      /*
+       * 'port' is reused: now a host-order index into the reserved range;
+       * unsigned wrap-around makes the single '<' check cover both bounds.
+       */
+      port = ntohs(port) - portRangeBase;
+      if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) &&
+          (port < portRangeSize)) {
+         CommOS_MutexLock(&globalLock);
+         PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port);
+         CommOS_MutexUnlock(&globalLock);
+      }
+
+      /*
+       * pvsock data usage shouldn't be counted as MVP external traffic.
+       */
+      SkFromPvsk(pvsk)->sk_socket->file = NULL;
+   }
+
+out:
+   if (propagate) {
+      *addr = morphedAddr;
+   }
+   return rc;
+}
+
+
+/**
+ * @brief Tests if the passed address is IPV4-mapped 127.238.0.1 or 127.0.0.1,
+ *        clean ::1, and whether the socket has a namespace.
+ *        If needed, the address will be morphed into the actual loopback address,
+ *        then a bind() is performed.
+ *        Note that the function returns EADDRNOTAVAIL for any other loopbacks.
+ * @param pvsk socket to test.
+ * @param[in,out] addr0 first 64 bits of inet6 address to test.
+ * @param[in,out] addr1 last 64 bits of inet6 address to test.
+ * @param port port to bind, or zero for any port.
+ * @return 1 if bind should be performed by caller, bind return code otherwise.
+ */
+
+int
+PvtcpTestAndBindLoopbackInet6(PvtcpSock *pvsk,
+                              unsigned long long *addr0,
+                              unsigned long long *addr1,
+                              unsigned short port)
+{
+   int rc;
+   struct sockaddr_in6 sin6;
+   union {
+      unsigned long long halves[2];
+      struct in6_addr in6;
+   } in6Addr = {
+      .halves = { *addr0, *addr1 }
+   };
+   int propagate = 0;
+   const int ipv6Only = 0;
+
+   if (ipv6_addr_loopback(&in6Addr.in6)) {
+      if (PvtcpHasSockNamespace(pvsk)) {
+         return 1;
+      }
+
+      /* Remember that we were passed '::1'. */
+
+      PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP, 1);
+      ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &in6Addr.in6);
+   }
+
+   if (!ipv6_addr_v4mapped(&in6Addr.in6)) {
+      /* If the address is not ipv4-mapped, stop testing. */
+
+      return 1;
+   }
+
+   /* Delegate to the IPv4 policy on the mapped low 32 bits. */
+   rc = TestLoopbackInet4(pvsk, in6Addr.in6.s6_addr32[3]);
+   switch (rc) {
+   case 2:
+      propagate = 1; // Fall through.
+   case 1:
+      break; // Proceed with morphing.
+   case 0:
+      return 1; // Don't morph, let bind() be done by caller.
+   default:
+      return rc;
+   }
+
+   if (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) {
+      /* The socket has already been morphed/bound. */
+
+      ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6);
+      rc = 0;
+      goto out;
+   }
+
+   /*
+    * Move the socket to the initial namespace before binding it
+    * such that the loopback address is accessible to the host.
+    */
+
+   PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL);
+   PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk);
+   ipv6_addr_set_v4mapped(pvsk->netif->conf.addr.in.s_addr, &in6Addr.in6);
+   memset(&sin6, 0, sizeof sin6);
+   sin6.sin6_family = AF_INET6;
+   sin6.sin6_port = port;
+   sin6.sin6_addr = in6Addr.in6;
+
+   /*
+    * Ensure we can use ipv4 mapped addresses and bind to the channel
+    * loopback address.
+    */
+
+   (void)kernel_setsockopt(SkFromPvsk(pvsk)->sk_socket, IPPROTO_IPV6,
+                           IPV6_V6ONLY, (char *)&ipv6Only, sizeof ipv6Only);
+   rc = kernel_bind(SkFromPvsk(pvsk)->sk_socket,
+                    (struct sockaddr *)&sin6, sizeof sin6);
+   if (rc) {
+      /* Bind failed: move the socket back to its channel namespace. */
+      PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL);
+      PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk);
+   } else {
+      /*
+       * Bind succeeded on pvsock address.
+       * If this is a pvsock UDP reserved port, record it.
+       */
+
+      /* 'port' reused as host-order index; see the IPv4 variant. */
+      port = ntohs(port) - portRangeBase;
+      if ((SkFromPvsk(pvsk)->sk_socket->type == SOCK_DGRAM) &&
+          (port < portRangeSize)) {
+         CommOS_MutexLock(&globalLock);
+         PvtcpSetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port);
+         CommOS_MutexUnlock(&globalLock);
+      }
+
+      /*
+       * pvsock data usage shouldn't be counted as MVP external traffic.
+       */
+      SkFromPvsk(pvsk)->sk_socket->file = NULL;
+   }
+
+out:
+   if (propagate) {
+      *addr0 = in6Addr.halves[0];
+      *addr1 = in6Addr.halves[1];
+   }
+   return rc;
+}
+
+
+/**
+ * @brief Resets a 127.238.0.N address to 127.0.0.1.
+ * @param pvsk socket whose address needs resetting.
+ * @param[in,out] addr inet4 address to reset.
+ */
+
+void
+PvtcpResetLoopbackInet4(PvtcpSock *pvsk,
+                        unsigned int *addr)
+{
+   if (!PvtcpHasSockNamespace(pvsk)) {
+      static const unsigned int pvsockAddr = htonl(PVTCP_PVSOCK_ADDR);
+
+      /*
+       * First memcmp (3 bytes) matches the 127.238.0 prefix; second
+       * (4 bytes) excludes the host's own 127.238.0.1.
+       */
+      if (!memcmp(&pvsockAddr, addr, 3) && memcmp(&pvsockAddr, addr, 4)) {
+         /* If it's a pvsock address but _not_ the host's, overwrite it. */
+
+         *addr = htonl(INADDR_LOOPBACK);
+      }
+   }
+}
+
+
+/**
+ * @brief Resets an IPV4-mapped ::ffff:127.238.0.N IPV6 address to loopback.
+ * @param pvsk socket whose address needs resetting.
+ * @param[in,out] in6 inet6 address to reset.
+ */
+
+void
+PvtcpResetLoopbackInet6(PvtcpSock *pvsk,
+                        struct in6_addr *in6)
+{
+   if (!PvtcpHasSockNamespace(pvsk) && ipv6_addr_v4mapped(in6)) {
+      if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_IPV6_LOOP)) {
+         /* If the original address came in as ::1, we reset as such. */
+
+         static const struct in6_addr in6Loopback = IN6ADDR_LOOPBACK_INIT;
+
+         *in6 = in6Loopback;
+      } else {
+         /* Otherwise reset only the mapped IPv4 low 32 bits. */
+         PvtcpResetLoopbackInet4(pvsk, &in6->s6_addr32[3]);
+      }
+   }
+}
+
+
+/**
+ * @brief Called at module load time. It registers with the Comm runtime.
+ * @param args initialization arguments
+ * @return zero if successful, -1 otherwise
+ * @sideeffect Leaves the module loaded
+ */
+
+static int
+Init(void *args)
+{
+   int rc = -1;
+
+#if !defined(PVTCP_DISABLE_NETFILTER)
+   rc = nf_register_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks));
+   if (rc) {
+      CommOS_Log(("%s: Could not register netfilter hooks!\n", __FUNCTION__));
+      goto out;
+   } else {
+      CommOS_Debug(("%s: Registered netfilter hooks.\n", __FUNCTION__));
+   }
+   hooksRegistered = 1;
+#else
+   /* NOTE: with PVTCP_DISABLE_NETFILTER the 'out' label below is unused. */
+   CommOS_Log(("%s: Netfilter hooks disabled.\n", __FUNCTION__));
+#endif
+
+   CommOS_MutexInit(&globalLock);
+   CommOS_WriteAtomic(&PvtcpOutputAIOSection, 0);
+   PvtcpOffLargeDgramBufInit();
+
+   /* Register the offload implementation with the Comm runtime. */
+   pvtcpImpl.owner = CommOS_ModuleSelf();
+   pvtcpImpl.stateCtor = StateAlloc;
+   pvtcpImpl.stateDtor = StateFree;
+   if (CommSvc_RegisterImpl(&pvtcpImpl) == 0) {
+      rc = 0;
+      pvtcpLoopbackOffAddr = GetLoopbackAddr();
+      if (pvtcpLoopbackOffAddr == -1U) {
+         CommOS_Log(("%s: Could not allocate offload loopback address!\n",
+                     __FUNCTION__));
+         rc = -1;
+         CommSvc_UnregisterImpl(&pvtcpImpl);
+      }
+   }
+
+out:
+   /* Roll back the netfilter registration on any failure. */
+   if (rc) {
+      if (hooksRegistered) {
+         nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks));
+      }
+   }
+   return rc;
+}
+
+
+/**
+ * @brief Called at module unload time. It shuts down pvtcp.
+ * @sideeffect Total and utter destruction.
+ */
+
+static void
+Exit(void)
+{
+   PutLoopbackAddr(pvtcpLoopbackOffAddr);
+   CommSvc_UnregisterImpl(&pvtcpImpl);
+#if !defined(PVTCP_DISABLE_NETFILTER)
+   if (hooksRegistered) {
+      nf_unregister_hooks(netfilterHooks, ARRAY_SIZE(netfilterHooks));
+      CommOS_Debug(("%s: Netfilter hooks unregistered.\n", __FUNCTION__));
+   }
+#endif
+   CommOS_Log(("%s: Allocations of large datagrams: %llu.\n",
+               __FUNCTION__, pvtcpOffDgramAllocations));
+}
+
+
+/*
+ * Socket callback interceptors.
+ */
+
+/**
+ * @brief Callback called when socket is destroyed.
+ * @param[in,out] sk socket to cleanup
+ * @return 0 if socket memory is freed, < 0 otherwise (no-op)
+ * @sideeffect Send queue buffers are deallocated
+ */
+
+int
+DestructCB(struct sock *sk)
+{
+   PvtcpOffBuf *internalBuf;
+   PvtcpOffBuf *tmp;
+   PvtcpSock *pvsk = PvskFromSk(sk);
+
+   if (!pvsk ||
+       (SkFromPvsk(pvsk) != sk) ||
+       (pvsk->destruct == asmDestructorShim)) {
+      /* Module put _not_ to be performed by asmDestructorShim. */
+
+      CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__));
+      return -1;
+   }
+
+   /* Drain and free any queued send buffers. */
+   CommOS_ListForEachSafe(&pvsk->queue, internalBuf, tmp, link) {
+      CommOS_ListDel(&internalBuf->link);
+      PvtcpBufFree(PvtcpOffBufFromInternal(internalBuf));
+   }
+   /* Chain to the original destructor saved in SockAllocInit(). */
+   if (pvsk->destruct) {
+      pvsk->destruct(sk);
+   }
+
+   if (pvsk->rpcReply) {
+      CommOS_Kfree(pvsk->rpcReply);
+   }
+   CommOS_Kfree(pvsk);
+
+   /*
+    * Module put is performed by asmDestructorShim.
+    */
+
+   return 0;
+}
+
+
+/**
+ * @brief Callback called when socket state changes occur.
+ * @param sk socket specified socket which changed state
+ * @sideeffect A writer task may be scheduled
+ */
+
+static void
+StateChangeCB(struct sock *sk)
+{
+   PvtcpSock *pvsk = PvskFromSk(sk);
+
+   /* Guard against re-entry: our own callback must not already be installed. */
+   if (!pvsk ||
+       (SkFromPvsk(pvsk) != sk) ||
+       (pvsk->stateChange == StateChangeCB)) {
+      CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__));
+      return;
+   }
+
+   /*
+    * The socket (spin) lock is held when this function is called.
+    */
+
+   CommOS_Debug(("%s: [0x%p] sk_state [%u] sk_err [%d] sk_err_soft [%d].\n",
+                 __FUNCTION__, pvsk, sk->sk_state,
+                 sk->sk_err, sk->sk_err_soft));
+   if (pvsk->stateChange) {
+      pvsk->stateChange(sk);
+   }
+   if (sk->sk_state == TCP_ESTABLISHED) {
+      PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT);
+   }
+   PvtcpSchedSock(pvsk);
+}
+
+
+/**
+ * @brief Callback called when an error is set on the socket.
+ * @param sk socket the error happened on
+ * @sideeffect A writer task may be scheduled
+ */
+
+static void
+ErrorReportCB(struct sock *sk)
+{
+   PvtcpSock *pvsk = PvskFromSk(sk);
+
+   if (!pvsk ||
+       (SkFromPvsk(pvsk) != sk) ||
+       (pvsk->errorReport == ErrorReportCB)) {
+      CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored\n", __FUNCTION__));
+      return;
+   }
+
+   /*
+    * The socket (spin) lock is held when this function is called.
+    * Interesting sk_err-s:
+    *   ECONNRESET - tcp_disconnect(), tcp_reset()
+    *   ECONNREFUSED - tcp_reset()
+    *   EPIPE - tcp_reset()
+    *   ETIMEDOUT - tcp_write_error()
+    *   EHOSTUNREACH, etc. - tcp_v4_error()??, icmp errors
+    *   etc. - __udp4_lib_err(), icmp errors
+    */
+
+   CommOS_Debug(("%s: [0x%p] sk_err [%d] sk_err_soft [%d].\n",
+                 __FUNCTION__, pvsk, sk->sk_err, sk->sk_err_soft));
+   if (pvsk->errorReport) {
+      pvsk->errorReport(sk);
+   }
+   /* Latch the error so the writer task can report it to the guest. */
+   pvsk->err = sk->sk_err;
+   PvtcpSchedSock(pvsk);
+}
+
+
+/**
+ * @brief Callback called when data is available to be read from a socket.
+ * @param sk socket in question
+ * @param bytes number of bytes to read
+ * @sideeffect A writer task is scheduled _iff_ the peer can safely
+ *             receive.
+ */
+
+static void
+DataReadyCB(struct sock *sk,
+            int bytes)
+{
+   PvtcpSock *pvsk = PvskFromSk(sk);
+
+   if (!pvsk ||
+       (SkFromPvsk(pvsk) != sk) ||
+       (pvsk->dataReady == DataReadyCB)) {
+      CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__));
+      return;
+   }
+
+   /*
+    * The socket (spin) lock is held when this function is called.
+    */
+
+   if (pvsk->dataReady) {
+      pvsk->dataReady(sk, bytes);
+   }
+   if (sk->sk_state == TCP_LISTEN) {
+      CommOS_Debug(("%s: Listen socket ready to accept [0x%p].\n",
+                    __FUNCTION__, pvsk));
+   }
+   PvtcpSchedSock(pvsk);
+}
+
+
+/**
+ * @brief Callback called when writing is possible on a socket.
+ * @param sk socket in question
+ * @sideeffect An AIO thread is scheduled.
 */

static void
WriteSpaceCB(struct sock *sk)
{
   PvtcpSock *pvsk = PvskFromSk(sk);

   /* Guard against a stale back-pointer and against self-chaining. */
   if (!pvsk ||
       (SkFromPvsk(pvsk) != sk) ||
       (pvsk->writeSpace == WriteSpaceCB)) {
      CommOS_Debug(("%s: pvsk / sk inconsistency. Ignored.\n", __FUNCTION__));
      return;
   }

   /*
    * The socket (spin) lock is held when this function is called.
    */

   if (pvsk->writeSpace) {
      pvsk->writeSpace(sk);
   }
   PvtcpSchedSock(pvsk);
}


/**
 * @brief Initializes a newly created socket for offload operations:
 *    allocates and links the PvtcpSock, registers it with the channel
 *    state and installs the pvtcp socket callbacks.
 * @param[in,out] sock socket to initialize
 * @param channel communication channel the socket is associated with
 * @param peerSock peer PV socket of this socket
 * @param parentPvsk parent of this socket or NULL
 * @return zero on success, error code otherwise
 */

static int
SockAllocInit(struct socket *sock,
              CommChannel channel,
              unsigned long long peerSock,
              PvtcpSock *parentPvsk)
{
   struct sock *sk;
   PvtcpSock *pvsk;
   int sndBuf = PVTCP_SOCK_RCVSIZE * 4;

   if (!sock || !channel || !peerSock) {
      return -EINVAL;
   }

   sk = sock->sk;
   sk->sk_user_data = NULL;

   pvsk = CommOS_Kmalloc(sizeof *pvsk);
   if (!pvsk) {
      return -ENOMEM;
   }

   if (PvtcpOffSockInit(pvsk, channel)) {
      CommOS_Kfree(pvsk);
      return -ENOMEM;
   }

   /*
    * PVTCP sockets should be billed against the vmware uid.
    */
   sk->sk_socket->file = &_file;

   /* Set peer (pv) socket. */
   pvsk->peerSock = peerSock;
   pvsk->peerSockSet = 1;

   /* Set up back pointer. */
   pvsk->sk = sk;

   /* Keep track of new socket. */
   if (PvtcpStateAddSocket(channel, pvtcpIfUnbound, pvsk) != 0) {
      CommOS_Kfree(pvsk);
      return -ENOMEM;
   }

   /*
    * Keep pvtcp around for at least the lifetime of this socket
    */
   CommOS_ModuleGet(pvtcpImpl.owner);

   if (!parentPvsk) {
      /* Save the kernel's callbacks, then install the pvtcp shims. */
      pvsk->destruct = sk->sk_destruct;
      sk->sk_destruct = asmDestructorShim;
      pvsk->stateChange = sk->sk_state_change;
      sk->sk_state_change = StateChangeCB;
      pvsk->errorReport = sk->sk_error_report;
      sk->sk_error_report = ErrorReportCB;
      pvsk->dataReady = sk->sk_data_ready;
      sk->sk_data_ready = DataReadyCB;
      pvsk->writeSpace = sk->sk_write_space;
      sk->sk_write_space = WriteSpaceCB;
   } else {
      /*
       * Copy the parent's saved callbacks. The parent pvsk is only passed
       * when creating/initializing a socket after an 'accept'.
       */

      pvsk->destruct = parentPvsk->destruct;
      sk->sk_destruct = asmDestructorShim;
      pvsk->stateChange = parentPvsk->stateChange;
      sk->sk_state_change = StateChangeCB;
      pvsk->errorReport = parentPvsk->errorReport;
      sk->sk_error_report = ErrorReportCB;
      pvsk->dataReady = parentPvsk->dataReady;
      sk->sk_data_ready = DataReadyCB;
      pvsk->writeSpace = parentPvsk->writeSpace;
      sk->sk_write_space = WriteSpaceCB;

      if (parentPvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4) {
         /* The parent socket was morphed/bound. */

         PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL);
         PvtcpStateAddSocket(pvsk->channel, pvtcpIfLoopbackInet4, pvsk);
      }
   }

   /* Install forward socket reference. */
   sk->sk_user_data = pvsk;

   /*
    * Force the send buffer size high enough, such that we don't lose the
    * just-a-bit-over-the-limit bytes. This is mainly needed for datagrams.
    * Note that we always apply flow control between host and guest modules,
    * according to the sizing model; so this is not artificially inflated.
    */

   kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUFFORCE,
                     (void *)&sndBuf, sizeof sndBuf);

   return 0;
}


/**
 * @brief Allocates a pvsk socket for error reporting (create operation).
 * @param err error code to report to PV side
 * @param channel channel error socket belongs to
 * @param peerSock peer PV socket of this socket
 * @return error socket on success, NULL otherwise
 */

static PvtcpSock *
SockAllocErrInit(int err,
                 CommChannel channel,
                 unsigned long long peerSock)
{
   PvtcpSock *pvsk;

   if (!channel || !peerSock) {
      return NULL;
   }

   pvsk = CommOS_Kmalloc(sizeof *pvsk);
   if (!pvsk) {
      return NULL;
   }

   if (PvtcpOffSockInit(pvsk, channel)) {
      CommOS_Kfree(pvsk);
      return NULL;
   }

   /* Set peer (pv) socket and error. */
   pvsk->peerSock = peerSock;
   pvsk->peerSockSet = 1;
   pvsk->err = err;

   /* Set up back pointer to NULL such that PvtcpPutSock deallocates it. */
   pvsk->sk = NULL;
   return pvsk;
}


/*
 * Offload operations.
 */

/**
 * @brief Creates an offload socket and schedules it for reply.
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back.
 */

void
PvtcpCreateOp(CommChannel channel,
              void *upperLayerState,
              CommPacket *packet,
              struct kvec *vec,
              unsigned int vecLen)
{
   int rc;
   struct socket *sock;
   PvtcpSock *pvsk;
   PvtcpState *state = (PvtcpState *)upperLayerState;
   const int enable = 1;

   PVTCP_UNLOCK_DISP_DISCARD_VEC();

#if defined(PVTCP_IPV6_DISABLE)
   if (packet->data16 == AF_INET6) {
      CommOS_Debug(("%s: AF_INET6 support is disabled.\n", __FUNCTION__));
      rc = -EAFNOSUPPORT;
   } else
#endif
   {
      /* data16/data32/data32ex carry family, type and protocol. */
      rc = sock_create_kern(packet->data16, packet->data32,
                            packet->data32ex, &sock);
   }

   if (!rc) {
      rc = SockAllocInit(sock, channel, packet->data64, NULL);
      if (rc) {
         SockReleaseWrapper(sock);
         goto fail;
      }
      kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                        (void *)&enable, sizeof enable);
      pvsk = PvskFromSk(sock->sk);
      if (state->extra &&
          ((PvtcpStateKObj *)(state->extra))->useNS) {
         PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_CHANNEL);
      } else {
         PvtcpSwitchSock(pvsk, PVTCP_SOCK_NAMESPACE_INITIAL);
      }
      PvtcpStateAddSocket(pvsk->channel, pvtcpIfUnbound, pvsk);
      PvskSetOpFlag(pvsk, PVTCP_OP_CREATE);
   } else {
      CommOS_Debug(("%s: Error creating offload socket: %d\n",
                    __FUNCTION__, rc));
      /*
       * Pass -rc so we follow error conventions for other reply ops.
       * The error code is fixed by the PV side so error codes are properly
       * reported.
       */
      pvsk = SockAllocErrInit(-rc, channel, packet->data64);
      if (!pvsk) {
         goto fail;
      }
   }

   /* Either the real socket or the error socket gets a reply scheduled. */
   PvtcpSchedSock(pvsk);
   return;

fail:
   CommOS_Log(("%s: BOOG ** FAILED TO CREATE OFFLOAD SOCKET [%d] "
               "_AND_ ERROR REPORTING SOCKET!\n"
               "   PV SIDE MAY BE LOCKED UP UNTIL CREATE RPC TIMES OUT!",
               __FUNCTION__, rc));
}


/**
 * @brief Schedules an offload socket to be removed.
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back and
 *    then release the socket.
 */

void
PvtcpReleaseOp(CommChannel channel,
               void *upperLayerState,
               CommPacket *packet,
               struct kvec *vec,
               unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);

   /*
    * Check if this is a pvsock datagram socket bound on a reserved port.
    * If so, reset the bit such that filtering drops rogue packets.
    */

   if ((sk->sk_socket->type == SOCK_DGRAM) &&
       (pvsk->netif->conf.family == PVTCP_PF_LOOPBACK_INET4)) {
      unsigned short port = 0;

      if (sk->sk_family == AF_INET) {
         struct sockaddr_in sin = { .sin_family = AF_INET };
         int addrLen = sizeof sin;

         if (!kernel_getsockname(sk->sk_socket,
                                 (struct sockaddr *)&sin, &addrLen)) {
            port = sin.sin_port;
         }
      } else { /* AF_INET6 */
         struct sockaddr_in6 sin = { .sin6_family = AF_INET6 };
         int addrLen = sizeof sin;

         if (!kernel_getsockname(sk->sk_socket,
                                 (struct sockaddr *)&sin, &addrLen)) {
            port = sin.sin6_port;
         }
      }

      /* Unsigned arithmetic: out-of-range ports wrap and fail the test. */
      port = ntohs(port) - portRangeBase;
      if (port < portRangeSize) {
         CommOS_MutexLock(&globalLock);
         PvtcpResetPortIndexBit(pvsk->netif->conf.addr.in.s_addr, port);
         CommOS_MutexUnlock(&globalLock);
      }
   }

   /*
    * - hold the socket before setting the 'release' flag and until after
    * the call to PvtcpSchedSock(): if the socket had already been scheduled
    * ReleaseAIO may run, find the flag set and release this socket while
    * it's being unlocked here.
    *
    * - hold the dispatch lock until done to ensure that subsequent Ops for
    * this socket see peerSockSet == 0.
    */

   PvtcpHoldSock(pvsk);
   SOCK_STATE_LOCK(pvsk);
   pvsk->peerSockSet = 0;
   SOCK_STATE_UNLOCK(pvsk);
   PvskSetOpFlag(pvsk, PVTCP_OP_RELEASE);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();
}


/**
 * @brief Binds an offload socket to a given address
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back
 */

void
PvtcpBindOp(CommChannel channel,
            void *upperLayerState,
            CommPacket *packet,
            struct kvec *vec,
            unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   struct sockaddr *addr;
   struct sockaddr_in sin;
   struct sockaddr_in6 sin6;
   int reuseAddr;
   int addrLen;
   int rc;

   PvtcpHoldSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();

   /*
    * The socket-level option SO_REUSEADDR is set in the common socket code,
    * meaning that we cannot intercept it in the guest pvtcp implementation.
    * In order to respect the setting, the guest would pass the current
    * setting in 'bind' requests.
    * If the guest requires 'reuse address' setting, the value is incremented
    * such that we differentiate between: 0) not requested, 1) 'false' and
    * 2) 'true'.
    */

   reuseAddr = COMM_OPF_GET_VAL(packet->flags);
   if ((reuseAddr == 1) || (reuseAddr == 2)) {
      /* Explicit request, so decrement the value. */

      reuseAddr--;
      kernel_setsockopt(sk->sk_socket, SOL_SOCKET, SO_REUSEADDR,
                        (void *)&reuseAddr, sizeof reuseAddr);
   }

   if (sk->sk_family == AF_INET) {
      memset(&sin, 0, sizeof sin);
      sin.sin_family = AF_INET;
      sin.sin_port = packet->data16;
      sin.sin_addr.s_addr = (unsigned int)packet->data64ex;
      addr = (struct sockaddr *)&sin;
      addrLen = sizeof sin;

      rc = PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr,
                                         sin.sin_port);
      if (rc <= 0) {
         /* Bind has already happened. */

         pvsk->err = -rc;
         goto out;
      }
   } else { /* AF_INET6 */
      memset(&sin6, 0, sizeof sin6);
      sin6.sin6_family = AF_INET6;
      sin6.sin6_port = packet->data16;
      addr = (struct sockaddr *)&sin6;
      addrLen = sizeof sin6;

      rc = PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex,
                                         &packet->data64ex2, sin6.sin6_port);
      if (rc <= 0) {
         /* Bind has already happened. */

         pvsk->err = -rc;
         goto out;
      }
      /* The 128-bit address travels packed in the two 64-bit fields. */
      PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0],
                        packet->data64ex, packet->data64ex2);
   }

   /* coverity[check_return] */
   pvsk->err = -kernel_bind(sk->sk_socket, addr, addrLen);

out:
   PvskSetOpFlag(pvsk, PVTCP_OP_BIND);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Sets a socket option.
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back
 */
void
PvtcpSetSockOptOp(CommChannel channel,
                  void *upperLayerState,
                  CommPacket *packet,
                  struct kvec *vec,
                  unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   /* Option payload length: bytes following the fixed packet header. */
   unsigned int optlen = packet->len - sizeof *packet;

   PvtcpHoldSock(pvsk);

   /* Exactly one payload vector matching the advertised option length. */
   if ((vecLen != 1) || (vec[0].iov_len != optlen) || (optlen < sizeof(int))) {
      pvsk->rpcStatus = -EINVAL;
      goto out;
   }

   if (packet->data32 == SOL_TCP) {
      /*
       * The back-end implementation must always run in 'nodelay' mode.
       * Consequently, we ignore, but we cache the TCP_NODELAY and TCP_CORK
       * settings such that getsockopt() can return them as they were 'set'.
       * Applications use these settings for performance; pvtcp does quite
       * well if it's not interfered with.
       */

      int on;

      switch (packet->data32ex) {
      case TCP_NODELAY:
         memcpy(&on, vec[0].iov_base, sizeof on);
         PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY, on);
         pvsk->rpcStatus = 0;
         goto out;
      case TCP_CORK:
         memcpy(&on, vec[0].iov_base, sizeof on);
         PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK, on);
         pvsk->rpcStatus = 0;
         goto out;
      }
   }

   pvsk->rpcStatus = kernel_setsockopt(sock,
                                       packet->data32,
                                       packet->data32ex,
                                       vec[0].iov_base,
                                       optlen);

out:
   PVTCP_UNLOCK_DISP_DISCARD_VEC();
   PvskSetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Retrieves a socket option.
+ * @param channel communication channel with offloader + * @param upperLayerState state associated with this channel + * @param packet first packet received in reply + * @param vec payload buffer descriptors + * @param vecLen payload buffer descriptor count + * @sideeffect A writer task is scheduled, which will send reply back + */ +void +PvtcpGetSockOptOp(CommChannel channel, + void *upperLayerState, + CommPacket *packet, + struct kvec *vec, + unsigned int vecLen) +{ + PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState); + struct sock *sk = SkFromPvsk(pvsk); + struct socket *sock = sk->sk_socket; + unsigned int optLen = (unsigned int)(packet->data64ex); + char *optBuf; + int rc = 0; + + PvtcpHoldSock(pvsk); + + if ((optLen < sizeof(int)) || (optLen > PVTCP_SOCK_SAFE_RCVSIZE)) { + pvsk->rpcStatus = -EINVAL; + goto out; + } + + optBuf = CommOS_Kmalloc(optLen); + if (!optBuf) { + pvsk->rpcStatus = -EINVAL; + goto out; + } + + if (packet->data32 == SOL_TCP) { + /* + * See comment in PvtcpSetSockOptOp() regarding special treatment for + * the TCP_NODELAY and TCP_CORK settings. + */ + + int on; + + switch (packet->data32ex) { + case TCP_NODELAY: + on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_NODELAY); + optLen = sizeof on; + memcpy(optBuf, &on, optLen); + goto done; + case TCP_CORK: + on = PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_TCP_CORK); + optLen = sizeof on; + memcpy(optBuf, &on, optLen); + goto done; + } + } + + rc = kernel_getsockopt(sock, packet->data32, + packet->data32ex, optBuf, &optLen); + +done: + if (!rc) { + pvsk->rpcReply = optBuf; + CommOS_MemBarrier(); + pvsk->rpcStatus = (int)optLen; + } else { + CommOS_Kfree(optBuf); + pvsk->rpcStatus = rc; + } + +out: + PVTCP_UNLOCK_DISP_DISCARD_VEC(); + PvskSetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT); + PvtcpSchedSock(pvsk); + PvtcpPutSock(pvsk); +} + + +/** + * @brief Performs ioctl on offload socket. 
 * @param channel communication channel with offloader
 * @param state state associated with this channel
 * @param packet packet header received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 */

void
PvtcpIoctlOp(CommChannel channel,
             void *state,
             CommPacket *packet,
             struct kvec *vec,
             unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, state);
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;

   PvtcpHoldSock(pvsk);

   /* Not implemented yet. */

   (void)sock;
   pvsk->rpcStatus = -ENOIOCTLCMD;

   PVTCP_UNLOCK_DISP_DISCARD_VEC();
   PvskSetOpFlag(pvsk, PVTCP_OP_IOCTL);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Marks a socket for listening to incoming connections
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back
 */

void
PvtcpListenOp(CommChannel channel,
              void *upperLayerState,
              CommPacket *packet,
              struct kvec *vec,
              unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   int backlog = (int)packet->data32;

   PvtcpHoldSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();

   pvsk->err = -kernel_listen(sk->sk_socket, backlog);
   PvskSetOpFlag(pvsk, PVTCP_OP_LISTEN);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Accepts a connected socket
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back.
 */

void
PvtcpAcceptOp(CommChannel channel,
              void *upperLayerState,
              CommPacket *packet,
              struct kvec *vec,
              unsigned int vecLen)
{
   int rc;
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *newsock = NULL;

   PvtcpHoldSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();

   /* Non-blocking accept: DataReadyCB scheduled us because one is pending. */
   rc = kernel_accept(sk->sk_socket, &newsock, O_NONBLOCK);
   if (rc == 0) {
      rc = SockAllocInit(newsock, channel, packet->data64ex, pvsk);
      if (rc) {
         SockReleaseWrapper(newsock);
      }
   }

   if (rc == 0) {
      struct sock *newsk = newsock->sk;
      PvtcpSock *newpvsk = PvskFromSk(newsk);

      /* We temporarily use the state field to cache parent socket. */

      newpvsk->state = (PvtcpState *)pvsk;
      PvskSetOpFlag(newpvsk, PVTCP_OP_ACCEPT);
      PvtcpSchedSock(newpvsk);
   } else {
      /* On failure, the accept reply is sent on the listening socket. */
      pvsk->err = -rc;
      PvskSetOpFlag(pvsk, PVTCP_OP_ACCEPT);
      PvtcpSchedSock(pvsk);
   }

   PvtcpPutSock(pvsk);
}


/**
 * @brief Connects an offload socket to given address
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect A writer task is scheduled, which will send reply back
 */

void
PvtcpConnectOp(CommChannel channel,
               void *upperLayerState,
               CommPacket *packet,
               struct kvec *vec,
               unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   struct sock *sk = SkFromPvsk(pvsk);
   struct sockaddr *addr;
   struct sockaddr_in sin;
   struct sockaddr_in6 sin6;
   int addrLen;
   int flags = 0;
   int rc = 0;
   int disconnect = 0;

   PvtcpHoldSock(pvsk);
   PVTCP_UNLOCK_DISP_DISCARD_VEC();

   if (sk->sk_family == AF_INET) {
      addr = (struct sockaddr *)&sin;
      addrLen = sizeof sin;
      memset(&sin, 0, sizeof sin);
      sin.sin_port = packet->data16;
      sin.sin_addr.s_addr = (unsigned int)packet->data64ex;
      if (COMM_OPF_GET_VAL(packet->flags)) {
         /* Non-zero flag value: this is a disconnect (AF_UNSPEC connect). */
         sin.sin_family = AF_UNSPEC;
         disconnect = 1;
         goto connect;
      }
      sin.sin_family = AF_INET;
      PvtcpTestAndBindLoopbackInet4(pvsk, &sin.sin_addr.s_addr, 0);
   } else { /* AF_INET6 */
      addr = (struct sockaddr *)&sin6;
      addrLen = sizeof sin6;
      memset(&sin6, 0, sizeof sin6);
      sin6.sin6_port = packet->data16;
      if (COMM_OPF_GET_VAL(packet->flags)) {
         sin6.sin6_family = AF_UNSPEC;
         PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0],
                           packet->data64ex, packet->data64ex2);
         disconnect = 1;
         goto connect;
      }
      sin6.sin6_family = AF_INET6;
      PvtcpTestAndBindLoopbackInet6(pvsk, &packet->data64ex,
                                    &packet->data64ex2, 0);
      PvtcpI6AddrUnpack(&sin6.sin6_addr.s6_addr32[0],
                        packet->data64ex, packet->data64ex2);
   }

connect:
   rc = kernel_connect(sk->sk_socket, addr, addrLen, flags | O_NONBLOCK);

   /*
    * For datagram sockets, ErrorReportCB is not called, so we need to
    * explicitly set the pvsk error to be returned back to the guest.
    * This should not be used on SOCK_STREAM sockets. You have been
    * warned.
    */

   if (rc && (sk->sk_socket->type == SOCK_DGRAM)) {
      pvsk->err = -rc;
   }

   /*
    * Quite likely, stream actual connect requests will set err to EINPROGRESS.
    * That's fine, error_report will trigger an AIO/flow-op reply. When the
    * connection is established, state_change schedules an AIO/connect reply.
    * Record whether the request was a disconnect.
    */

   PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, disconnect);
   PvskSetOpFlag(pvsk, PVTCP_OP_CONNECT);
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/**
 * @brief Initiates socket shutdown on an offload socket
 * @param channel communication channel with offloader
 * @param upperLayerState state associated with this channel
 * @param packet first packet received in reply
 * @param vec payload buffer descriptors
 * @param vecLen payload buffer descriptor count
 * @sideeffect Socket queue will be drained and socket shutdown performed.
 */

void
PvtcpShutdownOp(CommChannel channel,
                void *upperLayerState,
                CommPacket *packet,
                struct kvec *vec,
                unsigned int vecLen)
{
   PvtcpSock *pvsk = PvtcpGetPvskOrReturn(packet->data64, upperLayerState);
   int how = (int)packet->data32;

   PvtcpHoldSock(pvsk);
   if ((how == SHUT_RD) || (how == SHUT_RDWR)) {
      /* Read side can be shut immediately; nothing queued depends on it. */
      kernel_sock_shutdown(SkFromPvsk(pvsk)->sk_socket, SHUT_RD);
      PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_RD, 1);
   }
   if ((how == SHUT_WR) || (how == SHUT_RDWR)) {
      /* Write side is only flagged: the AIO thread drains the queue first. */
      PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_SHUT_WR, 1);
   }
   PVTCP_UNLOCK_DISP_DISCARD_VEC();
   PvtcpSchedSock(pvsk);
   PvtcpPutSock(pvsk);
}


/*
 * AIO functions called from the main AIO processing function.
 * Most of these functions complete processing initiated by the corresponding
 * offload operations above.
 */

/**
 * @brief Processes socket release in an AIO thread. This function is
 *    called with the socket 'in' lock taken.
 * @param[in,out] pvsk socket to release.
 * @sideeffect the socket will be released upon return from this function.
 */

static inline void
ReleaseAIO(PvtcpSock *pvsk)
{
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_RELEASE,
      .data64 = pvsk->peerSock,
      .data64ex = PvtcpGetHandle(pvsk)
   };
   unsigned long long timeout = COMM_MAX_TO;

   /* Send the release reply and drop the socket under the 'out' lock. */
   SOCK_OUT_LOCK(pvsk);
   CommSvc_Write(pvsk->channel, &packet, &timeout);
#if defined(PVTCP_FULL_DEBUG)
   CommOS_Debug(("%s: Sent 'Release' [0x%p] -> 0x%0x] reply.\n",
                 __FUNCTION__, pvsk, (unsigned)(pvsk->peerSock)));
#endif
   /*
    * 'sk' goes away in the final ProcessAIO::sock_put()
    */
   SockReleaseWrapper(sock);
   SOCK_OUT_UNLOCK(pvsk);

   PvtcpStateRemoveSocket(pvsk->channel, pvsk);
}


/**
 * @brief Processes socket create reply in an AIO thread. This function is
 *    called with the socket 'in' lock taken.
 * @param[in,out] pvsk newly created socket to send ack for.
 */

static inline void
CreateAIO(PvtcpSock *pvsk)
{
   struct sock *sk;
   struct socket *sock;
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_CREATE,
      .data64 = pvsk->peerSock,
   };
   unsigned long long timeout = COMM_MAX_TO;
   int rc;

   sk = SkFromPvsk(pvsk);
   if (!sk) {
      /*
       * This is a create-error socket. The error reply has been sent out
       * already, by PvtcpFlowAIO(). This is a paranoid safety measure, as
       * PVTCP_OP_CREATE OpFlag should not have been set.
       */

      return;
   }

   sock = sk->sk_socket;
   packet.data64ex = PvtcpGetHandle(pvsk);

   rc = CommSvc_Write(pvsk->channel, &packet, &timeout);
   if (rc != packet.len) {
      /* We mustn't leak it if PV can't get a hold of it. */

      PvtcpStateRemoveSocket(pvsk->channel, pvsk);
      SockReleaseWrapper(sock);
      CommOS_Log(("%s: BOOG -- Couldn't send 'Create' reply [0x%p]!\n",
                  __FUNCTION__, sk));
   } else {
#if defined(PVTCP_FULL_DEBUG)
      CommOS_Debug(("%s: Sent 'Create' [0x%p] reply [%d].\n",
                    __FUNCTION__, pvsk, rc));
#endif
   }
}


/**
 * @brief Processes socket bind in an AIO thread. This function is
 *    called with the socket 'in' lock taken.
 * @param[in,out] pvsk socket being bound.
 */

static inline void
BindAIO(PvtcpSock *pvsk)
{
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_BIND,
      .data64 = pvsk->peerSock
   };
   unsigned long long timeout = COMM_MAX_TO;
   int rc;

   /* Only reply while the peer still references this socket. */
   if (pvsk->peerSockSet) {
      if (sk->sk_family == AF_INET) {
         struct sockaddr_in sin = { .sin_family = AF_INET };
         int addrLen = sizeof sin;

         rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin_port;
            PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr);
            packet.data64ex = (unsigned long long)sin.sin_addr.s_addr;
         }
      } else { /* AF_INET6 */
         struct sockaddr_in6 sin = { .sin6_family = AF_INET6 };
         int addrLen = sizeof sin;

         rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin6_port;
            PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr);
            PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0],
                            &packet.data64ex, &packet.data64ex2);
         }
      }

      if (rc) {
         /* Bound-address retrieval failed: demote the reply to a flow op. */
         COMM_OPF_SET_ERR(packet.flags);
         packet.data32ex = (unsigned int)(-rc);
         packet.opCode = PVTCP_OP_FLOW;
      }
      CommSvc_Write(pvsk->channel, &packet, &timeout);
#if defined(PVTCP_FULL_DEBUG)
      CommOS_Debug(("%s: Sent 'Bind' [0x%p, %d] reply.\n",
                    __FUNCTION__, pvsk, rc));
#endif
   }
}


/**
 * @brief Sends result of setsockopt back to guest.
 *    called with the socket 'in' lock taken.
+ * @param[in,out] pvsk socket that was modified. + */ + +static inline void +SetSockOptAIO(PvtcpSock *pvsk) +{ + CommPacket packet; + unsigned long long timeout; + + packet.len = sizeof packet; + packet.flags = 0; + packet.opCode = PVTCP_OP_SETSOCKOPT; + packet.data64 = pvsk->peerSock; + packet.data32 = (unsigned int)(pvsk->rpcStatus); + timeout = COMM_MAX_TO; + CommSvc_Write(pvsk->channel, &packet, &timeout); + pvsk->rpcStatus = 0; +} + + +/** + * @brief Sends result of getsockopt back to guest. + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket that was modified. + */ + +static inline void +GetSockOptAIO(PvtcpSock *pvsk) +{ + CommPacket packet = { + .opCode = PVTCP_OP_GETSOCKOPT, + .flags = 0 + }; + unsigned long long timeout = COMM_MAX_TO; + + struct kvec vec[1]; + struct kvec *inVec = vec; + unsigned int vecLen = 1; + unsigned int iovOffset = 0; + + if (pvsk->rpcStatus > 0) { + packet.len = sizeof packet + pvsk->rpcStatus; + vec[0].iov_base = pvsk->rpcReply; + vec[0].iov_len = pvsk->rpcStatus; + } else { + vecLen = 0; + } + + packet.data64 = pvsk->peerSock; + packet.data32 = pvsk->rpcStatus; + + CommSvc_WriteVec(pvsk->channel, &packet, &inVec, &vecLen, + &timeout, &iovOffset); + + if (pvsk->rpcReply) { + CommOS_Kfree(pvsk->rpcReply); + pvsk->rpcReply = NULL; + } + pvsk->rpcStatus = 0; +} + + +/** + * @brief Sends result of ioctl back to guest. + * called with the socket 'in' lock taken. + * @param[in,out] pvsk socket that was modified. + */ + +static inline void +IoctlAIO(PvtcpSock *pvsk) +{ + CommPacket packet = { + .len = sizeof packet, + .opCode = PVTCP_OP_IOCTL, + .flags = 0 + }; + unsigned long long timeout = COMM_MAX_TO; + + packet.data64 = pvsk->peerSock; + packet.data32 = pvsk->rpcStatus; + CommSvc_Write(pvsk->channel, &packet, &timeout); + pvsk->rpcStatus = 0; +} + + +/** + * @brief Processes socket listen reply in an AIO thread. This function is + * called with the socket 'in' lock taken. 
 * @param[in,out] pvsk socket being put in listen mode.
 */

static inline void
ListenAIO(PvtcpSock *pvsk)
{
   struct sock *sk = SkFromPvsk(pvsk);
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_LISTEN,
      .data64 = pvsk->peerSock
   };
   unsigned long long timeout = COMM_MAX_TO;

   /* Only reply while the peer still references this socket. */
   if (pvsk->peerSockSet) {
      if (sk->sk_state != TCP_LISTEN) {
         /* Listen failed: demote the reply to a flow op carrying the error. */
         COMM_OPF_SET_ERR(packet.flags);
         packet.data32ex = (unsigned int)pvsk->err;
         packet.opCode = PVTCP_OP_FLOW;
      }

      CommSvc_Write(pvsk->channel, &packet, &timeout);
#if defined(PVTCP_FULL_DEBUG)
      CommOS_Debug(("%s: Sent 'Listen' [0x%p] reply.\n", __FUNCTION__, pvsk));
#endif
   }
}


/**
 * @brief Processes socket accept reply in an AIO thread. This function is
 *    called with the socket 'in' lock taken.
 * @param[in,out] pvsk new socket or socket to accept on (see PvtcpAcceptOp).
 */

static inline void
AcceptAIO(PvtcpSock *pvsk)
{
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_ACCEPT
   };
   unsigned long long timeout = COMM_MAX_TO;
   const int enable = 1;
   int rc;

   if (pvsk->peerSockSet) {
      unsigned long long payloadSocks[2] = { 0, 0 };
      struct kvec payloadVec[] = {
         { .iov_base = &payloadSocks, .iov_len = sizeof payloadSocks }
      };
      struct kvec *payload = payloadVec;
      unsigned int payloadLen = 1;
      unsigned int iovOffset = 0;

      packet.len = sizeof packet + sizeof payloadSocks;

      /*
       * accept() succeeded, so this is the child socket; its state field
       * was temporarily changed to hold the parent/accepting socket.
       * The newly accepted socket and its peer need to be put in a
       * payload since we use up all available header fields with
       * addressing information. Finally, the state field is restored.
       */

      packet.data64 = ((PvtcpSock *)pvsk->state)->peerSock;
      pvsk->state = CommSvc_GetState(pvsk->channel);

      payloadSocks[0] = pvsk->peerSock;
      payloadSocks[1] = PvtcpGetHandle(pvsk);

      rc = 0;
      if (sk->sk_family == AF_INET) {
         struct sockaddr_in sin = { .sin_family = AF_INET };
         int addrLen = sizeof sin;

         rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin_port;
            PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr);
            packet.data64ex = (unsigned long long)sin.sin_addr.s_addr;
         }
      } else { /* AF_INET6 */
         struct sockaddr_in6 sin = { .sin6_family = AF_INET6 };
         int addrLen = sizeof sin;

         rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin6_port;
            PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr);
            PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0],
                            &packet.data64ex, &packet.data64ex2);
         }
      }

      if (rc == 0) {
         kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
                           (void *)&enable, sizeof enable);
         kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
                           (void *)&enable, sizeof enable);
         kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE,
                           (void *)&enable, sizeof enable);
      } else {
         /* Peer name retrieval failed: drop the child, report a flow error. */
         PvtcpStateRemoveSocket(pvsk->channel, pvsk);
         SockReleaseWrapper(sock);
         COMM_OPF_SET_ERR(packet.flags);
         packet.data32ex = (unsigned int)ECONNABORTED;
         packet.len = sizeof packet;
         packet.opCode = PVTCP_OP_FLOW;
      }

      rc = CommSvc_WriteVec(pvsk->channel, &packet,
                            &payload, &payloadLen, &timeout, &iovOffset);
      if ((rc != packet.len) && !COMM_OPF_TEST_ERR(packet.flags)) {
         /*
          * Mustn't leak the new socket if PV can't get a hold of it.
          * (The error-flag test prevents a double release of 'sock'.)
          */

         PvtcpStateRemoveSocket(pvsk->channel, pvsk);
         SockReleaseWrapper(sock);
      }
#if defined(PVTCP_FULL_DEBUG)
      CommOS_Debug(("%s: Sent 'Accept' [0x%p] reply.\n", __FUNCTION__, pvsk));
#endif
   }
}


/**
 * @brief Processes socket connect in an AIO thread. This function is
 *    called with the socket 'in' lock taken.
 * @param[in,out] pvsk socket being connected.
 */

static inline void
ConnectAIO(PvtcpSock *pvsk)
{
   struct sock *sk = SkFromPvsk(pvsk);
   struct socket *sock = sk->sk_socket;
   CommPacket packet = {
      .len = sizeof packet,
      .flags = 0,
      .opCode = PVTCP_OP_CONNECT,
      .data64 = pvsk->peerSock
   };
   unsigned long long timeout = COMM_MAX_TO;
   const int enable = 1;
   int rc;

   /*
    * Reply only for disconnects, or once the connection is actually
    * established; otherwise wait for a later StateChangeCB to re-schedule.
    */
   if (!pvsk->peerSockSet ||
       (!PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT) &&
        (sk->sk_state != TCP_ESTABLISHED))) {
      return;
   }

   if (PvskTestFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT)) {
      COMM_OPF_SET_VAL(packet.flags, 1);
      PvskSetFlag(pvsk, PVTCP_OFF_PVSKF_DISCONNECT, 0);
   } else if (sk->sk_state == TCP_ESTABLISHED) {
      if (sk->sk_family == AF_INET) {
         struct sockaddr_in sin = { .sin_family = AF_INET };
         int addrLen = sizeof sin;

         rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin_port;
            PvtcpResetLoopbackInet4(pvsk, &sin.sin_addr.s_addr);
            packet.data64ex = (unsigned long long)sin.sin_addr.s_addr;
         }
      } else { /* AF_INET6 */
         struct sockaddr_in6 sin = { .sin6_family = AF_INET6 };
         int addrLen = sizeof sin;

         rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &addrLen);
         if (rc == 0) {
            packet.data16 = sin.sin6_port;
            PvtcpResetLoopbackInet6(pvsk, &sin.sin6_addr);
            PvtcpI6AddrPack(&sin.sin6_addr.s6_addr32[0],
                            &packet.data64ex, &packet.data64ex2);
         }
      }

      if (rc == 0) {
         kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
                           (void *)&enable, sizeof enable);
         kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
                           (void *)&enable, sizeof enable);
         kernel_setsockopt(sock, SOL_SOCKET, SO_OOBINLINE,
                           (void *)&enable, sizeof enable);
      } else {
         /* Local name retrieval failed: demote the reply to a flow error. */
         COMM_OPF_SET_ERR(packet.flags);
         packet.data32ex = ECONNABORTED;
         packet.opCode = PVTCP_OP_FLOW;
      }
   }

   CommSvc_Write(pvsk->channel, &packet, &timeout);
#if defined(PVTCP_FULL_DEBUG)
   CommOS_Debug(("%s: Sent 'Connect' [0x%p] reply.\n", __FUNCTION__, pvsk));
#endif
}


/**
 * @brief Server side main asynchronous processing function. It writes to
 *    socket queued output buffers, it reads from socket and outputs to PV; it
 *    also completes operation processing and sends applicable replies to PV.
 *    Finally, processes error reporting and delta size acks.
 * @param arg socket work item.
 */

void
PvtcpProcessAIO(CommOSWork *arg)
{
   PvtcpSock *pvsk = container_of(arg, PvtcpSock, work);
   struct sock *sk = SkFromPvsk(pvsk);

   if (!SOCK_OUT_TRYLOCK(pvsk)) {
      /*
       * Queued output processing. If trylock failed, we don't retry.
       * There are only two reasons for not being able to take the lock:
       * - IoOp() has it -- when done, it reschedules us if we're not running.
       * - OutputAIO() is already running on another core.
       */

      if (sk && sk->sk_socket) {
         PvtcpOutputAIO(pvsk);
      }
      SOCK_OUT_UNLOCK(pvsk);
   }

   /* All other processing needs the socket IN lock. */

   if (!SOCK_IN_TRYLOCK(pvsk)) {

      if (sk && sk->sk_socket) {
         int err;

         /* Input processing. */

         /*
          * Workqueue handlers are pinned to a CPU core and therefore not
          * migratable. No need to disable preemption.
          */
         err = PvtcpInputAIO(pvsk, perCpuBuf[smp_processor_id()]);

         /* Error and ack notifications. */

         PvtcpFlowAIO(pvsk, err);

         if (!pvsk->opFlags) {
            /* No other operations need to be completed. */

            goto doneInUnlock;
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_RELEASE);
            ReleaseAIO(pvsk);

            /* All possible in-flight operations must be dropped. */
            goto doneInUnlock;
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_CREATE)) {
            /* No state locking required. */

            PvskResetOpFlag(pvsk, PVTCP_OP_CREATE);
            CreateAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_BIND)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_BIND);
            BindAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_SETSOCKOPT)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_SETSOCKOPT);
            SetSockOptAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_GETSOCKOPT)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_GETSOCKOPT);
            GetSockOptAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_IOCTL)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_IOCTL);
            IoctlAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_LISTEN)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_LISTEN);
            ListenAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_ACCEPT)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_ACCEPT);
            AcceptAIO(pvsk);
         }

         if (PvskTestOpFlag(pvsk, PVTCP_OP_CONNECT)) {
            PvskResetOpFlag(pvsk, PVTCP_OP_CONNECT);
            ConnectAIO(pvsk);
         }

doneInUnlock:
         SOCK_IN_UNLOCK(pvsk);
      } else {
         /*
          * Special case for error sockets which don't have a sk.
          * Note that this socket was created by SockAllocErrInit() and so
          * no 'real' socket sits atop it and is not present on any state
          * netif list. The socket has a refcnt of one and it will get
          * deallocated by the PvtcpPutSock() call below, so we don't need
          * to unlock it.
          */

         PvtcpFlowAIO(pvsk, -ENETDOWN);
      }
   } else {
      /* IN lock busy: reschedule ourselves if there is still work pending. */
      if ((pvsk->peerSockSet || PvskTestOpFlag(pvsk, PVTCP_OP_RELEASE)) &&
          sk && sk->sk_socket) {
         PvtcpSchedSock(pvsk);
      }
   }

   PvtcpPutSock(pvsk);
}
diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.h b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.h
new file mode 100644
index 0000000..34992da
--- /dev/null
+++ b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux.h
@@ -0,0 +1,226 @@
+/*
+ * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server
+ *
+ * Copyright (C) 2010-2012 VMware, Inc. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief Linux Offload definitions. + * This file is only meant to be included via pvtcp_off.h. + */ + +#ifndef _PVTCP_OFF_LINUX_H_ +#define _PVTCP_OFF_LINUX_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +typedef struct PvtcpSock { + struct sock *sk; + PVTCP_SOCK_COMMON_FIELDS; + PVTCP_OFF_SOCK_COMMON_FIELDS; + void (*destruct)(struct sock *sk); + void (*stateChange)(struct sock *sk); + void (*dataReady)(struct sock *sk, int bytes); + void (*writeSpace)(struct sock *sk); + void (*errorReport)(struct sock *sk); +} PvtcpSock; + + +typedef enum PvtcpSockNamespace { + PVTCP_SOCK_NAMESPACE_INITIAL, + PVTCP_SOCK_NAMESPACE_CHANNEL +} PvtcpSockNamespace; + + +/* Number of large datagram allocations. */ +extern unsigned long long pvtcpOffDgramAllocations; + +/* Inet4 loopback addresses. */ +extern unsigned int pvtcpLoopbackOffAddr; + +/* Get the 'struct sock' from a PvtcpSock. */ +#define SkFromPvsk(pvsk) ((pvsk)->sk) + +/* Get the PvtcpSock from a 'struct sock'. 
*/ +#define PvskFromSk(sk) ((PvtcpSock *)(sk)->sk_user_data) + +int +PvtcpTestAndBindLoopbackInet4(PvtcpSock *pvsk, + unsigned int *addr, + unsigned short port); +int +PvtcpTestAndBindLoopbackInet6(PvtcpSock *pvsk, + unsigned long long *addr0, + unsigned long long *addr1, + unsigned short port); + +void PvtcpResetLoopbackInet4(PvtcpSock *pvsk, unsigned int *addr); +void PvtcpResetLoopbackInet6(PvtcpSock *pvsk, struct in6_addr *in6); + +void PvtcpFlowAIO(PvtcpSock *pvsk, int eof); +void PvtcpOutputAIO(PvtcpSock *pvsk); +int PvtcpInputAIO(PvtcpSock *pvsk, void *perCpuBuf); + + +/** + * @brief Switches a socket to the channel, or the initial name space. + * @param pvsk socket to switch. + * @param ns which namespace to switch to. + */ + +static inline void +PvtcpSwitchSock(PvtcpSock *pvsk, + PvtcpSockNamespace ns) +{ +#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) + struct sock *sk; + struct net *prevNet; + + if (!pvsk) { + return; + } + sk = SkFromPvsk(pvsk); + if (!sk) { + /* If this is a phony, create fail reporting pvsk, just return. */ + + return; + } + + prevNet = sock_net(sk); + switch (ns) { + case PVTCP_SOCK_NAMESPACE_INITIAL: + sock_net_set(sk, get_net(&init_net)); + break; + case PVTCP_SOCK_NAMESPACE_CHANNEL: + sock_net_set(sk, get_net(pvsk->state->namespace)); + break; + } + put_net(prevNet); +#endif +} + + +/** + * @brief Tests whether a socket has an explicit namespace. + * @param pvsk socket to test. + * @return 1 if the socket has a namespace, 0 otherwise. + */ + +static inline int +PvtcpHasSockNamespace(PvtcpSock *pvsk) +{ +#if defined(CONFIG_NET_NS) && !defined(PVTCP_NET_NS_DISABLE) + struct sock *sk; + int rc = 0; + + if (!pvsk) { + return rc; + } + sk = SkFromPvsk(pvsk); + if (!sk) { + /* If this is a phony, create fail reporting pvsk, just return 0. */ + + return rc; + } + + rc = (sock_net(sk) != &init_net); + return rc; +#else + return 0; +#endif +} + + +/** + * @brief Retains the pvsock's underlying socket. 
+ * @param pvsk socket to retain. + */ + +static inline void +PvtcpHoldSock(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + + if (likely(sk)) { + sock_hold(sk); + } +} + + +/** + * @brief Releases a hold on the pvsock's underlying socket. If the underlying + * socket is NULL, this is an error socket and we deallocate it. + * @param pvsk socket to release hold on. + */ + +static inline void +PvtcpPutSock(PvtcpSock *pvsk) +{ + struct sock *sk = SkFromPvsk(pvsk); + + if (likely(sk)) { + sock_put(sk); + } else { + /* + * This is an error socket, which does _not_ have an underlying socket. + * We simply need to free it. + */ + + CommOS_Kfree(pvsk); + } +} + + +/** + * @brief Schedules an offload socket for AIO. + * @param pvsk socket to schedule. + * @sideeffect the socket will be processed by AIO threads. + */ + +static inline void +PvtcpSchedSock(PvtcpSock *pvsk) +{ + /* + * We must hold the socket before we enqueue it for AIO, such that it may + * not be released while in the workqueue. If CommSvc_ScheduleAIOWork() + * returned non-zero, it means the socket had already been enqueued. In + * that case, we release the hold. Otherwise, the hold is released by the + * AIO function (PvtcpProcessAIO()). + * Note that error pv sockets may only originate from synchronized RPCs, + * or to be more precise, from PvtcpCreateOp(), and not from IO processing; + * this means that they cannot be attempted to be enqueued more than once. + */ + + PvtcpHoldSock(pvsk); + if (CommSvc_ScheduleAIOWork(&pvsk->work)) { + PvtcpPutSock(pvsk); + } +} + +#endif // _PVTCP_OFF_LINUX_H_ diff --git a/arch/arm/mvp/pvtcpkm/pvtcp_off_linux_shim.S b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux_shim.S new file mode 100644 index 0000000..824286b --- /dev/null +++ b/arch/arm/mvp/pvtcpkm/pvtcp_off_linux_shim.S @@ -0,0 +1,70 @@ +/* + * Linux 2.6.32 and later Kernel module for VMware MVP PVTCP Server + * + * Copyright (C) 2010-2012 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#line 5 + +/** + * @file + * + * @brief PVTCP socket destructor shim. + * + * The module reference accounting code for socket destruction in the core + * Linux kernel does not know about PVTCP sockets, so it does not properly + * increment/decrement the reference count on pvtcpkm when calling through a + * function pointer into our destructor. If a module unload is requested on + * pvtcpkm while a socket is being destroyed, it is possible for the destructor + * to be preempted after decrementing the module reference count but before + * returning to the core kernel. If the module code is unmapped before the + * function return, it is possible that we will attempt to execute unmapped + * code, resulting in a host crash. + * + * This shim proxies socket destruction requests through to the PVTCP socket + * destructor, then jumps directly to module_put to drop the reference count. + * module_put will return directly to the caller, eliminating the race. + */ + +.text +.p2align 4 + +.global asmDestructorShim + +/** + * @brief Socket destructor callback. Calls into pvtcpkm to destroy a socket + * and then decrements the refcount. 
+ * @param r0 pointer to struct sock + */ + +asmDestructorShim: + push {lr} + ldr r1, targetAddr @ Destroy socket + blx r1 + pop {lr} + cmp r0, #0 + bxne lr @ We shouldn't module_put, just return. + ldr r0, owner + ldr r1, modulePutAddr @ Jump to module_put. module_put + bx r1 @ returns directly to caller + +owner: + .word __this_module + +targetAddr: + .word DestructCB + +modulePutAddr: + .word module_put -- cgit v1.1