diff options
Diffstat (limited to 'src/crypto/cpu-intel.c')
-rw-r--r-- | src/crypto/cpu-intel.c | 157 |
1 files changed, 145 insertions, 12 deletions
diff --git a/src/crypto/cpu-intel.c b/src/crypto/cpu-intel.c index df0e127..924bab0 100644 --- a/src/crypto/cpu-intel.c +++ b/src/crypto/cpu-intel.c @@ -68,8 +68,58 @@ #include <stdio.h> #include <string.h> -/* OPENSSL_ia32_cpuid is defined in cpu-x86_64-asm.pl. */ -extern uint64_t OPENSSL_ia32_cpuid(uint32_t*); +#if defined(OPENSSL_WINDOWS) +#pragma warning(push, 3) +#include <immintrin.h> +#include <intrin.h> +#pragma warning(pop) +#endif + + +/* OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX + * is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through + * |*out_edx|. */ +static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, + uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { +#if defined(OPENSSL_WINDOWS) + int tmp[4]; + __cpuid(tmp, (int)leaf); + *out_eax = (uint32_t)tmp[0]; + *out_ebx = (uint32_t)tmp[1]; + *out_ecx = (uint32_t)tmp[2]; + *out_edx = (uint32_t)tmp[3]; +#elif defined(__pic__) && defined(OPENSSL_32_BIT) + /* Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. */ + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#else + __asm__ volatile ( + "xor %%ecx, %%ecx\n" + "cpuid\n" + : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf) + ); +#endif +} + +/* OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). + * Currently only XCR0 is defined by Intel so |xcr| should always be zero. */ +static uint64_t OPENSSL_xgetbv(uint32_t xcr) { +#if defined(OPENSSL_WINDOWS) + return (uint64_t)_xgetbv(xcr); +#else + uint32_t eax, edx; + __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (((uint64_t)edx) << 32) | eax; +#endif +} /* handle_cpu_env applies the value from |in| to the CPUID values in |out[0]| * and |out[1]|. See the comment in |OPENSSL_cpuid_setup| about this. */ @@ -91,18 +141,101 @@ static void handle_cpu_env(uint32_t *out, const char *in) { } void OPENSSL_cpuid_setup(void) { - const char *env1, *env2; + /* Determine the vendor and maximum input value. */ + uint32_t eax, ebx, ecx, edx; + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); -#if defined(OPENSSL_X86_64) - OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P); -#else - uint64_t vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P); - /* 1<<10 sets a reserved bit to indicate that the variable - * was already initialised. */ - OPENSSL_ia32cap_P[0] = ((uint32_t)vec) | (1 << 10); - OPENSSL_ia32cap_P[1] = vec >> 32; -#endif + uint32_t num_ids = eax; + + int is_intel = ebx == 0x756e6547 /* Genu */ && + edx == 0x49656e69 /* ineI */ && + ecx == 0x6c65746e /* ntel */; + int is_amd = ebx == 0x68747541 /* Auth */ && + edx == 0x69746e65 /* enti */ && + ecx == 0x444d4163 /* cAMD */; + + int has_amd_xop = 0; + if (is_amd) { + /* AMD-specific logic. + * See http://developer.amd.com/wordpress/media/2012/10/254811.pdf */ + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000000); + uint32_t num_extended_ids = eax; + if (num_extended_ids >= 0x80000001) { + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000001); + if (ecx & (1 << 11)) { + has_amd_xop = 1; + } + } + } + + uint32_t extended_features = 0; + if (num_ids >= 7) { + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); + extended_features = ebx; + } + /* Determine the number of cores sharing an L1 data cache to adjust the + * hyper-threading bit. */ + uint32_t cores_per_cache = 0; + if (is_amd) { + /* AMD CPUs never share an L1 data cache between threads but do set the HTT + * bit on multi-core CPUs. */ + cores_per_cache = 1; + } else if (num_ids >= 4) { + /* TODO(davidben): The Intel manual says this CPUID leaf enumerates all + * caches using ECX and doesn't say which is first. Does this matter? */ + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 4); + cores_per_cache = 1 + ((eax >> 14) & 0xfff); + } + + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); + + /* Adjust the hyper-threading bit. */ + if (edx & (1 << 28)) { + uint32_t num_logical_cores = (ebx >> 16) & 0xff; + if (cores_per_cache == 1 || num_logical_cores <= 1) { + edx &= ~(1 << 28); + } + } + + /* Reserved bit #20 was historically repurposed to control the in-memory + * representation of RC4 state. Always set it to zero. */ + edx &= ~(1 << 20); + + /* Reserved bit #30 is repurposed to signal an Intel CPU. */ + if (is_intel) { + edx |= (1 << 30); + } else { + edx &= ~(1 << 30); + } + + /* The SDBG bit is repurposed to denote AMD XOP support. */ + if (has_amd_xop) { + ecx |= (1 << 11); + } else { + ecx &= ~(1 << 11); + } + + uint64_t xcr0 = 0; + if (ecx & (1 << 27)) { + /* XCR0 may only be queried if the OSXSAVE bit is set. */ + xcr0 = OPENSSL_xgetbv(0); + } + /* See Intel manual, section 14.3. */ + if ((xcr0 & 6) != 6) { + /* YMM registers cannot be used. */ + ecx &= ~(1 << 28); /* AVX */ + ecx &= ~(1 << 12); /* FMA */ + ecx &= ~(1 << 11); /* AMD XOP */ + extended_features &= ~(1 << 5); /* AVX2 */ + } + + OPENSSL_ia32cap_P[0] = edx; + OPENSSL_ia32cap_P[1] = ecx; + OPENSSL_ia32cap_P[2] = extended_features; + OPENSSL_ia32cap_P[3] = 0; + + const char *env1, *env2; env1 = getenv("OPENSSL_ia32cap"); if (env1 == NULL) { return; |