diff options
author | ricea <ricea@chromium.org> | 2015-10-06 06:41:43 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-10-06 13:42:29 +0000 |
commit | 500dbe3cfc18c490cc466abef162896302284814 (patch) | |
tree | f6650cc5b302dc93f9dd6a2b96eae63167dd5aac /net/websockets | |
parent | 10eb8250bc72d8c7211244a0930f126bb5f635bd (diff) | |
download | chromium_src-500dbe3cfc18c490cc466abef162896302284814.zip chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.gz chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.bz2 |
Use vector operations for WebSocket masking.
GCC and Clang can transparently generate vector operations from normal
C++ code, just by changing the type. This permits using SIMD
instructions on x86(-64) and, in the future, ARM without additional code
complexity.
This increases the speed of masking by 50%, from 18GBps to 27GBps,
on x86-64.
ARM support is waiting for NEON SIMD support to be enabled by
default. See crbug.com/448055.
SSE2 has been the default on x86 platforms for over a year.
TEST=net_unittests
BUG=539259
Review URL: https://codereview.chromium.org/1382143002
Cr-Commit-Position: refs/heads/master@{#352592}
Diffstat (limited to 'net/websockets')
-rw-r--r-- | net/websockets/websocket_frame.cc | 30 | ||||
-rw-r--r-- | net/websockets/websocket_frame_perftest.cc | 8 |
2 files changed, 31 insertions, 7 deletions
diff --git a/net/websockets/websocket_frame.cc b/net/websockets/websocket_frame.cc index 3782f28..a702207 100644 --- a/net/websockets/websocket_frame.cc +++ b/net/websockets/websocket_frame.cc @@ -4,6 +4,9 @@ #include "net/websockets/websocket_frame.h" +#include <stddef.h> +#include <stdint.h> + #include <algorithm> #include "base/basictypes.h" @@ -17,6 +20,21 @@ namespace net { namespace { +// GCC (and Clang) can transparently use vector ops. Only try to do this on +// architectures where we know it works, otherwise gcc will attempt to emulate +// the vector ops, which is unlikely to be efficient. +// TODO(ricea): Add ARCH_CPU_ARM_FAMILY when arm_neon=1 becomes the default. +#if defined(COMPILER_GCC) && defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) + +using PackedMaskType = uint32_t __attribute__((vector_size(16))); + +#else + +using PackedMaskType = size_t; + +#endif // defined(COMPILER_GCC) && defined(ARCH_CPU_X86_FAMILY) && + // !defined(OS_NACL) + const uint8 kFinalBit = 0x80; const uint8 kReserved1Bit = 0x40; const uint8 kReserved2Bit = 0x20; @@ -171,16 +189,14 @@ void MaskWebSocketFramePayload(const WebSocketMaskingKey& masking_key, DCHECK_GE(data_size, 0); - // Most of the masking is done one word at a time, except for the beginning - // and the end of the buffer which may be unaligned. We use size_t to get the - // word size for this architecture. We require it be a multiple of - // kMaskingKeyLength in size. - typedef size_t PackedMaskType; - PackedMaskType packed_mask_key = 0; + // Most of the masking is done in chunks of sizeof(PackedMaskType), except for + // the beginning and the end of the buffer which may be unaligned. + // PackedMaskType must be a multiple of kMaskingKeyLength in size. 
+ PackedMaskType packed_mask_key; static const size_t kPackedMaskKeySize = sizeof(packed_mask_key); static_assert((kPackedMaskKeySize >= kMaskingKeyLength && kPackedMaskKeySize % kMaskingKeyLength == 0), - "word size is not a multiple of mask length"); + "PackedMaskType size is not a multiple of mask length"); char* const end = data + data_size; // If the buffer is too small for the vectorised version to be useful, revert // to the byte-at-a-time implementation early. diff --git a/net/websockets/websocket_frame_perftest.cc b/net/websockets/websocket_frame_perftest.cc index 98ea624..bd1de1c 100644 --- a/net/websockets/websocket_frame_perftest.cc +++ b/net/websockets/websocket_frame_perftest.cc @@ -53,6 +53,14 @@ TEST_F(WebSocketFrameTestMaskBenchmark, BenchmarkMaskLongPayload) { Benchmark("Frame_mask_long_payload", &payload.front(), payload.size()); } +// A 31-byte payload is guaranteed to do 7 byte mask operations and 3 vector +// mask operations with an 8-byte vector. With a 16-byte vector it will fall +// back to the byte-only code path and do 31 byte mask operations. +TEST_F(WebSocketFrameTestMaskBenchmark, Benchmark31BytePayload) { + std::vector<char> payload(31, 'a'); + Benchmark("Frame_mask_31_payload", &payload.front(), payload.size()); +} + } // namespace } // namespace net |