Use vector operations for WebSocket masking.

GCC and Clang can transparently generate vector operations from normal C++ code, just by changing the type. This permits using SIMD instructions on x86(-64), and in the future on ARM, without additional code complexity.

This increases the speed of masking by 50%, from 18GBps to 27GBps, on x86-64.

ARM support is waiting for Neon SIMD support to be enabled by default. See crbug.com/448055.

SSE2 has been the default on x86 platforms for over a year.

TEST=net_unittests
BUG=539259

Review URL: https://codereview.chromium.org/1382143002

Cr-Commit-Position: refs/heads/master@{#352592}
author: ricea <ricea@chromium.org> 2015-10-06 06:41:43 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-10-06 13:42:29 +0000
commit: 500dbe3cfc18c490cc466abef162896302284814 (patch)
tree: f6650cc5b302dc93f9dd6a2b96eae63167dd5aac /net/websockets
parent: 10eb8250bc72d8c7211244a0930f126bb5f635bd (diff)
download: chromium_src-500dbe3cfc18c490cc466abef162896302284814.zip
chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.gz
chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.bz2
2 files changed, 31 insertions, 7 deletions
diff --git a/net/websockets/websocket_frame.cc b/net/websockets/websocket_frame.cc
index 3782f28..a702207 100644
--- a/net/websockets/websocket_frame.cc
+++ b/net/websockets/websocket_frame.cc
@@ -4,6 +4,9 @@
 
 #include "net/websockets/websocket_frame.h"
 
+#include <stddef.h>
+#include <stdint.h>
+
 #include <algorithm>
 
 #include "base/basictypes.h"
@@ -17,6 +20,21 @@ namespace net {
 
 namespace {
 
+// GCC (and Clang) can transparently use vector ops. Only try to do this on
+// architectures where we know it works, otherwise gcc will attempt to emulate
+// the vector ops, which is unlikely to be efficient.
+// TODO(ricea): Add ARCH_CPU_ARM_FAMILY when arm_neon=1 becomes the default.
+#if defined(COMPILER_GCC) && defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
+
+using PackedMaskType = uint32_t __attribute__((vector_size(16)));
+
+#else
+
+using PackedMaskType = size_t;
+
+#endif  // defined(COMPILER_GCC) && defined(ARCH_CPU_X86_FAMILY) &&
+        // !defined(OS_NACL)
+
 const uint8 kFinalBit = 0x80;
 const uint8 kReserved1Bit = 0x40;
 const uint8 kReserved2Bit = 0x20;
@@ -171,16 +189,14 @@ void MaskWebSocketFramePayload(const WebSocketMaskingKey& masking_key,
 
   DCHECK_GE(data_size, 0);
 
-  // Most of the masking is done one word at a time, except for the beginning
-  // and the end of the buffer which may be unaligned. We use size_t to get the
-  // word size for this architecture. We require it be a multiple of
-  // kMaskingKeyLength in size.
-  typedef size_t PackedMaskType;
-  PackedMaskType packed_mask_key = 0;
+  // Most of the masking is done in chunks of sizeof(PackedMaskType), except for
+  // the beginning and the end of the buffer which may be unaligned.
+  // PackedMaskType must be a multiple of kMaskingKeyLength in size.
+  PackedMaskType packed_mask_key;
   static const size_t kPackedMaskKeySize = sizeof(packed_mask_key);
   static_assert((kPackedMaskKeySize >= kMaskingKeyLength &&
                  kPackedMaskKeySize % kMaskingKeyLength == 0),
-                "word size is not a multiple of mask length");
+                "PackedMaskType size is not a multiple of mask length");
   char* const end = data + data_size;
   // If the buffer is too small for the vectorised version to be useful, revert
   // to the byte-at-a-time implementation early.
diff --git a/net/websockets/websocket_frame_perftest.cc b/net/websockets/websocket_frame_perftest.cc
index 98ea624..bd1de1c 100644
--- a/net/websockets/websocket_frame_perftest.cc
+++ b/net/websockets/websocket_frame_perftest.cc
@@ -53,6 +53,14 @@ TEST_F(WebSocketFrameTestMaskBenchmark, BenchmarkMaskLongPayload) {
   Benchmark("Frame_mask_long_payload", &payload.front(), payload.size());
 }
 
+// A 31-byte payload is guaranteed to do 7 byte mask operations and 3 vector
+// mask operations with an 8-byte vector. With a 16-byte vector it will fall
+// back to the byte-only code path and do 31 byte mask operations.
+TEST_F(WebSocketFrameTestMaskBenchmark, Benchmark31BytePayload) {
+  std::vector<char> payload(31, 'a');
+  Benchmark("Frame_mask_31_payload", &payload.front(), payload.size());
+}
+
 }  // namespace
 
 }  // namespace net
author	ricea <ricea@chromium.org>	2015-10-06 06:41:43 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-10-06 13:42:29 +0000
commit	500dbe3cfc18c490cc466abef162896302284814 (patch)
tree	f6650cc5b302dc93f9dd6a2b96eae63167dd5aac /net/websockets
parent	10eb8250bc72d8c7211244a0930f126bb5f635bd (diff)
download	chromium_src-500dbe3cfc18c490cc466abef162896302284814.zip chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.gz chromium_src-500dbe3cfc18c490cc466abef162896302284814.tar.bz2