summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrobert.bradford <robert.bradford@intel.com>2014-11-05 06:59:34 -0800
committerCommit bot <commit-bot@chromium.org>2014-11-05 14:59:51 +0000
commit02a95e3084f979084fa8586e1718a6e6dd4c22da (patch)
treedc9ba85e9c88dce29436e424bc86fbfb0cc1a22d
parentf6413d5aa9abf63f9236060264774bd66df48271 (diff)
downloadchromium_src-02a95e3084f979084fa8586e1718a6e6dd4c22da.zip
chromium_src-02a95e3084f979084fa8586e1718a6e6dd4c22da.tar.gz
chromium_src-02a95e3084f979084fa8586e1718a6e6dd4c22da.tar.bz2
Reland "Integrate SIMD optimisations for zlib"
This version uses a "pthread_once" implementation, using Windows synchronisation primitives, imported from tcmalloc. Previous CLs: https://codereview.chromium.org/677713002/ https://codereview.chromium.org/552123005 This version of the CL also runs fine on Windows Server 2003. These optimisations have been published on zlib mailing list and at https://github.com/jtkukunas/zlib/ This change merges the following optimisation patches: - "For x86, add CPUID check." - "Adds SSE2 optimized hash shifting to fill_window." - "add SSE4.2 optimized hash function" - "add PCLMULQDQ optimized CRC folding" From Jim Kukunas <james.t.kukunas@linux.intel.com>; and adapts them to the current zlib version in Chromium. The optimisations are enabled at runtime if all the necessary CPU features are present. As the optimisations require extra cflags to enable the compiler to use the instructions the optimisations are held in their own static library with a stub implementation to allow linking on other platforms. TEST=net_unittests(GZipUnitTest) passes, Chrome functions and performance improvement seen on RoboHornet benchmark on Linux Desktop BUG=401517 Review URL: https://codereview.chromium.org/678423002 Cr-Commit-Position: refs/heads/master@{#302799}
-rw-r--r--net/spdy/spdy_framer_test.cc173
-rw-r--r--third_party/zlib/BUILD.gn15
-rw-r--r--third_party/zlib/README.chromium16
-rw-r--r--third_party/zlib/crc32.c27
-rw-r--r--third_party/zlib/crc_folding.c493
-rw-r--r--third_party/zlib/deflate.c139
-rw-r--r--third_party/zlib/deflate.h12
-rw-r--r--third_party/zlib/fill_window_sse.c175
-rw-r--r--third_party/zlib/simd_stub.c35
-rw-r--r--third_party/zlib/x86.c112
-rw-r--r--third_party/zlib/x86.h13
-rw-r--r--third_party/zlib/zlib.gyp23
-rw-r--r--third_party/zlib/zutil.h6
-rw-r--r--tools/msan/blacklist.txt3
14 files changed, 1209 insertions, 33 deletions
diff --git a/net/spdy/spdy_framer_test.cc b/net/spdy/spdy_framer_test.cc
index e7eb092..7a1c3df 100644
--- a/net/spdy/spdy_framer_test.cc
+++ b/net/spdy/spdy_framer_test.cc
@@ -2045,8 +2045,8 @@ TEST_P(SpdyFramerTest, CreateSynStreamCompressed) {
{
const char kDescription[] =
"SYN_STREAM frame, low pri, no FIN";
-
const SpdyPriority priority = IsSpdy2() ? 2 : 4;
+
const unsigned char kV2FrameData[] = {
0x80, spdy_version_ch_, 0x00, 0x01,
0x00, 0x00, 0x00, 0x36,
@@ -2083,15 +2083,70 @@ TEST_P(SpdyFramerTest, CreateSynStreamCompressed) {
0x80, 0x00, 0x00, 0x00,
0x00, 0xFF, 0xFF,
};
+ const unsigned char kV2SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x33,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x38, 0xea,
+ 0xdf, 0xa2, 0x51, 0xb2,
+ 0x62, 0x60, 0x62, 0x60,
+ 0x4e, 0x4a, 0x2c, 0x62,
+ 0x60, 0x06, 0x08, 0xa0,
+ 0xb4, 0xfc, 0x7c, 0x80,
+ 0x00, 0x62, 0x60, 0x06,
+ 0x13, 0x00, 0x01, 0x94,
+ 0x94, 0x58, 0x04, 0x10,
+ 0x40, 0x00, 0x00, 0x00,
+ 0x00, 0xff, 0xff,
+ };
+ const unsigned char kV3SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x32,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x38, 0xea,
+ 0xe3, 0xc6, 0xa7, 0xc2,
+ 0x02, 0xe5, 0x0e, 0x50,
+ 0xc2, 0x4b, 0x4a, 0x04,
+ 0xe5, 0x0b, 0x66, 0x80,
+ 0x00, 0x4a, 0xcb, 0xcf,
+ 0x07, 0x08, 0x20, 0x24,
+ 0x0a, 0x20, 0x80, 0x92,
+ 0x12, 0x8b, 0x00, 0x02,
+ 0x08, 0x00, 0x00, 0x00,
+ 0xff, 0xff,
+ };
+
SpdySynStreamIR syn_stream(1);
syn_stream.set_priority(priority);
syn_stream.SetHeader("bar", "foo");
syn_stream.SetHeader("foo", "bar");
scoped_ptr<SpdyFrame> frame(framer.SerializeSynStream(syn_stream));
+ const unsigned char* frame_data =
+ reinterpret_cast<const unsigned char*>(frame->data());
if (IsSpdy2()) {
- CompareFrame(kDescription, *frame, kV2FrameData, arraysize(kV2FrameData));
+ // Try comparing with SIMD version, if that fails, do a failing check
+ // with pretty printing against non-SIMD version
+ if (memcmp(frame_data,
+ kV2SIMDFrameData,
+ std::min(arraysize(kV2SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV2FrameData,
+ arraysize(kV2FrameData));
+ }
} else if (IsSpdy3()) {
- CompareFrame(kDescription, *frame, kV3FrameData, arraysize(kV3FrameData));
+ if (memcmp(frame_data,
+ kV3SIMDFrameData,
+ std::min(arraysize(kV3SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV3FrameData,
+ arraysize(kV3FrameData));
+ }
} else {
LOG(FATAL) << "Unsupported version in test.";
}
@@ -2280,14 +2335,66 @@ TEST_P(SpdyFramerTest, CreateSynReplyCompressed) {
0x00, 0x00, 0x00, 0xff,
0xff,
};
+ const unsigned char kV2SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x2f,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x38, 0xea,
+ 0xdf, 0xa2, 0x51, 0xb2,
+ 0x62, 0x60, 0x62, 0x60,
+ 0x4e, 0x4a, 0x2c, 0x62,
+ 0x60, 0x06, 0x08, 0xa0,
+ 0xb4, 0xfc, 0x7c, 0x80,
+ 0x00, 0x62, 0x60, 0x06,
+ 0x13, 0x00, 0x01, 0x94,
+ 0x94, 0x58, 0x04, 0x10,
+ 0x40, 0x00, 0x00, 0x00,
+ 0x00, 0xff, 0xff,
+ };
+ const unsigned char kV3SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x2c,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x38, 0xea, 0xe3, 0xc6,
+ 0xa7, 0xc2, 0x02, 0xe5,
+ 0x0e, 0x50, 0xc2, 0x4b,
+ 0x4a, 0x04, 0xe5, 0x0b,
+ 0x66, 0x80, 0x00, 0x4a,
+ 0xcb, 0xcf, 0x07, 0x08,
+ 0x20, 0x24, 0x0a, 0x20,
+ 0x80, 0x92, 0x12, 0x8b,
+ 0x00, 0x02, 0x08, 0x00,
+ 0x00, 0x00, 0xff, 0xff,
+ };
+
SpdySynReplyIR syn_reply(1);
syn_reply.SetHeader("bar", "foo");
syn_reply.SetHeader("foo", "bar");
scoped_ptr<SpdyFrame> frame(framer.SerializeSynReply(syn_reply));
+ const unsigned char* frame_data =
+ reinterpret_cast<const unsigned char*>(frame->data());
if (IsSpdy2()) {
- CompareFrame(kDescription, *frame, kV2FrameData, arraysize(kV2FrameData));
+ // Try comparing with SIMD version, if that fails, do a failing check
+ // with pretty printing against non-SIMD version
+ if (memcmp(frame_data,
+ kV2SIMDFrameData,
+ std::min(arraysize(kV2SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV2FrameData,
+ arraysize(kV2FrameData));
+ }
} else if (IsSpdy3()) {
- CompareFrame(kDescription, *frame, kV3FrameData, arraysize(kV3FrameData));
+ if (memcmp(frame_data,
+ kV3SIMDFrameData,
+ std::min(arraysize(kV3SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV3FrameData,
+ arraysize(kV3FrameData));
+ }
} else {
LOG(FATAL) << "Unsupported version in test.";
}
@@ -2886,14 +2993,66 @@ TEST_P(SpdyFramerTest, CreateHeadersCompressed) {
0x00, 0x00, 0x00, 0xff,
0xff,
};
+ const unsigned char kV2SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x08,
+ 0x00, 0x00, 0x00, 0x2f,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x38, 0xea,
+ 0xdf, 0xa2, 0x51, 0xb2,
+ 0x62, 0x60, 0x62, 0x60,
+ 0x4e, 0x4a, 0x2c, 0x62,
+ 0x60, 0x06, 0x08, 0xa0,
+ 0xb4, 0xfc, 0x7c, 0x80,
+ 0x00, 0x62, 0x60, 0x06,
+ 0x13, 0x00, 0x01, 0x94,
+ 0x94, 0x58, 0x04, 0x10,
+ 0x40, 0x00, 0x00, 0x00,
+ 0x00, 0xff, 0xff,
+ };
+ const unsigned char kV3SIMDFrameData[] = {
+ 0x80, spdy_version_ch_, 0x00, 0x08,
+ 0x00, 0x00, 0x00, 0x2c,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x38, 0xea, 0xe3, 0xc6,
+ 0xa7, 0xc2, 0x02, 0xe5,
+ 0x0e, 0x50, 0xc2, 0x4b,
+ 0x4a, 0x04, 0xe5, 0x0b,
+ 0x66, 0x80, 0x00, 0x4a,
+ 0xcb, 0xcf, 0x07, 0x08,
+ 0x20, 0x24, 0x0a, 0x20,
+ 0x80, 0x92, 0x12, 0x8b,
+ 0x00, 0x02, 0x08, 0x00,
+ 0x00, 0x00, 0xff, 0xff,
+ };
+
SpdyHeadersIR headers_ir(1);
headers_ir.SetHeader("bar", "foo");
headers_ir.SetHeader("foo", "bar");
scoped_ptr<SpdyFrame> frame(framer.SerializeHeaders(headers_ir));
+ const unsigned char* frame_data =
+ reinterpret_cast<const unsigned char*>(frame->data());
if (IsSpdy2()) {
- CompareFrame(kDescription, *frame, kV2FrameData, arraysize(kV2FrameData));
+ // Try comparing with SIMD version, if that fails, do a failing check
+ // with pretty printing against non-SIMD version
+ if (memcmp(frame_data,
+ kV2SIMDFrameData,
+ std::min(arraysize(kV2SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV2FrameData,
+ arraysize(kV2FrameData));
+ }
} else if (IsSpdy3()) {
- CompareFrame(kDescription, *frame, kV3FrameData, arraysize(kV3FrameData));
+ if (memcmp(frame_data,
+ kV3SIMDFrameData,
+ std::min(arraysize(kV3SIMDFrameData), frame->size())) != 0) {
+ CompareCharArraysWithHexError(kDescription,
+ frame_data,
+ frame->size(),
+ kV3FrameData,
+ arraysize(kV3FrameData));
+ }
} else {
// Deflate compression doesn't apply to HPACK.
}
diff --git a/third_party/zlib/BUILD.gn b/third_party/zlib/BUILD.gn
index 0f21450..18cf816 100644
--- a/third_party/zlib/BUILD.gn
+++ b/third_party/zlib/BUILD.gn
@@ -6,6 +6,15 @@ config("zlib_config") {
include_dirs = [ "." ]
}
+static_library("zlib_x86_simd") {
+ if (cpu_arch == "x86" || cpu_arch == "x64") {
+ sources = [ "crc_folding.c", "fill_window_sse.c" ]
+ cflags = [ "-msse2", "-msse4.2", "-mpclmul" ]
+ } else {
+ sources = [ "simd_stub.c"]
+ }
+}
+
static_library("zlib") {
if (!is_win) {
# Don't stomp on "libzlib" on other platforms.
@@ -36,16 +45,22 @@ static_library("zlib") {
"trees.c",
"trees.h",
"uncompr.c",
+ "x86.h",
"zconf.h",
"zlib.h",
"zutil.c",
"zutil.h",
]
+ if (cpu_arch == "x86" || cpu_arch == "x64") {
+ sources += [ "x86.c" ]
+ }
+
configs -= [ "//build/config/compiler:chromium_code" ]
configs += [ "//build/config/compiler:no_chromium_code" ]
public_configs = [ ":zlib_config" ]
+ deps = [ ":zlib_x86_simd" ]
}
static_library("minizip") {
diff --git a/third_party/zlib/README.chromium b/third_party/zlib/README.chromium
index c9e06ba..b90bcff 100644
--- a/third_party/zlib/README.chromium
+++ b/third_party/zlib/README.chromium
@@ -19,3 +19,19 @@ The 'google.patch' file represents our changes from the original zlib-1.2.5.
A more significant change to support mixed-source data compression. See
crbug.com/139744 and mixed-source.patch.
+
+Integrated Intel SIMD optimisations from: https://github.com/jtkukunas/zlib/
+and modified to accomodate the older version and existing changes in tree.
+
+This introduces new files: simd_stub.c, crc_folding.c, fill_window_sse.c and
+x86.[ch]. All but the latter are built into a static library to allow the
+compiler to use the desired instructions only when valid. The latter version is
+only built on x86 (32-bit and 64-bit) systems with it's functionality stubbed
+on the others.
+
+Other changes to accomodate:
+- fill_window() implementation calls into _sse() variant when supported and the
+ original implementation renamed to _c()
+- read_buf was moved from local to ZLIB_INTERNAL for fill_window_sse.c to use
+- INSERT_STRING macro was made a function, insert_string() and an implementation using CRC instruction added
+- some crc funcionality moved into crc32.c
diff --git a/third_party/zlib/crc32.c b/third_party/zlib/crc32.c
index 91be372..75f2290 100644
--- a/third_party/zlib/crc32.c
+++ b/third_party/zlib/crc32.c
@@ -26,6 +26,8 @@
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */
+#include "deflate.h"
+#include "x86.h"
#include "zutil.h" /* for STDC and FAR definitions */
#define local static
@@ -440,3 +442,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
{
return crc32_combine_(crc1, crc2, len2);
}
+
+ZLIB_INTERNAL void crc_reset(deflate_state *const s)
+{
+ if (x86_cpu_enable_simd) {
+ crc_fold_init(s);
+ return;
+ }
+ s->strm->adler = crc32(0L, Z_NULL, 0);
+}
+
+ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
+{
+ if (x86_cpu_enable_simd)
+ s->strm->adler = crc_fold_512to32(s);
+}
+
+ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
+{
+ if (x86_cpu_enable_simd) {
+ crc_fold_copy(strm->state, dst, strm->next_in, size);
+ return;
+ }
+ zmemcpy(dst, strm->next_in, size);
+ strm->adler = crc32(strm->adler, dst, size);
+}
diff --git a/third_party/zlib/crc_folding.c b/third_party/zlib/crc_folding.c
new file mode 100644
index 0000000..98c559c
--- /dev/null
+++ b/third_party/zlib/crc_folding.c
@@ -0,0 +1,493 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "deflate.h"
+
+#include <inttypes.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <wmmintrin.h>
+
+#define CRC_LOAD(s) \
+ do { \
+ __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
+ __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
+ __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
+ __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
+ __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
+
+#define CRC_SAVE(s) \
+ _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
+ _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
+ _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
+ _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
+ _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
+ } while (0);
+
+ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
+{
+ CRC_LOAD(s)
+
+ xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ xmm_crc1 = _mm_setzero_si128();
+ xmm_crc2 = _mm_setzero_si128();
+ xmm_crc3 = _mm_setzero_si128();
+
+ CRC_SAVE(s)
+
+ s->strm->adler = 0;
+}
+
+local void fold_1(deflate_state *const s,
+ __m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3)
+{
+ const __m128i xmm_fold4 = _mm_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596);
+
+ __m128i x_tmp3;
+ __m128 ps_crc0, ps_crc3, ps_res;
+
+ x_tmp3 = *xmm_crc3;
+
+ *xmm_crc3 = *xmm_crc0;
+ *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+ ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+ ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+ ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
+
+ *xmm_crc0 = *xmm_crc1;
+ *xmm_crc1 = *xmm_crc2;
+ *xmm_crc2 = x_tmp3;
+ *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+local void fold_2(deflate_state *const s,
+ __m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3)
+{
+ const __m128i xmm_fold4 = _mm_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596);
+
+ __m128i x_tmp3, x_tmp2;
+ __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
+
+ x_tmp3 = *xmm_crc3;
+ x_tmp2 = *xmm_crc2;
+
+ *xmm_crc3 = *xmm_crc1;
+ *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+ ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+ ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+ ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
+
+ *xmm_crc2 = *xmm_crc0;
+ *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+ ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+ ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+ ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
+
+ *xmm_crc0 = x_tmp2;
+ *xmm_crc1 = x_tmp3;
+ *xmm_crc2 = _mm_castps_si128(ps_res20);
+ *xmm_crc3 = _mm_castps_si128(ps_res31);
+}
+
+local void fold_3(deflate_state *const s,
+ __m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3)
+{
+ const __m128i xmm_fold4 = _mm_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596);
+
+ __m128i x_tmp3;
+ __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
+
+ x_tmp3 = *xmm_crc3;
+
+ *xmm_crc3 = *xmm_crc2;
+ *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+ ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+ ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+ ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
+
+ *xmm_crc2 = *xmm_crc1;
+ *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+ ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+ ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+ ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
+
+ *xmm_crc1 = *xmm_crc0;
+ *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+ ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+ ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+ ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
+
+ *xmm_crc0 = x_tmp3;
+ *xmm_crc1 = _mm_castps_si128(ps_res10);
+ *xmm_crc2 = _mm_castps_si128(ps_res21);
+ *xmm_crc3 = _mm_castps_si128(ps_res32);
+}
+
+local void fold_4(deflate_state *const s,
+ __m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3)
+{
+ const __m128i xmm_fold4 = _mm_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596);
+
+ __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
+ __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
+ __m128 ps_t0, ps_t1, ps_t2, ps_t3;
+ __m128 ps_res0, ps_res1, ps_res2, ps_res3;
+
+ x_tmp0 = *xmm_crc0;
+ x_tmp1 = *xmm_crc1;
+ x_tmp2 = *xmm_crc2;
+ x_tmp3 = *xmm_crc3;
+
+ *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+ x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
+ ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+ ps_t0 = _mm_castsi128_ps(x_tmp0);
+ ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
+
+ *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+ x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
+ ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+ ps_t1 = _mm_castsi128_ps(x_tmp1);
+ ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
+
+ *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+ x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
+ ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+ ps_t2 = _mm_castsi128_ps(x_tmp2);
+ ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
+
+ *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+ x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
+ ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+ ps_t3 = _mm_castsi128_ps(x_tmp3);
+ ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
+
+ *xmm_crc0 = _mm_castps_si128(ps_res0);
+ *xmm_crc1 = _mm_castps_si128(ps_res1);
+ *xmm_crc2 = _mm_castps_si128(ps_res2);
+ *xmm_crc3 = _mm_castps_si128(ps_res3);
+}
+
+local const unsigned zalign(32) pshufb_shf_table[60] = {
+ 0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
+ 0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
+ 0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
+ 0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
+ 0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
+ 0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
+ 0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
+ 0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
+ 0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
+ 0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
+ 0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
+ 0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
+ 0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
+ 0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
+ 0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
+};
+
+local void partial_fold(deflate_state *const s, const size_t len,
+ __m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3,
+ __m128i *xmm_crc_part)
+{
+
+ const __m128i xmm_fold4 = _mm_set_epi32(
+ 0x00000001, 0x54442bd4,
+ 0x00000001, 0xc6e41596);
+ const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
+
+ __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
+ __m128i xmm_a0_0, xmm_a0_1;
+ __m128 ps_crc3, psa0_0, psa0_1, ps_res;
+
+ xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
+ xmm_shr = xmm_shl;
+ xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
+
+ xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
+
+ *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
+ xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
+ *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
+
+ *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
+ xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
+ *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
+
+ *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
+ xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
+ *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
+
+ *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
+ *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
+ *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
+
+ xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
+ xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
+
+ ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+ psa0_0 = _mm_castsi128_ps(xmm_a0_0);
+ psa0_1 = _mm_castsi128_ps(xmm_a0_1);
+
+ ps_res = _mm_xor_ps(ps_crc3, psa0_0);
+ ps_res = _mm_xor_ps(ps_res, psa0_1);
+
+ *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
+ unsigned char *dst, const unsigned char *src, long len)
+{
+ unsigned long algn_diff;
+ __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+
+ CRC_LOAD(s)
+
+ if (len < 16) {
+ if (len == 0)
+ return;
+ goto partial;
+ }
+
+ algn_diff = 0 - (unsigned long)src & 0xF;
+ if (algn_diff) {
+ xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+ _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
+
+ dst += algn_diff;
+ src += algn_diff;
+ len -= algn_diff;
+
+ partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
+ &xmm_crc_part);
+ }
+
+ while ((len -= 64) >= 0) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+
+ fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+
+ xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+
+ src += 64;
+ dst += 64;
+ }
+
+ /*
+ * len = num bytes left - 64
+ */
+ if (len + 16 >= 0) {
+ len += 16;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+
+ fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+
+ if (len == 0)
+ goto done;
+
+ dst += 48;
+ src += 48;
+ } else if (len + 32 >= 0) {
+ len += 32;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+
+ fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+
+ if (len == 0)
+ goto done;
+
+ dst += 32;
+ src += 32;
+ } else if (len + 48 >= 0) {
+ len += 48;
+
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+
+ fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+
+ if (len == 0)
+ goto done;
+
+ dst += 16;
+ src += 16;
+ } else {
+ len += 64;
+ if (len == 0)
+ goto done;
+ }
+
+partial:
+
+#if defined(_MSC_VER)
+ /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
+ {
+ int32_t parts[4] = {0, 0, 0, 0};
+ memcpy(&parts, src, len);
+ xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
+ }
+#else
+ {
+ int64_t parts[2] = {0, 0};
+ memcpy(&parts, src, len);
+ xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
+ }
+#endif
+
+ _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
+ partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
+ &xmm_crc_part);
+done:
+ CRC_SAVE(s)
+}
+
+local const unsigned zalign(16) crc_k[] = {
+ 0xccaa009e, 0x00000000, /* rk1 */
+ 0x751997d0, 0x00000001, /* rk2 */
+ 0xccaa009e, 0x00000000, /* rk5 */
+ 0x63cd6124, 0x00000001, /* rk6 */
+ 0xf7011640, 0x00000001, /* rk7 */
+ 0xdb710640, 0x00000001 /* rk8 */
+};
+
+local const unsigned zalign(16) crc_mask[4] = {
+ 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
+};
+
+local const unsigned zalign(16) crc_mask2[4] = {
+ 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+};
+
+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
+{
+ const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
+ const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+
+ unsigned crc;
+ __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+ CRC_LOAD(s)
+
+ /*
+ * k1
+ */
+ crc_fold = _mm_load_si128((__m128i *)crc_k);
+
+ x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+ xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+
+ x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+ xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+
+ x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+ xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+
+ /*
+ * k5
+ */
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
+
+ xmm_crc0 = xmm_crc3;
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+ xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+ xmm_crc0 = xmm_crc3;
+ xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+ xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
+
+ /*
+ * k7
+ */
+ xmm_crc1 = xmm_crc3;
+ xmm_crc2 = xmm_crc3;
+ crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
+
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
+
+ xmm_crc2 = xmm_crc3;
+ xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+
+ crc = _mm_extract_epi32(xmm_crc3, 2);
+ return ~crc;
+ CRC_SAVE(s)
+}
diff --git a/third_party/zlib/deflate.c b/third_party/zlib/deflate.c
index 8043e5b..55ec215 100644
--- a/third_party/zlib/deflate.c
+++ b/third_party/zlib/deflate.c
@@ -49,7 +49,10 @@
/* @(#) $Id$ */
+#include <assert.h>
+
#include "deflate.h"
+#include "x86.h"
const char deflate_copyright[] =
" deflate 1.2.5 Copyright 1995-2010 Jean-loup Gailly and Mark Adler ";
@@ -85,7 +88,7 @@ local block_state deflate_huff OF((deflate_state *s, int flush));
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
-local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match, int clas));
@@ -98,6 +101,23 @@ local void check_match OF((deflate_state *s, IPos start, IPos match,
int length));
#endif
+/* For fill_window_sse.c to use */
+ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+
+/* From crc32.c */
+extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
+extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
+extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
+
+#ifdef _MSC_VER
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+/* Inline optimisation */
+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
+
/* ===========================================================================
* Local data
*/
@@ -164,7 +184,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
*/
#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
@@ -175,17 +194,28 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
+local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
+{
+ Pos ret;
+
+ UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
#ifdef FASTEST
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
+ ret = s->head[s->ins_h];
#else
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
+ ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
#endif
+ s->head[s->ins_h] = str;
+
+ return ret;
+}
+
+local INLINE Pos insert_string(deflate_state *const s, const Pos str)
+{
+ if (x86_cpu_enable_simd)
+ return insert_string_sse(s, str);
+ return insert_string_c(s, str);
+}
+
/* ===========================================================================
* Initialize the hash table (avoiding 64K overflow for 16 bit systems).
@@ -219,6 +249,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
const char *version;
int stream_size;
{
+ unsigned window_padding = 8;
deflate_state *s;
int wrap = 1;
static const char my_version[] = ZLIB_VERSION;
@@ -228,6 +259,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
* output size for (length,distance) codes is <= 24 bits.
*/
+ x86_check_features();
+
if (version == Z_NULL || version[0] != my_version[0] ||
stream_size != sizeof(z_stream)) {
return Z_VERSION_ERROR;
@@ -274,12 +307,17 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
s->w_size = 1 << s->w_bits;
s->w_mask = s->w_size - 1;
- s->hash_bits = memLevel + 7;
+ if (x86_cpu_enable_simd) {
+ s->hash_bits = 15;
+ } else {
+ s->hash_bits = memLevel + 7;
+ }
+
s->hash_size = 1 << s->hash_bits;
s->hash_mask = s->hash_size - 1;
s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
- s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
+ s->window = (Bytef *) ZALLOC(strm, s->w_size + window_padding, 2*sizeof(Byte));
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
s->class_bitmap = NULL;
@@ -347,7 +385,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
s->ins_h = s->window[0];
UPDATE_HASH(s, s->ins_h, s->window[1]);
for (n = 0; n <= length - MIN_MATCH; n++) {
- INSERT_STRING(s, n, hash_head);
+ insert_string(s, n);
}
if (hash_head) hash_head = 0; /* to make compiler happy */
return Z_OK;
@@ -613,7 +651,7 @@ int ZEXPORT deflate (strm, flush)
if (s->status == INIT_STATE) {
#ifdef GZIP
if (s->wrap == 2) {
- strm->adler = crc32(0L, Z_NULL, 0);
+ crc_reset(s);
put_byte(s, 31);
put_byte(s, 139);
put_byte(s, 8);
@@ -891,6 +929,7 @@ int ZEXPORT deflate (strm, flush)
/* Write the trailer */
#ifdef GZIP
if (s->wrap == 2) {
+ crc_finalize(s);
put_byte(s, (Byte)(strm->adler & 0xff));
put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
@@ -1013,7 +1052,7 @@ int ZEXPORT deflateCopy (dest, source)
* allocating a large strm->next_in buffer and copying from it.
* (See also flush_pending()).
*/
-local int read_buf(strm, buf, size)
+ZLIB_INTERNAL int read_buf(strm, buf, size)
z_streamp strm;
Bytef *buf;
unsigned size;
@@ -1025,15 +1064,17 @@ local int read_buf(strm, buf, size)
strm->avail_in -= len;
- if (strm->state->wrap == 1) {
- strm->adler = adler32(strm->adler, strm->next_in, len);
- }
#ifdef GZIP
- else if (strm->state->wrap == 2) {
- strm->adler = crc32(strm->adler, strm->next_in, len);
+ if (strm->state->wrap == 2) {
+ copy_with_crc(strm, buf, len);
}
+ else
#endif
- zmemcpy(buf, strm->next_in, len);
+ {
+ zmemcpy(buf, strm->next_in, len);
+ if (strm->state->wrap == 1)
+ strm->adler = adler32(strm->adler, buf, len);
+ }
strm->next_in += len;
strm->total_in += len;
@@ -1445,7 +1486,19 @@ local void check_match(s, start, match, length)
* performed for at least two bytes (required for the zip translate_eol
* option -- not supported here).
*/
-local void fill_window(s)
+local void fill_window_c(deflate_state *s);
+
+local void fill_window(deflate_state *s)
+{
+ if (x86_cpu_enable_simd) {
+ fill_window_sse(s);
+ return;
+ }
+
+ fill_window_c(s);
+}
+
+local void fill_window_c(s)
deflate_state *s;
{
register unsigned n, m;
@@ -1711,7 +1764,7 @@ local block_state deflate_fast(s, flush, clas)
*/
hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
/* Find the longest match, discarding those <= prev_length.
@@ -1742,7 +1795,7 @@ local block_state deflate_fast(s, flush, clas)
s->match_length--; /* string at strstart already in table */
do {
s->strstart++;
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
/* strstart never exceeds WSIZE-MAX_MATCH, so there are
* always MIN_MATCH bytes ahead.
*/
@@ -1821,7 +1874,7 @@ local block_state deflate_slow(s, flush, clas)
*/
hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
/* Find the longest match, discarding those <= prev_length.
@@ -1890,7 +1943,7 @@ local block_state deflate_slow(s, flush, clas)
s->prev_length -= 2;
do {
if (++s->strstart <= max_insert) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
} while (--s->prev_length != 0);
s->match_available = 0;
@@ -2031,3 +2084,37 @@ local block_state deflate_huff(s, flush)
FLUSH_BLOCK(s, flush == Z_FINISH);
return flush == Z_FINISH ? finish_done : block_done;
}
+
+/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
+ * use intrinsic without extra params
+ */
+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
+{
+ Pos ret;
+ unsigned *ip, val, h = 0;
+
+ ip = (unsigned *)&s->window[str];
+ val = *ip;
+
+ if (s->level >= 6)
+ val &= 0xFFFFFF;
+
+/* Windows clang should use inline asm */
+#if defined(_MSC_VER) && !defined(__clang__)
+ h = _mm_crc32_u32(h, val);
+#elif defined(__i386__) || defined(__amd64__)
+ __asm__ __volatile__ (
+ "crc32 %1,%0\n\t"
+ : "+r" (h)
+ : "r" (val)
+ );
+#else
+ /* This should never happen */
+ assert(0);
+#endif
+
+ ret = s->head[h & s->hash_mask];
+ s->head[h & s->hash_mask] = str;
+ s->prev[str & s->w_mask] = ret;
+ return ret;
+}
diff --git a/third_party/zlib/deflate.h b/third_party/zlib/deflate.h
index 2fe6fd6..d15f2b5 100644
--- a/third_party/zlib/deflate.h
+++ b/third_party/zlib/deflate.h
@@ -107,6 +107,8 @@ typedef struct internal_state {
Byte method; /* STORED (for zip only) or DEFLATED */
int last_flush; /* value of flush param for previous deflate call */
+ unsigned zalign(16) crc0[4 * 5];
+
/* used by deflate.c: */
uInt w_size; /* LZ77 window size (32K by default) */
@@ -344,4 +346,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
flush = _tr_tally(s, distance, length)
#endif
+/* Functions that are SIMD optimised on x86 */
+void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
+void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
+ unsigned char* dst,
+ const unsigned char* src,
+ long len);
+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
+
+void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
+
#endif /* DEFLATE_H */
diff --git a/third_party/zlib/fill_window_sse.c b/third_party/zlib/fill_window_sse.c
new file mode 100644
index 0000000..949ccce
--- /dev/null
+++ b/third_party/zlib/fill_window_sse.c
@@ -0,0 +1,175 @@
+/*
+ * Fill Window with SSE2-optimized hash shifting
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <immintrin.h>
+#include "deflate.h"
+
+#define UPDATE_HASH(s,h,i) \
+ {\
+ if (s->level < 6) { \
+ h = (3483 * (s->window[i]) +\
+ 23081* (s->window[i+1]) +\
+ 6954 * (s->window[i+2]) +\
+ 20947* (s->window[i+3])) & s->hash_mask;\
+ } else {\
+ h = (25881* (s->window[i]) +\
+ 24674* (s->window[i+1]) +\
+ 25811* (s->window[i+2])) & s->hash_mask;\
+ }\
+ }\
+
+extern int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+
+void fill_window_sse(deflate_state *s)
+{
+ const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+ register unsigned n;
+ register Posf *p;
+ unsigned more; /* Amount of free space at the end of the window. */
+ uInt wsize = s->w_size;
+
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+ do {
+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+ /* Deal with !@#$% 64K limit: */
+ if (sizeof(int) <= 2) {
+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+ more = wsize;
+
+ } else if (more == (unsigned)(-1)) {
+ /* Very unlikely, but possible on 16 bit machine if
+ * strstart == 0 && lookahead == 1 (input done a byte at time)
+ */
+ more--;
+ }
+ }
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+
+ zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= (long) wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ n = s->hash_size;
+ p = &s->head[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+
+ n = wsize;
+#ifndef FASTEST
+ p = &s->prev[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+#endif
+ more += wsize;
+ }
+ if (s->strm->avail_in == 0) break;
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead >= MIN_MATCH) {
+ uInt str = s->strstart;
+ s->ins_h = s->window[str];
+ if (str >= 1)
+ UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ ulg curr = s->strstart + (ulg)(s->lookahead);
+ ulg init;
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ zmemzero(s->window + curr, (unsigned)init);
+ s->high_water = curr + init;
+ }
+ else if (s->high_water < (ulg)curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = (ulg)curr + WIN_INIT - s->high_water;
+ if (init > s->window_size - s->high_water)
+ init = s->window_size - s->high_water;
+ zmemzero(s->window + s->high_water, (unsigned)init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
+}
diff --git a/third_party/zlib/simd_stub.c b/third_party/zlib/simd_stub.c
new file mode 100644
index 0000000..bb2ddc3
--- /dev/null
+++ b/third_party/zlib/simd_stub.c
@@ -0,0 +1,35 @@
+/* simd_stub.c -- stub implementations
+* Copyright (C) 2014 Intel Corporation
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/
+#include <assert.h>
+
+#include "deflate.h"
+#include "x86.h"
+
+int x86_cpu_enable_simd;
+
+void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
+ assert(0);
+}
+
+void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
+ unsigned char *dst,
+ const unsigned char *src,
+ long len) {
+ assert(0);
+}
+
+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
+ assert(0);
+ return 0;
+}
+
+void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
+{
+ assert(0);
+}
+
+void x86_check_features(void)
+{
+}
diff --git a/third_party/zlib/x86.c b/third_party/zlib/x86.c
new file mode 100644
index 0000000..35ec516
--- /dev/null
+++ b/third_party/zlib/x86.c
@@ -0,0 +1,112 @@
+/*
+ * x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ * Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "x86.h"
+
+int x86_cpu_enable_simd;
+
+#ifndef _MSC_VER
+#include <pthread.h>
+
+pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
+static void _x86_check_features(void);
+
+void x86_check_features(void)
+{
+ pthread_once(&cpu_check_inited_once, _x86_check_features);
+}
+
+static void _x86_check_features(void)
+{
+ int x86_cpu_has_sse2;
+ int x86_cpu_has_sse42;
+ int x86_cpu_has_pclmulqdq;
+ unsigned eax, ebx, ecx, edx;
+
+ eax = 1;
+#ifdef __i386__
+ __asm__ __volatile__ (
+ "xchg %%ebx, %1\n\t"
+ "cpuid\n\t"
+ "xchg %1, %%ebx\n\t"
+ : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
+ );
+#else
+ __asm__ __volatile__ (
+ "cpuid\n\t"
+ : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+ );
+#endif /* (__i386__) */
+
+ x86_cpu_has_sse2 = edx & 0x4000000;
+ x86_cpu_has_sse42 = ecx & 0x100000;
+ x86_cpu_has_pclmulqdq = ecx & 0x2;
+
+ x86_cpu_enable_simd = x86_cpu_has_sse2 &&
+ x86_cpu_has_sse42 &&
+ x86_cpu_has_pclmulqdq;
+}
+#else
+#include <intrin.h>
+#include <windows.h>
+#include <stdint.h>
+
+static volatile int32_t once_control = 0;
+static void _x86_check_features(void);
+static int fake_pthread_once(volatile int32_t *once_control,
+ void (*init_routine)(void));
+
+void x86_check_features(void)
+{
+ fake_pthread_once(&once_control, _x86_check_features);
+}
+
+/* Copied from "perftools_pthread_once" in tcmalloc */
+static int fake_pthread_once(volatile int32_t *once_control,
+ void (*init_routine)(void)) {
+ // Try for a fast path first. Note: this should be an acquire semantics read
+ // It is on x86 and x64, where Windows runs.
+ if (*once_control != 1) {
+ while (1) {
+ switch (InterlockedCompareExchange(once_control, 2, 0)) {
+ case 0:
+ init_routine();
+ InterlockedExchange(once_control, 1);
+ return 0;
+ case 1:
+ // The initializer has already been executed
+ return 0;
+ default:
+ // The initializer is being processed by another thread
+ SwitchToThread();
+ }
+ }
+ }
+ return 0;
+}
+
+static void _x86_check_features(void)
+{
+ int x86_cpu_has_sse2;
+ int x86_cpu_has_sse42;
+ int x86_cpu_has_pclmulqdq;
+ int regs[4];
+
+ __cpuid(regs, 1);
+
+ x86_cpu_has_sse2 = regs[3] & 0x4000000;
+ x86_cpu_has_sse42= regs[2] & 0x100000;
+ x86_cpu_has_pclmulqdq = regs[2] & 0x2;
+
+ x86_cpu_enable_simd = x86_cpu_has_sse2 &&
+ x86_cpu_has_sse42 &&
+ x86_cpu_has_pclmulqdq;
+}
+#endif /* _MSC_VER */
diff --git a/third_party/zlib/x86.h b/third_party/zlib/x86.h
new file mode 100644
index 0000000..ac3d180
--- /dev/null
+++ b/third_party/zlib/x86.h
@@ -0,0 +1,13 @@
+/* x86.h -- check for x86 CPU features
+* Copyright (C) 2013 Intel Corporation Jim Kukunas
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/
+
+#ifndef X86_H
+#define X86_H
+
+extern int x86_cpu_enable_simd;
+
+void x86_check_features(void);
+
+#endif /* X86_H */
diff --git a/third_party/zlib/zlib.gyp b/third_party/zlib/zlib.gyp
index aef41ac..8c5ae44 100644
--- a/third_party/zlib/zlib.gyp
+++ b/third_party/zlib/zlib.gyp
@@ -5,6 +5,22 @@
{
'targets': [
{
+ 'target_name' : 'zlib_x86_simd',
+ 'type': 'static_library',
+ 'conditions': [
+ # See http://crbug.com/420616 gyp on mac & ios doesn't apply cflags
+ ['OS!="ios" and OS!="mac" and (target_arch=="ia32" or target_arch=="x64")', {
+ 'cflags' : ["-msse2", "-msse4.2", "-mpclmul"],
+ 'sources' : [ 'crc_folding.c',
+ 'fill_window_sse.c']
+ }, {
+ 'sources' : [ 'simd_stub.c' ],
+ }], ['OS=="android"', {
+ 'toolsets': ['target', 'host'],
+ }],
+ ],
+ },
+ {
'target_name': 'zlib',
'type': 'static_library',
'sources': [
@@ -31,11 +47,15 @@
'trees.c',
'trees.h',
'uncompr.c',
+ 'x86.h',
'zconf.h',
'zlib.h',
'zutil.c',
'zutil.h',
],
+ 'dependencies' : [
+ 'zlib_x86_simd'
+ ],
'include_dirs': [
'.',
],
@@ -45,6 +65,9 @@
],
},
'conditions': [
+ ['OS!="ios" and OS!="mac" and (target_arch=="ia32" or target_arch=="x64")', {
+ 'sources' : [ 'x86.c', ],
+ }],
['OS!="win"', {
'product_name': 'chrome_zlib',
}], ['OS=="android"', {
diff --git a/third_party/zlib/zutil.h b/third_party/zlib/zutil.h
index 39cf373..3c8326f 100644
--- a/third_party/zlib/zutil.h
+++ b/third_party/zlib/zutil.h
@@ -142,6 +142,12 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
# define OS_CODE 0x0a
#endif
+#ifdef _MSC_VER
+#define zalign(x) __declspec(align(x))
+#else
+#define zalign(x) __attribute__((aligned((x))))
+#endif
+
#ifdef WIN32
# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */
# define OS_CODE 0x0b
diff --git a/tools/msan/blacklist.txt b/tools/msan/blacklist.txt
index 96f5487..07b31c1 100644
--- a/tools/msan/blacklist.txt
+++ b/tools/msan/blacklist.txt
@@ -10,6 +10,9 @@ fun:ff_get_cpu_flags_x86
# Uninit in zlib. http://crbug.com/116277
fun:*MOZ_Z_deflate*
+# Uninit in zlib with SIMD intrinsic http://crbug.com/426868
+fun:crc_fold512_to_32
+
# Uninit in OSMesa. http://crbug.com/347967
fun:unpack_RGBA8888
fun:unpack_RGB888