author     noel <noel@chromium.org>              2015-07-23 10:20:56 -0700
committer  Commit bot <commit-bot@chromium.org>  2015-07-23 17:21:28 +0000
commit     4cc0a0b8600bb9db62271a2b8729bf26cb9d5b69 (patch)
tree       cf6c8ca743da7db1141598f13af569d9858e4d5f
parent     7a0c6fefa3646f26e4c81c2d5b2f259a48afecd6 (diff)
[qcms] Turbo charge SSE2 qcms_transform_data_rgba_out_lut
Reorganize the CLUT table accesses to permit horizontal sums used to compute the color-corrected output at the end of each loop. This eliminates 9 SSE instructions from each loop compared to our initial attempt at tetrahedral SSE.

The speed increase is significant, especially on Win32 MSVC 2013, which prompted rewriting the tetrahedron selection code as ifs, not case statements, since MSVC produces superior code with ifs compared to modern gcc and clang.

The implementation is bit-exact with the software path: the test harness was run accepting no differences, and none were found.

Using a 2000 x 2000 (random) test image, the SSE code provides a 3.8x speed increase over the software case in MSVC 2013. On an Intel Core i7 CPU @ 2.90GHz running Win7, the test image takes 418ms on average in software; the new SSE code path takes 108ms, and is 10% faster than the gcc 4.9.3 (cygwin) build.

BUG=506607

Review URL: https://codereview.chromium.org/1251373002

Cr-Commit-Position: refs/heads/master@{#340107}
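For context, a scalar rendering of the tetrahedral interpolation that the SSE2 path implements is sketched below. This is a minimal, hypothetical illustration, not the qcms source: the helper name tetra_interp is an assumption, and the CLUT is assumed to hold grid^3 interleaved RGB float triples indexed as ((x * grid + y) * grid + z) * 3, matching the strides the diff uses (len = grid * grid, x_len = grid, xy_len = 1).

    /* Hypothetical scalar sketch of tetrahedral CLUT interpolation. */
    static void tetra_interp(const float *clut, int grid,
                             const unsigned char in[3], float out[3])
    {
        const float scale = (1.0f / 255.0f) * (grid - 1);
        const float fx = in[0] * scale, fy = in[1] * scale, fz = in[2] * scale;
        const int x0 = (int)fx, y0 = (int)fy, z0 = (int)fz; /* floor */
        const int x1 = fx > x0 ? x0 + 1 : x0;               /* ceil */
        const int y1 = fy > y0 ? y0 + 1 : y0;
        const int z1 = fz > z0 ? z0 + 1 : z0;
        const float rx = fx - x0, ry = fy - y0, rz = fz - z0;
        int ch;

    #define V(x, y, z, c) clut[(((x) * grid + (y)) * grid + (z)) * 3 + (c)]

        for (ch = 0; ch < 3; ++ch) {
            const float c0 = V(x0, y0, z0, ch);
            float c1, c2, c3;
            /* pick the tetrahedron containing (rx, ry, rz) and form the
             * vertex differences weighted by rx, ry, rz below */
            if (rx >= ry) {
                if (ry >= rz) {        /* rx >= ry >= rz */
                    c1 = V(x1, y0, z0, ch) - c0;
                    c2 = V(x1, y1, z0, ch) - V(x1, y0, z0, ch);
                    c3 = V(x1, y1, z1, ch) - V(x1, y1, z0, ch);
                } else if (rx >= rz) { /* rx >= rz > ry */
                    c1 = V(x1, y0, z0, ch) - c0;
                    c2 = V(x1, y1, z1, ch) - V(x1, y0, z1, ch);
                    c3 = V(x1, y0, z1, ch) - V(x1, y0, z0, ch);
                } else {               /* rz > rx >= ry */
                    c1 = V(x1, y0, z1, ch) - V(x0, y0, z1, ch);
                    c2 = V(x1, y1, z1, ch) - V(x1, y0, z1, ch);
                    c3 = V(x0, y0, z1, ch) - c0;
                }
            } else {
                if (rx >= rz) {        /* ry > rx >= rz */
                    c1 = V(x1, y1, z0, ch) - V(x0, y1, z0, ch);
                    c2 = V(x0, y1, z0, ch) - c0;
                    c3 = V(x1, y1, z1, ch) - V(x1, y1, z0, ch);
                } else if (ry >= rz) { /* ry >= rz > rx */
                    c1 = V(x1, y1, z1, ch) - V(x0, y1, z1, ch);
                    c2 = V(x0, y1, z0, ch) - c0;
                    c3 = V(x0, y1, z1, ch) - V(x0, y1, z0, ch);
                } else {               /* rz > ry > rx */
                    c1 = V(x1, y1, z1, ch) - V(x0, y1, z1, ch);
                    c2 = V(x0, y1, z1, ch) - V(x0, y0, z1, ch);
                    c3 = V(x0, y0, z1, ch) - c0;
                }
            }
            out[ch] = c0 + c1 * rx + c2 * ry + c3 * rz;
        }
    #undef V
    }

The SSE2 version below computes all three channels at once: each vertex color sits in one register, the differences become the c1/c2/c3 registers, and the final line becomes three multiplies and three adds with no horizontal reduction.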
-rw-r--r--  third_party/qcms/README.chromium       |   2
-rw-r--r--  third_party/qcms/src/transform-sse2.c  | 208
2 files changed, 209 insertions(+), 1 deletion(-)
diff --git a/third_party/qcms/README.chromium b/third_party/qcms/README.chromium
index 2c3912c..5d4178d 100644
--- a/third_party/qcms/README.chromium
+++ b/third_party/qcms/README.chromium
@@ -93,6 +93,8 @@ The following changes have been made since qcms was imported:
- https://bugzilla.mozilla.org/show_bug.cgi?id=1163740
- Add SSE2 code for qcms_transform_data_rgba_out_lut_sse2
- https://code.google.com/p/chromium/issues/detail?id=506607
+ - Turbo charge SSE2 qcms_transform_data_rgba_out_lut_sse2
+ - https://code.google.com/p/chromium/issues/detail?id=506607
For the Chromium changes, since the import, in a patch format run:
git diff b8456f38 src
diff --git a/third_party/qcms/src/transform-sse2.c b/third_party/qcms/src/transform-sse2.c
index 65129e5..29717c9 100644
--- a/third_party/qcms/src/transform-sse2.c
+++ b/third_party/qcms/src/transform-sse2.c
@@ -275,5 +275,211 @@ void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
size_t length,
qcms_format_type output_format)
{
- // FIXME: implement.
+ const int r_out = output_format.r;
+ const int b_out = output_format.b;
+
+ size_t i;
+
+ const int xy_len = 1;
+ const int x_len = transform->grid_size;
+ const int len = x_len * x_len;
+
+ const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
+ const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));
+
+ const __m128 __255 = _mm_set1_ps(255.0f);
+ const __m128 __one = _mm_set1_ps(1.0f);
+ const __m128 __000 = _mm_setzero_ps();
+
+ const float* r_table = transform->r_clut;
+ const float* g_table = transform->g_clut;
+ const float* b_table = transform->b_clut;
+
+ int i3, i2, i1, i0;
+
+ __m128 c3;
+ __m128 c2;
+ __m128 c1;
+ __m128 c0;
+
+ __m128 in;
+
+ __m128 xyz_r;
+ __m128 xyz_0;
+ __m128 xyz_n;
+
+ ALIGN float xyz_r_f[4];
+ ALIGN int xyz_0_i[4];
+ ALIGN int xyz_n_i[4];
+
+ __m128i result;
+
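+ // place r, g, b in float lanes 1, 2, 3 (lane 0 unused); the lane order
+ // matches the per-channel strides packed into __clut_stride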
+#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
+
+ for (i = 0; i < length; ++i) {
+ // compute input point in cube lattice (grid) co-ordinates
+ in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);
+
+ // floor: convert to int (truncate), convert back to float
+ xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));
+
+ // ceil: where in is greater than xyz_0 = floor(in), add 1
+ xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));
+
+ // compute the input point relative to the sub-cube origin
+ xyz_r = _mm_sub_ps(in, xyz_0);
+
+#define rx (xyz_r_f[1])
+#define ry (xyz_r_f[2])
+#define rz (xyz_r_f[3])
+
+ _mm_store_ps(xyz_r_f, xyz_r);
+
+#define x0 (xyz_0_i[1])
+#define y0 (xyz_0_i[2])
+#define z0 (xyz_0_i[3])
+
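+ // scale the floor co-ordinates by the per-axis strides to get CLUT
+ // element offsets (the ceil co-ordinates are scaled the same way below)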
+ xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
+ _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));
+
+#define xn (xyz_n_i[1])
+#define yn (xyz_n_i[2])
+#define zn (xyz_n_i[3])
+
+ xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
+ _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));
+
+ dest[3] = src[3];
+ src += 4;
+
+#define SET_I0_AND_PREFETCH_CLUT() \
+ _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)
+
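+ // the prefetch of the sub-cube origin is issued here for gcc/clang, but
+ // deferred until after the first branch for MSVC (see _MSC_VER blocks)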
+#if !defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+
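+ // gather the four tetrahedron vertex colors; each register holds one
+ // vertex with r, g, b in lanes 1, 2, 3 to match TETRA_SRC_RGB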
+#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
+ c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
+ c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
+ c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
+ c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
+
+ if (rx >= ry) {
+
+#if defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+ if (ry >= rz) { // rx >= ry && ry >= rz
+
+ i3 = yn + (i1 = xn);
+ i1 += i0 - x0;
+ i2 = i3 + z0;
+ i3 += zn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c3 = _mm_sub_ps(c3, c2);
+ c2 = _mm_sub_ps(c2, c1);
+ c1 = _mm_sub_ps(c1, c0);
+
+ } else if (rx >= rz) { // rx >= rz && rz > ry
+
+ i3 = zn + (i1 = xn);
+ i1 += i0 - x0;
+ i2 = i3 + yn;
+ i3 += y0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c2 = _mm_sub_ps(c2, c3);
+ c3 = _mm_sub_ps(c3, c1);
+ c1 = _mm_sub_ps(c1, c0);
+
+ } else { // rz > rx && rx >= ry
+
+ i2 = xn + (i3 = zn);
+ i3 += i0 - z0;
+ i1 = i2 + y0;
+ i2 += yn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c2 = _mm_sub_ps(c2, c1);
+ c1 = _mm_sub_ps(c1, c3);
+ c3 = _mm_sub_ps(c3, c0);
+ }
+ } else {
+
+#if defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+ if (rx >= rz) { // ry > rx && rx >= rz
+
+ i3 = xn + (i2 = yn);
+ i2 += i0 - y0;
+ i1 = i3 + z0;
+ i3 += zn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c3 = _mm_sub_ps(c3, c1);
+ c1 = _mm_sub_ps(c1, c2);
+ c2 = _mm_sub_ps(c2, c0);
+
+ } else if (ry >= rz) { // ry >= rz && rz > rx
+
+ i3 = zn + (i2 = yn);
+ i2 += i0 - y0;
+ i1 = i3 + xn;
+ i3 += x0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c1 = _mm_sub_ps(c1, c3);
+ c3 = _mm_sub_ps(c3, c2);
+ c2 = _mm_sub_ps(c2, c0);
+
+ } else { // rz > ry && ry > rx
+
+ i2 = yn + (i3 = zn);
+ i3 += i0 - z0;
+ i1 = i2 + xn;
+ i2 += x0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c1 = _mm_sub_ps(c1, c2);
+ c2 = _mm_sub_ps(c2, c3);
+ c3 = _mm_sub_ps(c3, c0);
+ }
+ }
+
+ // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz
+
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
+ c1 = _mm_mul_ps(c1, in);
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
+ c2 = _mm_mul_ps(c2, in);
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
+ c3 = _mm_mul_ps(c3, in);
+
+ in = _mm_add_ps(c3, c2);
+ in = _mm_add_ps(in, c1);
+ in = _mm_add_ps(in, c0);
+
+ // clamp to [0.0..1.0] and scale by 255
+
+ in = _mm_max_ps(in, __000);
+ in = _mm_min_ps(in, __one);
+ in = _mm_mul_ps(in, __255);
+
+ result = _mm_cvtps_epi32(in); // convert to int (rounding)
+
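+ // each 32-bit lane is in [0, 255], so the low word of lane i (epi16
+ // index 2*i) yields the byte result; lane 0 is unused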
+ dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
+ dest[1] = (unsigned char) _mm_extract_epi16(result, 4);
+ dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);
+
+ dest += 4;
+ }
}
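The branch-free floor/ceil at the top of the loop is worth calling out on its own: truncation to int and back gives floor() for non-negative inputs, and adding 1.0 wherever the input exceeds its floor gives ceil(). A standalone sketch of the trick (a hypothetical demo program, SSE2 only, valid for the non-negative inputs used here) is:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m128 in  = _mm_set_ps(2.75f, 2.0f, 0.25f, 0.0f);
        __m128 one = _mm_set1_ps(1.0f);

        /* floor: truncate to int, convert back to float */
        __m128 lo = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));

        /* ceil: add 1.0 in lanes where in > floor(in); the compare mask
         * ANDed with 1.0f yields 1.0f (true) or 0.0f (false) per lane */
        __m128 hi = _mm_add_ps(lo, _mm_and_ps(_mm_cmpgt_ps(in, lo), one));

        float lo_f[4], hi_f[4];
        int i;
        _mm_storeu_ps(lo_f, lo);
        _mm_storeu_ps(hi_f, hi);

        for (i = 0; i < 4; ++i)
            printf("lane %d: floor %.0f ceil %.0f\n", i, lo_f[i], hi_f[i]);
        return 0;
    }

Unlike a round-to-nearest-then-fix-up sequence, this needs no rounding-mode changes, which is one reason the floor and ceil lattice co-ordinates come out of the loop header in just a handful of instructions.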