2 files changed, 209 insertions, 1 deletions
diff --git a/third_party/qcms/README.chromium b/third_party/qcms/README.chromium
index 2c3912c..5d4178d 100644
--- a/third_party/qcms/README.chromium
+++ b/third_party/qcms/README.chromium
@@ -93,6 +93,8 @@ The following changes have been made since qcms was imported:
    - https://bugzilla.mozilla.org/show_bug.cgi?id=1163740
  - Add SSE2 code for qcms_transform_data_rgba_out_lut_sse2
    - https://code.google.com/p/chromium/issues/detail?id=506607
+ - Turbo charge SSE2 qcms_transform_data_rgba_out_lut_sse2
+   - https://code.google.com/p/chromium/issues/detail?id=506607
 
 For the Chromium changes, since the import, in a patch format run:
   git diff b8456f38 src
diff --git a/third_party/qcms/src/transform-sse2.c b/third_party/qcms/src/transform-sse2.c
index 65129e5..29717c9 100644
--- a/third_party/qcms/src/transform-sse2.c
+++ b/third_party/qcms/src/transform-sse2.c
@@ -275,5 +275,211 @@ void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
                                               size_t length,
                                               qcms_format_type output_format)
 {
-    // FIXME: implement.
+    const int r_out = output_format.r;
+    const int b_out = output_format.b;
+
+    size_t i;
+
+    const int xy_len = 1;
+    const int x_len = transform->grid_size;
+    const int len = x_len * x_len;
+
+    const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
+    const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));
+
+    const __m128 __255 = _mm_set1_ps(255.0f);
+    const __m128 __one = _mm_set1_ps(1.0f);
+    const __m128 __000 = _mm_setzero_ps();
+
+    const float* r_table = transform->r_clut;
+    const float* g_table = transform->g_clut;
+    const float* b_table = transform->b_clut;
+
+    int i3, i2, i1, i0;
+
+    __m128 c3;
+    __m128 c2;
+    __m128 c1;
+    __m128 c0;
+
+    __m128 in;
+
+    __m128 xyz_r;
+    __m128 xyz_0;
+    __m128 xyz_n;
+
+    ALIGN float xyz_r_f[4];
+    ALIGN int   xyz_0_i[4];
+    ALIGN int   xyz_n_i[4];
+
+    __m128i result;
+
+#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
+
+    for (i = 0; i < length; ++i) {
+        // compute input point in cube lattice (grid) co-ordinates
+        in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);
+
+        // floor: convert to int (truncate), convert back to float
+        xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));
+
+        // ceil: where in is greater than xyz_0 = floor(in), add 1
+        xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));
+
+        // compute the input point relative to the sub-cube origin
+        xyz_r = _mm_sub_ps(in, xyz_0);
+
+#define rx (xyz_r_f[1])
+#define ry (xyz_r_f[2])
+#define rz (xyz_r_f[3])
+
+        _mm_store_ps(xyz_r_f, xyz_r);
+
+#define x0 (xyz_0_i[1])
+#define y0 (xyz_0_i[2])
+#define z0 (xyz_0_i[3])
+
+        xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
+        _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));
+
+#define xn (xyz_n_i[1])
+#define yn (xyz_n_i[2])
+#define zn (xyz_n_i[3])
+
+        xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
+        _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));
+
+        dest[3] = src[3];
+        src += 4;
+
+#define SET_I0_AND_PREFETCH_CLUT() \
+        _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)
+
+#if !defined(_MSC_VER)
+        SET_I0_AND_PREFETCH_CLUT();
+#endif
+
+#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
+        c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
+        c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
+        c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
+        c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
+
+        if (rx >= ry) {
+
+#if defined(_MSC_VER)
+            SET_I0_AND_PREFETCH_CLUT();
+#endif
+            if (ry >= rz) {         // rx >= ry && ry >= rz
+
+                i3 = yn + (i1 = xn);
+                i1 += i0 - x0;
+                i2 = i3 + z0;
+                i3 += zn;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c3 = _mm_sub_ps(c3, c2);
+                c2 = _mm_sub_ps(c2, c1);
+                c1 = _mm_sub_ps(c1, c0);
+
+            } else if (rx >= rz) {  // rx >= rz && rz >= ry
+
+                i3 = zn + (i1 = xn);
+                i1 += i0 - x0;
+                i2 = i3 + yn;
+                i3 += y0;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c2 = _mm_sub_ps(c2, c3);
+                c3 = _mm_sub_ps(c3, c1);
+                c1 = _mm_sub_ps(c1, c0);
+
+            } else {                // rz > rx && rx >= ry
+
+                i2 = xn + (i3 = zn);
+                i3 += i0 - z0;
+                i1 = i2 + y0;
+                i2 += yn;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c2 = _mm_sub_ps(c2, c1);
+                c1 = _mm_sub_ps(c1, c3);
+                c3 = _mm_sub_ps(c3, c0);
+            }
+        } else {
+
+#if defined(_MSC_VER)
+            SET_I0_AND_PREFETCH_CLUT();
+#endif
+            if (rx >= rz) {         // ry > rx && rx >= rz
+
+                i3 = xn + (i2 = yn);
+                i2 += i0 - y0;
+                i1 = i3 + z0;
+                i3 += zn;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c3 = _mm_sub_ps(c3, c1);
+                c1 = _mm_sub_ps(c1, c2);
+                c2 = _mm_sub_ps(c2, c0);
+
+            } else if (ry >= rz) {  // ry >= rz && rz > rx
+
+                i3 = zn + (i2 = yn);
+                i2 += i0 - y0;
+                i1 = i3 + xn;
+                i3 += x0;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c1 = _mm_sub_ps(c1, c3);
+                c3 = _mm_sub_ps(c3, c2);
+                c2 = _mm_sub_ps(c2, c0);
+
+            } else {                // rz > ry && ry > rx
+
+                i2 = yn + (i3 = zn);
+                i3 += i0 - z0;
+                i1 = i2 + xn;
+                i2 += x0;
+
+                TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+                c1 = _mm_sub_ps(c1, c2);
+                c2 = _mm_sub_ps(c2, c3);
+                c3 = _mm_sub_ps(c3, c0);
+            }
+        }
+
+        // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz
+
+        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
+        c1 = _mm_mul_ps(c1, in);
+        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
+        c2 = _mm_mul_ps(c2, in);
+        in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
+        c3 = _mm_mul_ps(c3, in);
+
+        in = _mm_add_ps(c3, c2);
+        in = _mm_add_ps(in, c1);
+        in = _mm_add_ps(in, c0);
+
+        // clamp to [0.0..1.0] and scale by 255
+
+        in = _mm_max_ps(in, __000);
+        in = _mm_min_ps(in, __one);
+        in = _mm_mul_ps(in, __255);
+
+        result = _mm_cvtps_epi32(in); // convert to int (rounding)
+
+        dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
+        dest[1]     = (unsigned char) _mm_extract_epi16(result, 4);
+        dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);
+
+        dest += 4;
+    }
 }