-rw-r--r--  third_party/qcms/README.chromium       |   2
-rw-r--r--  third_party/qcms/src/transform-sse2.c  | 208
2 files changed, 209 insertions(+), 1 deletion(-)
diff --git a/third_party/qcms/README.chromium b/third_party/qcms/README.chromium
index 2c3912c..5d4178d 100644
--- a/third_party/qcms/README.chromium
+++ b/third_party/qcms/README.chromium
@@ -93,6 +93,8 @@ The following changes have been made since qcms was imported:
- https://bugzilla.mozilla.org/show_bug.cgi?id=1163740
- Add SSE2 code for qcms_transform_data_rgba_out_lut_sse2
- https://code.google.com/p/chromium/issues/detail?id=506607
+ - Turbo charge SSE2 qcms_transform_data_rgba_out_lut_sse2
+ - https://code.google.com/p/chromium/issues/detail?id=506607
For the Chromium changes, since the import, in a patch format run:
git diff b8456f38 src
diff --git a/third_party/qcms/src/transform-sse2.c b/third_party/qcms/src/transform-sse2.c
index 65129e5..29717c9 100644
--- a/third_party/qcms/src/transform-sse2.c
+++ b/third_party/qcms/src/transform-sse2.c
@@ -275,5 +275,211 @@ void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
size_t length,
qcms_format_type output_format)
{
- // FIXME: implement.
+ const int r_out = output_format.r;
+ const int b_out = output_format.b;
+
+ size_t i;
+
+ const int xy_len = 1;
+ const int x_len = transform->grid_size;
+ const int len = x_len * x_len;
+
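+ // Vector lanes 1..3 carry the per-channel CLUT strides (in floats):
+ // lane 1 = 3 * len (red), lane 2 = 3 * x_len (green), lane 3 = 3 * xy_len (blue);
+ // lane 0 is unused, matching the TETRA_SRC_RGB lane layout below.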
+ const __m128 __clut_stride = _mm_set_ps((float)(3 * xy_len), (float)(3 * x_len), (float)(3 * len), 0);
+ const __m128 __grid_scaled = _mm_set1_ps((1.0f / 255.0f) * (transform->grid_size - 1));
+
+ const __m128 __255 = _mm_set1_ps(255.0f);
+ const __m128 __one = _mm_set1_ps(1.0f);
+ const __m128 __000 = _mm_setzero_ps();
+
+ const float* r_table = transform->r_clut;
+ const float* g_table = transform->g_clut;
+ const float* b_table = transform->b_clut;
+
+ int i3, i2, i1, i0;
+
+ __m128 c3;
+ __m128 c2;
+ __m128 c1;
+ __m128 c0;
+
+ __m128 in;
+
+ __m128 xyz_r;
+ __m128 xyz_0;
+ __m128 xyz_n;
+
+ ALIGN float xyz_r_f[4];
+ ALIGN int xyz_0_i[4];
+ ALIGN int xyz_n_i[4];
+
+ __m128i result;
+
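+ // Load one source pixel into lanes 1..3 (r, g, b); lane 0 stays zero.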
+#define TETRA_SRC_RGB(r, g, b) _mm_set_ps((float)b, (float)g, (float)r, 0.f)
+
+ for (i = 0; i < length; ++i) {
+ // compute input point in cube lattice (grid) co-ordinates
+ in = _mm_mul_ps(TETRA_SRC_RGB(src[0], src[1], src[2]), __grid_scaled);
+
+ // floor: convert to int (truncate), convert back to float
+ xyz_0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(in));
+
+ // ceil: where in is greater than xyz_0 = floor(in), add 1
+ xyz_n = _mm_add_ps(xyz_0, _mm_and_ps(_mm_cmpgt_ps(in, xyz_0), __one));
+
+ // compute the input point relative to the sub-cube origin
+ xyz_r = _mm_sub_ps(in, xyz_0);
+
+#define rx (xyz_r_f[1])
+#define ry (xyz_r_f[2])
+#define rz (xyz_r_f[3])
+
+ _mm_store_ps(xyz_r_f, xyz_r);
+
+#define x0 (xyz_0_i[1])
+#define y0 (xyz_0_i[2])
+#define z0 (xyz_0_i[3])
+
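+ // Scale the floor/ceil lattice coordinates by the per-channel strides and
+ // store them as integers: x0/y0/z0 and xn/yn/zn below are therefore already
+ // CLUT offsets, not raw grid coordinates.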
+ xyz_0 = _mm_mul_ps(xyz_0, __clut_stride);
+ _mm_store_si128((__m128i*) xyz_0_i, _mm_cvtps_epi32(xyz_0));
+
+#define xn (xyz_n_i[1])
+#define yn (xyz_n_i[2])
+#define zn (xyz_n_i[3])
+
+ xyz_n = _mm_mul_ps(xyz_n, __clut_stride);
+ _mm_store_si128((__m128i*) xyz_n_i, _mm_cvtps_epi32(xyz_n));
+
+ dest[3] = src[3];
+ src += 4;
+
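+ // i0 is the CLUT offset of the sub-cube origin vertex (x0, y0, z0); compute it
+ // and prefetch that part of the red table before the branchy tetrahedron
+ // selection below touches it.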
+#define SET_I0_AND_PREFETCH_CLUT() \
+ _mm_prefetch((char*)&(r_table[i0 = x0 + y0 + z0]), _MM_HINT_T0)
+
+#if !defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+
+#define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
+ c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
+ c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
+ c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
+ c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
+
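+ // The ordering of (rx, ry, rz) selects one of the six tetrahedra that partition
+ // the sub-cube; i1..i3 index the remaining vertices of that tetrahedron, and
+ // c1..c3 become the edge deltas weighted by rx, ry, rz below.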
+ if (rx >= ry) {
+
+#if defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+ if (ry >= rz) { // rx >= ry && ry >= rz
+
+ i3 = yn + (i1 = xn);
+ i1 += i0 - x0;
+ i2 = i3 + z0;
+ i3 += zn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c3 = _mm_sub_ps(c3, c2);
+ c2 = _mm_sub_ps(c2, c1);
+ c1 = _mm_sub_ps(c1, c0);
+
+ } else if (rx >= rz) { // rx >= rz && rz >= ry
+
+ i3 = zn + (i1 = xn);
+ i1 += i0 - x0;
+ i2 = i3 + yn;
+ i3 += y0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c2 = _mm_sub_ps(c2, c3);
+ c3 = _mm_sub_ps(c3, c1);
+ c1 = _mm_sub_ps(c1, c0);
+
+ } else { // rz > rx && rx >= ry
+
+ i2 = xn + (i3 = zn);
+ i3 += i0 - z0;
+ i1 = i2 + y0;
+ i2 += yn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c2 = _mm_sub_ps(c2, c1);
+ c1 = _mm_sub_ps(c1, c3);
+ c3 = _mm_sub_ps(c3, c0);
+ }
+ } else {
+
+#if defined(_MSC_VER)
+ SET_I0_AND_PREFETCH_CLUT();
+#endif
+ if (rx >= rz) { // ry > rx && rx >= rz
+
+ i3 = xn + (i2 = yn);
+ i2 += i0 - y0;
+ i1 = i3 + z0;
+ i3 += zn;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c3 = _mm_sub_ps(c3, c1);
+ c1 = _mm_sub_ps(c1, c2);
+ c2 = _mm_sub_ps(c2, c0);
+
+ } else if (ry >= rz) { // ry >= rz && rz > rx
+
+ i3 = zn + (i2 = yn);
+ i2 += i0 - y0;
+ i1 = i3 + xn;
+ i3 += x0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c1 = _mm_sub_ps(c1, c3);
+ c3 = _mm_sub_ps(c3, c2);
+ c2 = _mm_sub_ps(c2, c0);
+
+ } else { // rz > ry && ry > rx
+
+ i2 = yn + (i3 = zn);
+ i3 += i0 - z0;
+ i1 = i2 + xn;
+ i2 += x0;
+
+ TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
+
+ c1 = _mm_sub_ps(c1, c2);
+ c2 = _mm_sub_ps(c2, c3);
+ c3 = _mm_sub_ps(c3, c0);
+ }
+ }
+
+ // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz
+
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(1, 1, 1, 1));
+ c1 = _mm_mul_ps(c1, in);
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(2, 2, 2, 2));
+ c2 = _mm_mul_ps(c2, in);
+ in = _mm_shuffle_ps(xyz_r, xyz_r, _MM_SHUFFLE(3, 3, 3, 3));
+ c3 = _mm_mul_ps(c3, in);
+
+ in = _mm_add_ps(c3, c2);
+ in = _mm_add_ps(in, c1);
+ in = _mm_add_ps(in, c0);
+
+ // clamp to [0.0..1.0] and scale by 255
+
+ in = _mm_max_ps(in, __000);
+ in = _mm_min_ps(in, __one);
+ in = _mm_mul_ps(in, __255);
+
+ result = _mm_cvtps_epi32(in); // convert to int (rounding)
+
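+ // Each 32-bit lane of result is in [0, 255], so its low 16 bits sit at the
+ // even 16-bit positions 2, 4, 6 for the r, g, b lanes respectively.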
+ dest[r_out] = (unsigned char) _mm_extract_epi16(result, 2);
+ dest[1] = (unsigned char) _mm_extract_epi16(result, 4);
+ dest[b_out] = (unsigned char) _mm_extract_epi16(result, 6);
+
+ dest += 4;
+ }
}
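
For reference, a minimal scalar sketch of the tetrahedral interpolation that the SSE2 loop above vectorizes, for a single output channel. It assumes the same CLUT layout as the code (a grid_size^3 lattice of 3-float entries, with the red index having the largest stride); the function name tetra_sample_channel and its signature are illustrative, not part of qcms.

static float tetra_sample_channel(const float *table, int grid_size,
                                  float r, float g, float b) /* r, g, b in [0, 1] */
{
    const int xy_len = 1;                 /* blue stride, in CLUT entries */
    const int x_len  = grid_size;         /* green stride */
    const int len    = x_len * x_len;     /* red stride */

    float x = r * (grid_size - 1), y = g * (grid_size - 1), z = b * (grid_size - 1);
    int x0 = (int)x, y0 = (int)y, z0 = (int)z;       /* floor (inputs are >= 0) */
    int xn = x > x0 ? x0 + 1 : x0;                   /* ceil, clamped at the edge */
    int yn = y > y0 ? y0 + 1 : y0;
    int zn = z > z0 ? z0 + 1 : z0;
    float rx = x - x0, ry = y - y0, rz = z - z0;     /* position inside the sub-cube */

    /* One channel of a CLUT entry, 3 floats per lattice point. */
#define CLU(X, Y, Z) table[((X) * len + (Y) * x_len + (Z) * xy_len) * 3]

    float c0 = CLU(x0, y0, z0), c1, c2, c3;

    /* The ordering of rx, ry, rz picks one of six tetrahedra; c1..c3 are the
     * edge deltas of that tetrahedron, mirroring the SSE2 branches above. */
    if (rx >= ry) {
        if (ry >= rz) {        /* rx >= ry >= rz */
            c1 = CLU(xn, y0, z0) - c0;
            c2 = CLU(xn, yn, z0) - CLU(xn, y0, z0);
            c3 = CLU(xn, yn, zn) - CLU(xn, yn, z0);
        } else if (rx >= rz) { /* rx >= rz > ry */
            c1 = CLU(xn, y0, z0) - c0;
            c2 = CLU(xn, yn, zn) - CLU(xn, y0, zn);
            c3 = CLU(xn, y0, zn) - CLU(xn, y0, z0);
        } else {               /* rz > rx >= ry */
            c1 = CLU(xn, y0, zn) - CLU(x0, y0, zn);
            c2 = CLU(xn, yn, zn) - CLU(xn, y0, zn);
            c3 = CLU(x0, y0, zn) - c0;
        }
    } else {
        if (rx >= rz) {        /* ry > rx >= rz */
            c1 = CLU(xn, yn, z0) - CLU(x0, yn, z0);
            c2 = CLU(x0, yn, z0) - c0;
            c3 = CLU(xn, yn, zn) - CLU(xn, yn, z0);
        } else if (ry >= rz) { /* ry >= rz > rx */
            c1 = CLU(xn, yn, zn) - CLU(x0, yn, zn);
            c2 = CLU(x0, yn, z0) - c0;
            c3 = CLU(x0, yn, zn) - CLU(x0, yn, z0);
        } else {               /* rz > ry > rx */
            c1 = CLU(xn, yn, zn) - CLU(x0, yn, zn);
            c2 = CLU(x0, yn, zn) - CLU(x0, y0, zn);
            c3 = CLU(x0, y0, zn) - c0;
        }
    }
#undef CLU

    /* Same affine combination the SSE2 code evaluates per component. */
    return c0 + c1 * rx + c2 * ry + c3 * rz;
}

Calling this three times (with r_clut, g_clut and b_clut, and r, g, b taken as src[i] / 255.0f), then clamping to [0, 1] and scaling by 255, reproduces what the vector loop computes for one pixel.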