SIMD implementation of Convolver for Lanczos filter etc.

replace current convolver function (horizontal/vertical) with SSE2 intrinsic version. Performance is not tuned to the optimal carefully in this patch. but it still should beat C version easily. BUG=62820 TEST=unittest. and image_operation_bench. Review URL: http://codereview.chromium.org/6334070 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@77527 0039d316-1c4b-4281-b951-d872f2087c98
author: jiesun@chromium.org <jiesun@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-03-09 21:55:38 +0000
committer: jiesun@chromium.org <jiesun@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-03-09 21:55:38 +0000
commit: 3c2632de729773052b5561575db4e81017aff538 (patch)
tree: 77d5a9c10da3330445f81db20fb88b785d457885 /skia/ext/convolver_unittest.cc
parent: 7c48dafd4f686038dde05650c029995e77013aa0 (diff)
download: chromium_src-3c2632de729773052b5561575db4e81017aff538.zip
chromium_src-3c2632de729773052b5561575db4e81017aff538.tar.gz
chromium_src-3c2632de729773052b5561575db4e81017aff538.tar.bz2
1 files changed, 116 insertions, 2 deletions
diff --git a/skia/ext/convolver_unittest.cc b/skia/ext/convolver_unittest.cc
index 5520b2c..9ac40c9 100644
--- a/skia/ext/convolver_unittest.cc
+++ b/skia/ext/convolver_unittest.cc
@@ -7,8 +7,14 @@
 #include <vector>
 
 #include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/time.h"
 #include "skia/ext/convolver.h"
 #include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/skia/include/core/SkBitmap.h"
+#include "third_party/skia/include/core/SkColorPriv.h"
+#include "third_party/skia/include/core/SkRect.h"
+#include "third_party/skia/include/core/SkTypes.h"
 
 namespace skia {
 
@@ -35,7 +41,7 @@ void TestImpulseConvolution(const unsigned char* data, int width, int height) {
   std::vector<unsigned char> output;
   output.resize(byte_count);
   BGRAConvolve2D(data, width * 4, true, filter_x, filter_y,
-                 filter_x.num_values() * 4, &output[0]);
+                 filter_x.num_values() * 4, &output[0], false);
 
   // Output should exactly match input.
   EXPECT_EQ(0, memcmp(data, &output[0], byte_count));
@@ -106,7 +112,7 @@ TEST(Convolver, Halve) {
 
   // Do the convolution.
   BGRAConvolve2D(&input[0], src_width, true, filter_x, filter_y,
-                 filter_x.num_values() * 4, &output[0]);
+                 filter_x.num_values() * 4, &output[0], false);
 
   // Compute the expected results and check, allowing for a small difference
   // to account for rounding errors.
@@ -204,4 +210,112 @@ TEST(Convolver, AddFilter) {
   ASSERT_EQ(0, filter_length);
 }
 
+TEST(Convolver, SIMDVerification) {
+#if defined(SIMD_SSE2)
+  base::CPU cpu;
+  if (!cpu.has_sse2()) return;
+
+  int source_sizes[][2] = { {1920, 1080}, {720, 480}, {1377, 523}, {325, 241} };
+  int dest_sizes[][2] = { {1280, 1024}, {480, 270}, {177, 123} };
+  float filter[] = { 0.05f, -0.15f, 0.6f, 0.6f, -0.15f, 0.05f };
+
+  srand(static_cast<unsigned int>(time(0)));
+
+  // Loop over some specific source and destination dimensions.
+  for (unsigned int i = 0; i < arraysize(source_sizes); ++i) {
+    unsigned int source_width = source_sizes[i][0];
+    unsigned int source_height = source_sizes[i][1];
+    for (unsigned int j = 0; j < arraysize(dest_sizes); ++j) {
+      unsigned int dest_width = source_sizes[j][0];
+      unsigned int dest_height = source_sizes[j][1];
+
+      // Preparing convolve coefficients.
+      ConvolutionFilter1D x_filter, y_filter;
+      for (unsigned int p = 0; p < dest_width; ++p) {
+        unsigned int offset = source_width * p / dest_width;
+        if (offset > source_width - arraysize(filter))
+          offset = source_width - arraysize(filter);
+        x_filter.AddFilter(offset, filter, arraysize(filter));
+      }
+      for (unsigned int p = 0; p < dest_height; ++p) {
+        unsigned int offset = source_height * p / dest_height;
+        if (offset > source_height - arraysize(filter))
+          offset = source_height - arraysize(filter);
+        y_filter.AddFilter(offset, filter, arraysize(filter));
+      }
+
+      // Allocate input and output skia bitmap.
+      SkBitmap source, result_c, result_sse;
+      source.setConfig(SkBitmap::kARGB_8888_Config,
+                       source_width, source_height);
+      source.allocPixels();
+      result_c.setConfig(SkBitmap::kARGB_8888_Config,
+                         dest_width, dest_height);
+      result_c.allocPixels();
+      result_sse.setConfig(SkBitmap::kARGB_8888_Config,
+                           dest_width, dest_height);
+      result_sse.allocPixels();
+
+      // Randomize source bitmap for testing.
+      unsigned char* src_ptr = static_cast<unsigned char*>(source.getPixels());
+      for (int y = 0; y < source.height(); y++) {
+        for (int x = 0; x < source.rowBytes(); x++)
+          src_ptr[x] = rand() % 255;
+        src_ptr += source.rowBytes();
+      }
+
+      // Test both cases with different has_alpha.
+      for (int alpha = 0; alpha < 2; alpha++) {
+        // Convolve using C code.
+        base::TimeTicks resize_start;
+        base::TimeDelta delta_c, delta_sse;
+        unsigned char* r1 = static_cast<unsigned char*>(result_c.getPixels());
+        unsigned char* r2 = static_cast<unsigned char*>(result_sse.getPixels());
+
+        resize_start = base::TimeTicks::Now();
+        BGRAConvolve2D(static_cast<const uint8*>(source.getPixels()),
+                       static_cast<int>(source.rowBytes()),
+                       alpha ? true : false, x_filter, y_filter,
+                       static_cast<int>(result_c.rowBytes()), r1, false);
+        delta_c = base::TimeTicks::Now() - resize_start;
+
+        resize_start = base::TimeTicks::Now();
+        // Convolve using SSE2 code
+        BGRAConvolve2D(static_cast<const uint8*>(source.getPixels()),
+                       static_cast<int>(source.rowBytes()),
+                       alpha ? true : false, x_filter, y_filter,
+                       static_cast<int>(result_sse.rowBytes()), r2, true);
+        delta_sse = base::TimeTicks::Now() - resize_start;
+
+        // Unfortunately I could not enable the performance check now.
+        // Most bots use debug version, and there are great difference between
+        // the code generation for intrinsic, etc. In release version speed
+        // difference was 150%-200% depend on alpha channel presence;
+        // while in debug version speed difference was 96%-120%.
+        // TODO(jiesun): optimize further until we could enable this for
+        // debug version too.
+        // EXPECT_LE(delta_sse, delta_c);
+
+        int64 c_us = delta_c.InMicroseconds();
+        int64 sse_us = delta_sse.InMicroseconds();
+        LOG(INFO) << "from:" << source_width << "x" << source_height
+                  << " to:" << dest_width << "x" << dest_height
+                  << (alpha ? " with alpha" : " w/o alpha");
+        LOG(INFO) << "c:" << c_us << " sse:" << sse_us;
+        LOG(INFO) << "ratio:" << static_cast<float>(c_us) / sse_us;
+
+        // Comparing result.
+        for (unsigned int i = 0; i < dest_height; i++) {
+          for (unsigned int x = 0; x < dest_width * 4; x++) {  // RGBA always.
+            EXPECT_EQ(r1[x], r2[x]);
+          }
+          r1 += result_c.rowBytes();
+          r2 += result_sse.rowBytes();
+        }
+      }
+    }
+  }
+#endif
+}
+
 }  // namespace skia
author	jiesun@chromium.org <jiesun@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-03-09 21:55:38 +0000
committer	jiesun@chromium.org <jiesun@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-03-09 21:55:38 +0000
commit	3c2632de729773052b5561575db4e81017aff538 (patch)
tree	77d5a9c10da3330445f81db20fb88b785d457885 /skia/ext/convolver_unittest.cc
parent	7c48dafd4f686038dde05650c029995e77013aa0 (diff)
download	chromium_src-3c2632de729773052b5561575db4e81017aff538.zip chromium_src-3c2632de729773052b5561575db4e81017aff538.tar.gz chromium_src-3c2632de729773052b5561575db4e81017aff538.tar.bz2