[Courgette] Extract qsufsort module and add tests.

This is to prepare for a follow-up CL to optimize qsufsort(), which reduces run time by ~40%, and "courgette -gen" run time by ~10%. Review URL: https://codereview.chromium.org/1272453003 Cr-Commit-Position: refs/heads/master@{#342883}
author: huangs <huangs@chromium.org> 2015-08-11 13:58:52 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-08-11 20:59:42 +0000
commit: c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9 (patch)
tree: 413857f35ff531a6c5a0b41b90c3ba5acb034231 /courgette
parent: 4e6546e538bed9e0758b5566a7604853b9e32f3e (diff)
download: chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.zip
chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.gz
chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.bz2
6 files changed, 338 insertions, 163 deletions
diff --git a/courgette/BUILD.gn b/courgette/BUILD.gn
index a885678..8826ba2 100644
--- a/courgette/BUILD.gn
+++ b/courgette/BUILD.gn
@@ -47,6 +47,7 @@ static_library("courgette_lib") {
     "third_party/bsdiff_apply.cc",
     "third_party/bsdiff_create.cc",
     "third_party/paged_array.h",
+    "third_party/qsufsort.h",
     "types_elf.h",
     "types_win_pe.h",
   ]
@@ -104,6 +105,7 @@ test("courgette_unittests") {
     "memory_allocator_unittest.cc",
     "streams_unittest.cc",
     "third_party/paged_array_unittest.cc",
+    "third_party/qsufsort_unittest.cc",
     "typedrva_unittest.cc",
     "versioning_unittest.cc",
   ]
diff --git a/courgette/courgette.gyp b/courgette/courgette.gyp
index 1eb669d..8ff08bc 100644
--- a/courgette/courgette.gyp
+++ b/courgette/courgette.gyp
@@ -11,10 +11,6 @@
       'adjustment_method.h',
       'assembly_program.cc',
       'assembly_program.h',
-      'third_party/bsdiff.h',
-      'third_party/bsdiff_apply.cc',
-      'third_party/bsdiff_create.cc',
-      'third_party/paged_array.h',
       'courgette.h',
       'crc.cc',
       'crc.h',
@@ -45,6 +41,11 @@
       'simple_delta.h',
       'streams.cc',
       'streams.h',
+      'third_party/bsdiff.h',
+      'third_party/bsdiff_apply.cc',
+      'third_party/bsdiff_create.cc',
+      'third_party/paged_array.h',
+      'third_party/qsufsort.h',
       'types_elf.h',
       'types_win_pe.h',
       'patch_generator_x86_32.h',
@@ -109,7 +110,8 @@
         'streams_unittest.cc',
         'typedrva_unittest.cc',
         'versioning_unittest.cc',
-        'third_party/paged_array_unittest.cc'
+        'third_party/paged_array_unittest.cc',
+        'third_party/qsufsort_unittest.cc',
       ],
       'dependencies': [
         'courgette_lib',
diff --git a/courgette/third_party/README.chromium b/courgette/third_party/README.chromium
index b3b28bd..3f7f308 100644
--- a/courgette/third_party/README.chromium
+++ b/courgette/third_party/README.chromium
@@ -21,3 +21,5 @@ List of changes made to original code:
   - reformatted code to be closer to Google coding standards
   - renamed variables
   - added comments
+  - extracted qsufsort into qsufsort.h in 'courgette::qsuf' namespace
+  - added unit tests for qsufsort
diff --git a/courgette/third_party/bsdiff_create.cc b/courgette/third_party/bsdiff_create.cc
index b43b53a..d6ea695 100644
--- a/courgette/third_party/bsdiff_create.cc
+++ b/courgette/third_party/bsdiff_create.cc
@@ -20,6 +20,8 @@
   2010-05-26 - Use a paged array for V and I. The address space may be too
                fragmented for these big arrays to be contiguous.
                  --Stephen Adams <sra@chromium.org>
+  2015-08-03 - Extract qsufsort portion to a separate file.
+                 --Samuel Huang <huangs@chromium.org>
 */
 
 #include "courgette/third_party/bsdiff.h"
@@ -35,162 +37,10 @@
 #include "courgette/crc.h"
 #include "courgette/streams.h"
 #include "courgette/third_party/paged_array.h"
+#include "courgette/third_party/qsufsort.h"
 
 namespace courgette {
 
-// ------------------------------------------------------------------------
-//
-// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
-// code formatting and variable names.  The changes from the original are (1)
-// replacing tabs with spaces, (2) indentation, (3) using 'const', and (4)
-// changing the V and I parameters from int* to PagedArray<int>&.
-//
-// The code appears to be a rewritten version of the suffix array algorithm
-// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
-// Sadakane, special cased for bytes.
-
-static void
-split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h)
-{
-  int i,j,k,x,tmp,jj,kk;
-
-  if(len<16) {
-    for(k=start;k<start+len;k+=j) {
-      j=1;x=V[I[k]+h];
-      for(i=1;k+i<start+len;i++) {
-        if(V[I[k+i]+h]<x) {
-          x=V[I[k+i]+h];
-          j=0;
-        };
-        if(V[I[k+i]+h]==x) {
-          tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
-          j++;
-        };
-      };
-      for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
-      if(j==1) I[k]=-1;
-    };
-    return;
-  };
-
-  x=V[I[start+len/2]+h];
-  jj=0;kk=0;
-  for(i=start;i<start+len;i++) {
-    if(V[I[i]+h]<x) jj++;
-    if(V[I[i]+h]==x) kk++;
-  };
-  jj+=start;kk+=jj;
-
-  i=start;j=0;k=0;
-  while(i<jj) {
-    if(V[I[i]+h]<x) {
-      i++;
-    } else if(V[I[i]+h]==x) {
-      tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
-      j++;
-    } else {
-      tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
-      k++;
-    };
-  };
-
-  while(jj+j<kk) {
-    if(V[I[jj+j]+h]==x) {
-      j++;
-    } else {
-      tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
-      k++;
-    };
-  };
-
-  if(jj>start) split(I,V,start,jj-start,h);
-
-  for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
-  if(jj==kk-1) I[jj]=-1;
-
-  if(start+len>kk) split(I,V,kk,start+len-kk,h);
-}
-
-static void
-qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int oldsize)
-{
-  int buckets[256];
-  int i,h,len;
-
-  for(i=0;i<256;i++) buckets[i]=0;
-  for(i=0;i<oldsize;i++) buckets[old[i]]++;
-  for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
-  for(i=255;i>0;i--) buckets[i]=buckets[i-1];
-  buckets[0]=0;
-
-  for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
-  I[0]=oldsize;
-  for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
-  V[oldsize]=0;
-  for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
-  I[0]=-1;
-
-  for(h=1;I[0]!=-(oldsize+1);h+=h) {
-    len=0;
-    for(i=0;i<oldsize+1;) {
-      if(I[i]<0) {
-        len-=I[i];
-        i-=I[i];
-      } else {
-        if(len) I[i-len]=-len;
-        len=V[I[i]]+1-i;
-        split(I,V,i,len,h);
-        i+=len;
-        len=0;
-      };
-    };
-    if(len) I[i-len]=-len;
-  };
-
-  for(i=0;i<oldsize+1;i++) I[V[i]]=i;
-}
-
-static int
-matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
-{
-  int i;
-
-  for(i=0;(i<oldsize)&&(i<newsize);i++)
-    if(old[i]!=newbuf[i]) break;
-
-  return i;
-}
-
-static int
-search(PagedArray<int>& I,const unsigned char *old,int oldsize,
-       const unsigned char *newbuf,int newsize,int st,int en,int *pos)
-{
-  int x,y;
-
-  if(en-st<2) {
-    x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
-    y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
-
-    if(x>y) {
-      *pos=I[st];
-      return x;
-    } else {
-      *pos=I[en];
-      return y;
-    }
-  }
-
-  x=st+(en-st)/2;
-  if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
-    return search(I,old,oldsize,newbuf,newsize,x,en,pos);
-  } else {
-    return search(I,old,oldsize,newbuf,newsize,st,x,pos);
-  }
-}
-
-//  End of 'verbatim' code.
-// ------------------------------------------------------------------------
-
 static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
   bool ok = stream->Write(header->tag, sizeof(header->tag));
   ok &= stream->WriteVarint32(header->slen);
@@ -236,7 +86,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
   }
 
   base::Time q_start_time = base::Time::Now();
-  qsufsort(I, V, old, oldsize);
+  qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
   VLOG(1) << " done qsufsort "
           << (base::Time::Now() - q_start_time).InSecondsF();
   V.clear();
@@ -300,9 +150,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
 
     scan += match_length;
     for (int scsc = scan;  scan < newsize;  ++scan) {
-      match_length = search(I, old, oldsize,
-                            newbuf + scan, newsize - scan,
-                            0, oldsize, &pos);
+      match_length = qsuf::search<PagedArray<int>&>(
+          I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos);
 
       for ( ; scsc < scan + match_length ; scsc++)
         if ((scsc + lastoffset < oldsize) &&
@@ -451,4 +300,4 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
   return OK;
 }
 
-}  // namespace
+}  // namespace courgette
diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h
new file mode 100644
index 0000000..f6abfb73
--- /dev/null
+++ b/courgette/third_party/qsufsort.h
@@ -0,0 +1,185 @@
+/*
+  qsufsort.h -- Suffix array generation.
+
+  Copyright 2003 Colin Percival
+
+  For the terms under which this work may be distributed, please see
+  the adjoining file "LICENSE".
+
+  ChangeLog:
+  2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
+               values throughout.
+                 --Benjamin Smedberg <benjamin@smedbergs.us>
+  2010-05-26 - Use a paged array for V and I. The address space may be too
+               fragmented for these big arrays to be contiguous.
+                 --Stephen Adams <sra@chromium.org>
+  2015-08-03 - Extrat qsufsort to a separate file as template.
+                 --Samuel Huang <huangs@chromium.org>
+*/
+
+#include <algorithm>
+#include <cstring>
+
+namespace courgette {
+namespace qsuf {
+
+// ------------------------------------------------------------------------
+//
+// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
+// code formatting and variable names.  The changes from the original are:
+// (1) replacing tabs with spaces,
+// (2) indentation,
+// (3) using 'const',
+// (4) changing the V and I parameters from int* to template <typename T>.
+//
+// The code appears to be a rewritten version of the suffix array algorithm
+// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
+// Sadakane, special cased for bytes.
+
+template <typename T>
+static void
+split(T I,T V,int start,int len,int h)
+{
+  int i,j,k,x,tmp,jj,kk;
+
+  if(len<16) {
+    for(k=start;k<start+len;k+=j) {
+      j=1;x=V[I[k]+h];
+      for(i=1;k+i<start+len;i++) {
+        if(V[I[k+i]+h]<x) {
+          x=V[I[k+i]+h];
+          j=0;
+        };
+        if(V[I[k+i]+h]==x) {
+          tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
+          j++;
+        };
+      };
+      for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
+      if(j==1) I[k]=-1;
+    };
+    return;
+  };
+
+  x=V[I[start+len/2]+h];
+  jj=0;kk=0;
+  for(i=start;i<start+len;i++) {
+    if(V[I[i]+h]<x) jj++;
+    if(V[I[i]+h]==x) kk++;
+  };
+  jj+=start;kk+=jj;
+
+  i=start;j=0;k=0;
+  while(i<jj) {
+    if(V[I[i]+h]<x) {
+      i++;
+    } else if(V[I[i]+h]==x) {
+      tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
+      j++;
+    } else {
+      tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
+      k++;
+    };
+  };
+
+  while(jj+j<kk) {
+    if(V[I[jj+j]+h]==x) {
+      j++;
+    } else {
+      tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
+      k++;
+    };
+  };
+
+  if(jj>start) split<T>(I,V,start,jj-start,h);
+
+  for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
+  if(jj==kk-1) I[jj]=-1;
+
+  if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);
+}
+
+template <class T>
+static void
+qsufsort(T I, T V,const unsigned char *old,int oldsize)
+{
+  int buckets[256];
+  int i,h,len;
+
+  for(i=0;i<256;i++) buckets[i]=0;
+  for(i=0;i<oldsize;i++) buckets[old[i]]++;
+  for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
+  for(i=255;i>0;i--) buckets[i]=buckets[i-1];
+  buckets[0]=0;
+
+  for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
+  I[0]=oldsize;
+  for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
+  V[oldsize]=0;
+  for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
+  I[0]=-1;
+
+  for(h=1;I[0]!=-(oldsize+1);h+=h) {
+    len=0;
+    for(i=0;i<oldsize+1;) {
+      if(I[i]<0) {
+        len-=I[i];
+        i-=I[i];
+      } else {
+        if(len) I[i-len]=-len;
+        len=V[I[i]]+1-i;
+        split<T>(I,V,i,len,h);
+        i+=len;
+        len=0;
+      };
+    };
+    if(len) I[i-len]=-len;
+  };
+
+  for(i=0;i<oldsize+1;i++) I[V[i]]=i;
+}
+
+static int
+matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
+{
+  int i;
+
+  for(i=0;(i<oldsize)&&(i<newsize);i++)
+    if(old[i]!=newbuf[i]) break;
+
+  return i;
+}
+
+template <class T>
+static int
+search(T I,const unsigned char *old,int oldsize,
+       const unsigned char *newbuf,int newsize,int st,int en,int *pos)
+{
+  int x,y;
+
+  if(en-st<2) {
+    x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
+    y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
+
+    if(x>y) {
+      *pos=I[st];
+      return x;
+    } else {
+      *pos=I[en];
+      return y;
+    }
+  }
+
+  x=st+(en-st)/2;
+  if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
+    return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos);
+  } else {
+    return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos);
+  }
+}
+
+//  End of 'verbatim' code.
+// ------------------------------------------------------------------------
+
+}  // namespace qsuf
+}  // namespace courgette
diff --git a/courgette/third_party/qsufsort_unittest.cc b/courgette/third_party/qsufsort_unittest.cc
new file mode 100644
index 0000000..9eb6720
--- /dev/null
+++ b/courgette/third_party/qsufsort_unittest.cc
@@ -0,0 +1,135 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "courgette/third_party/qsufsort.h"
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "base/macros.h"
+#include "base/memory/scoped_ptr.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(QSufSortTest, Sort) {
+  const char* test_cases[] = {
+    "",
+    "a",
+    "za",
+    "CACAO",
+    "banana",
+    "tobeornottobe",
+    "The quick brown fox jumps over the lazy dog.",
+    "elephantelephantelephantelephantelephant",
+    "-------------------------",
+    "011010011001011010010110011010010",
+    "3141592653589793238462643383279502884197169399375105",
+    "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
+  };
+
+  for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
+    int len = static_cast<int>(::strlen(test_cases[idx]));
+    const unsigned char* s =
+        reinterpret_cast<const unsigned char*>(test_cases[idx]);
+
+    // Generate the suffix array as I.
+    std::vector<int> I(len + 1);
+    std::vector<int> V(len + 1);
+    courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len);
+
+    // Expect that I[] is a permutation of [0, len].
+    std::vector<int> I_sorted(I);
+    std::sort(I_sorted.begin(), I_sorted.end());
+    for (int i = 0; i < len + 1; ++i) {
+      EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]";
+    }
+
+    // First string must be empty string.
+    EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]";
+
+    // Expect that the |len + 1| suffixes are strictly ordered.
+    const unsigned char* end = s + len;
+    for (int i = 0; i < len; ++i) {
+      const unsigned char* suf1 = s + I[i];
+      const unsigned char* suf2 = s + I[i + 1];
+      bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
+      EXPECT_TRUE(is_less) << "test_case[" << idx << "]";
+    }
+  }
+}
+
+TEST(QSufSortTest, Search) {
+  // Initialize main string and the suffix array.
+  // Positions:          00000000001111111111122222222233333333334444
+  //                     01234567890123456789012345678901234567890123
+  const char* old_str = "the quick brown fox jumps over the lazy dog.";
+  int old_size = static_cast<int>(::strlen(old_str));
+  const unsigned char* old_buf =
+        reinterpret_cast<const unsigned char*>(old_str);
+  std::vector<int> I(old_size + 1);
+  std::vector<int> V(old_size + 1);
+  courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size);
+
+  // Test queries.
+  const struct {
+    int exp_pos;  // -1 means "don't care".
+    int exp_match_len;
+    const char* query_str;
+  } test_cases[] = {
+    // Entire string.
+    {0, 44, "the quick brown fox jumps over the lazy dog."},
+    // Empty string.
+    {-1, 0, ""},  // Current algorithm does not enforce |pos| == 0.
+    // Exact and unique suffix match.
+    {43, 1, "."},
+    {31, 13, "the lazy dog."},
+    // Exact and unique non-suffix match.
+    {4, 5, "quick"},
+    {0, 9, "the quick"},  // Unique prefix.
+    // Entire word match with mutiple results: take lexicographical first.
+    {31, 3, "the"},  // Non-unique prefix: "the l"... < "the q"...
+    {9, 1, " "},  // " brown"... wins.
+    // Partial and unique match of query prefix.
+    {16, 10, "fox jumps with the hosps"},
+    // Partial and multiple match of query prefix: no guarantees on |pos|.
+    // Take lexicographical first for matching portion *only*, so same results:
+    {-1, 4, "the apple"},  // query      < "the l"... < "the q"...
+    {-1, 4, "the opera"},  // "the l"... < query      < "the q"...
+    {-1, 4, "the zebra"},  // "the l"... < "the q"... < query
+    // Prefix match dominates suffix match.
+    {26, 5, "over quick brown fox"},
+    // No match.
+    {-1, 0, ","},
+    {-1, 0, "1234"},
+    {-1, 0, "THE QUICK BROWN FOX"},
+    {-1, 0, "(the"},
+  };
+
+  for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
+    const auto& test_case = test_cases[idx];
+    int new_size = static_cast<int>(::strlen(test_case.query_str));
+    const unsigned char* new_buf =
+        reinterpret_cast<const unsigned char*>(test_case.query_str);
+
+    // Perform the search.
+    int pos = 0;
+    int match_len = courgette::qsuf::search(
+        &I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos);
+
+    // Check basic properties and match with expected values.
+    EXPECT_GE(match_len, 0) << "test_case[" << idx << "]";
+    EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]";
+    if (match_len > 0) {
+      EXPECT_GE(pos, 0) << "test_case[" << idx << "]";
+      EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]";
+      EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len))
+          << "test_case[" << idx << "]";
+    }
+    if (test_case.exp_pos >= 0) {
+      EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]";
+    }
+    EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]";
+  }
+}
author	huangs <huangs@chromium.org>	2015-08-11 13:58:52 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-08-11 20:59:42 +0000
commit	c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9 (patch)
tree	413857f35ff531a6c5a0b41b90c3ba5acb034231 /courgette
parent	4e6546e538bed9e0758b5566a7604853b9e32f3e (diff)
download	chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.zip chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.gz chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.bz2