summaryrefslogtreecommitdiffstats
path: root/courgette
diff options
context:
space:
mode:
authorhuangs <huangs@chromium.org>2015-08-11 13:58:52 -0700
committerCommit bot <commit-bot@chromium.org>2015-08-11 20:59:42 +0000
commitc1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9 (patch)
tree413857f35ff531a6c5a0b41b90c3ba5acb034231 /courgette
parent4e6546e538bed9e0758b5566a7604853b9e32f3e (diff)
downloadchromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.zip
chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.gz
chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.bz2
[Courgette] Extract qsufsort module and add tests.
This is to prepare for a follow-up CL to optimize qsufsort(), which reduces run time by ~40%, and "courgette -gen" run time by ~10%. Review URL: https://codereview.chromium.org/1272453003 Cr-Commit-Position: refs/heads/master@{#342883}
Diffstat (limited to 'courgette')
-rw-r--r--courgette/BUILD.gn2
-rw-r--r--courgette/courgette.gyp12
-rw-r--r--courgette/third_party/README.chromium2
-rw-r--r--courgette/third_party/bsdiff_create.cc165
-rw-r--r--courgette/third_party/qsufsort.h185
-rw-r--r--courgette/third_party/qsufsort_unittest.cc135
6 files changed, 338 insertions, 163 deletions
diff --git a/courgette/BUILD.gn b/courgette/BUILD.gn
index a885678..8826ba2 100644
--- a/courgette/BUILD.gn
+++ b/courgette/BUILD.gn
@@ -47,6 +47,7 @@ static_library("courgette_lib") {
"third_party/bsdiff_apply.cc",
"third_party/bsdiff_create.cc",
"third_party/paged_array.h",
+ "third_party/qsufsort.h",
"types_elf.h",
"types_win_pe.h",
]
@@ -104,6 +105,7 @@ test("courgette_unittests") {
"memory_allocator_unittest.cc",
"streams_unittest.cc",
"third_party/paged_array_unittest.cc",
+ "third_party/qsufsort_unittest.cc",
"typedrva_unittest.cc",
"versioning_unittest.cc",
]
diff --git a/courgette/courgette.gyp b/courgette/courgette.gyp
index 1eb669d..8ff08bc 100644
--- a/courgette/courgette.gyp
+++ b/courgette/courgette.gyp
@@ -11,10 +11,6 @@
'adjustment_method.h',
'assembly_program.cc',
'assembly_program.h',
- 'third_party/bsdiff.h',
- 'third_party/bsdiff_apply.cc',
- 'third_party/bsdiff_create.cc',
- 'third_party/paged_array.h',
'courgette.h',
'crc.cc',
'crc.h',
@@ -45,6 +41,11 @@
'simple_delta.h',
'streams.cc',
'streams.h',
+ 'third_party/bsdiff.h',
+ 'third_party/bsdiff_apply.cc',
+ 'third_party/bsdiff_create.cc',
+ 'third_party/paged_array.h',
+ 'third_party/qsufsort.h',
'types_elf.h',
'types_win_pe.h',
'patch_generator_x86_32.h',
@@ -109,7 +110,8 @@
'streams_unittest.cc',
'typedrva_unittest.cc',
'versioning_unittest.cc',
- 'third_party/paged_array_unittest.cc'
+ 'third_party/paged_array_unittest.cc',
+ 'third_party/qsufsort_unittest.cc',
],
'dependencies': [
'courgette_lib',
diff --git a/courgette/third_party/README.chromium b/courgette/third_party/README.chromium
index b3b28bd..3f7f308 100644
--- a/courgette/third_party/README.chromium
+++ b/courgette/third_party/README.chromium
@@ -21,3 +21,5 @@ List of changes made to original code:
- reformatted code to be closer to Google coding standards
- renamed variables
- added comments
+ - extracted qsufsort into qsufsort.h in 'courgette::qsuf' namespace
+ - added unit tests for qsufsort
diff --git a/courgette/third_party/bsdiff_create.cc b/courgette/third_party/bsdiff_create.cc
index b43b53a..d6ea695 100644
--- a/courgette/third_party/bsdiff_create.cc
+++ b/courgette/third_party/bsdiff_create.cc
@@ -20,6 +20,8 @@
2010-05-26 - Use a paged array for V and I. The address space may be too
fragmented for these big arrays to be contiguous.
--Stephen Adams <sra@chromium.org>
+ 2015-08-03 - Extract qsufsort portion to a separate file.
+ --Samuel Huang <huangs@chromium.org>
*/
#include "courgette/third_party/bsdiff.h"
@@ -35,162 +37,10 @@
#include "courgette/crc.h"
#include "courgette/streams.h"
#include "courgette/third_party/paged_array.h"
+#include "courgette/third_party/qsufsort.h"
namespace courgette {
-// ------------------------------------------------------------------------
-//
-// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
-// code formatting and variable names. The changes from the original are (1)
-// replacing tabs with spaces, (2) indentation, (3) using 'const', and (4)
-// changing the V and I parameters from int* to PagedArray<int>&.
-//
-// The code appears to be a rewritten version of the suffix array algorithm
-// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
-// Sadakane, special cased for bytes.
-
-static void
-split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h)
-{
- int i,j,k,x,tmp,jj,kk;
-
- if(len<16) {
- for(k=start;k<start+len;k+=j) {
- j=1;x=V[I[k]+h];
- for(i=1;k+i<start+len;i++) {
- if(V[I[k+i]+h]<x) {
- x=V[I[k+i]+h];
- j=0;
- };
- if(V[I[k+i]+h]==x) {
- tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
- j++;
- };
- };
- for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
- if(j==1) I[k]=-1;
- };
- return;
- };
-
- x=V[I[start+len/2]+h];
- jj=0;kk=0;
- for(i=start;i<start+len;i++) {
- if(V[I[i]+h]<x) jj++;
- if(V[I[i]+h]==x) kk++;
- };
- jj+=start;kk+=jj;
-
- i=start;j=0;k=0;
- while(i<jj) {
- if(V[I[i]+h]<x) {
- i++;
- } else if(V[I[i]+h]==x) {
- tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
- j++;
- } else {
- tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
- k++;
- };
- };
-
- while(jj+j<kk) {
- if(V[I[jj+j]+h]==x) {
- j++;
- } else {
- tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
- k++;
- };
- };
-
- if(jj>start) split(I,V,start,jj-start,h);
-
- for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
- if(jj==kk-1) I[jj]=-1;
-
- if(start+len>kk) split(I,V,kk,start+len-kk,h);
-}
-
-static void
-qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int oldsize)
-{
- int buckets[256];
- int i,h,len;
-
- for(i=0;i<256;i++) buckets[i]=0;
- for(i=0;i<oldsize;i++) buckets[old[i]]++;
- for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
- for(i=255;i>0;i--) buckets[i]=buckets[i-1];
- buckets[0]=0;
-
- for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
- I[0]=oldsize;
- for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
- V[oldsize]=0;
- for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
- I[0]=-1;
-
- for(h=1;I[0]!=-(oldsize+1);h+=h) {
- len=0;
- for(i=0;i<oldsize+1;) {
- if(I[i]<0) {
- len-=I[i];
- i-=I[i];
- } else {
- if(len) I[i-len]=-len;
- len=V[I[i]]+1-i;
- split(I,V,i,len,h);
- i+=len;
- len=0;
- };
- };
- if(len) I[i-len]=-len;
- };
-
- for(i=0;i<oldsize+1;i++) I[V[i]]=i;
-}
-
-static int
-matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
-{
- int i;
-
- for(i=0;(i<oldsize)&&(i<newsize);i++)
- if(old[i]!=newbuf[i]) break;
-
- return i;
-}
-
-static int
-search(PagedArray<int>& I,const unsigned char *old,int oldsize,
- const unsigned char *newbuf,int newsize,int st,int en,int *pos)
-{
- int x,y;
-
- if(en-st<2) {
- x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
- y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
-
- if(x>y) {
- *pos=I[st];
- return x;
- } else {
- *pos=I[en];
- return y;
- }
- }
-
- x=st+(en-st)/2;
- if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
- return search(I,old,oldsize,newbuf,newsize,x,en,pos);
- } else {
- return search(I,old,oldsize,newbuf,newsize,st,x,pos);
- }
-}
-
-// End of 'verbatim' code.
-// ------------------------------------------------------------------------
-
static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
bool ok = stream->Write(header->tag, sizeof(header->tag));
ok &= stream->WriteVarint32(header->slen);
@@ -236,7 +86,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
}
base::Time q_start_time = base::Time::Now();
- qsufsort(I, V, old, oldsize);
+ qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
VLOG(1) << " done qsufsort "
<< (base::Time::Now() - q_start_time).InSecondsF();
V.clear();
@@ -300,9 +150,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
scan += match_length;
for (int scsc = scan; scan < newsize; ++scan) {
- match_length = search(I, old, oldsize,
- newbuf + scan, newsize - scan,
- 0, oldsize, &pos);
+ match_length = qsuf::search<PagedArray<int>&>(
+ I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos);
for ( ; scsc < scan + match_length ; scsc++)
if ((scsc + lastoffset < oldsize) &&
@@ -451,4 +300,4 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
return OK;
}
-} // namespace
+} // namespace courgette
diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h
new file mode 100644
index 0000000..f6abfb73
--- /dev/null
+++ b/courgette/third_party/qsufsort.h
@@ -0,0 +1,185 @@
+/*
+ qsufsort.h -- Suffix array generation.
+
+ Copyright 2003 Colin Percival
+
+ For the terms under which this work may be distributed, please see
+ the adjoining file "LICENSE".
+
+ ChangeLog:
+ 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
+ values throughout.
+ --Benjamin Smedberg <benjamin@smedbergs.us>
+ 2010-05-26 - Use a paged array for V and I. The address space may be too
+ fragmented for these big arrays to be contiguous.
+ --Stephen Adams <sra@chromium.org>
+ 2015-08-03 - Extrat qsufsort to a separate file as template.
+ --Samuel Huang <huangs@chromium.org>
+*/
+
+#include <algorithm>
+#include <cstring>
+
+namespace courgette {
+namespace qsuf {
+
+// ------------------------------------------------------------------------
+//
+// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
+// code formatting and variable names. The changes from the original are:
+// (1) replacing tabs with spaces,
+// (2) indentation,
+// (3) using 'const',
+// (4) changing the V and I parameters from int* to template <typename T>.
+//
+// The code appears to be a rewritten version of the suffix array algorithm
+// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
+// Sadakane, special cased for bytes.
+
+template <typename T>
+static void
+split(T I,T V,int start,int len,int h)
+{
+ int i,j,k,x,tmp,jj,kk;
+
+ if(len<16) {
+ for(k=start;k<start+len;k+=j) {
+ j=1;x=V[I[k]+h];
+ for(i=1;k+i<start+len;i++) {
+ if(V[I[k+i]+h]<x) {
+ x=V[I[k+i]+h];
+ j=0;
+ };
+ if(V[I[k+i]+h]==x) {
+ tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
+ j++;
+ };
+ };
+ for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
+ if(j==1) I[k]=-1;
+ };
+ return;
+ };
+
+ x=V[I[start+len/2]+h];
+ jj=0;kk=0;
+ for(i=start;i<start+len;i++) {
+ if(V[I[i]+h]<x) jj++;
+ if(V[I[i]+h]==x) kk++;
+ };
+ jj+=start;kk+=jj;
+
+ i=start;j=0;k=0;
+ while(i<jj) {
+ if(V[I[i]+h]<x) {
+ i++;
+ } else if(V[I[i]+h]==x) {
+ tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
+ j++;
+ } else {
+ tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
+ k++;
+ };
+ };
+
+ while(jj+j<kk) {
+ if(V[I[jj+j]+h]==x) {
+ j++;
+ } else {
+ tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
+ k++;
+ };
+ };
+
+ if(jj>start) split<T>(I,V,start,jj-start,h);
+
+ for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
+ if(jj==kk-1) I[jj]=-1;
+
+ if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);
+}
+
+template <class T>
+static void
+qsufsort(T I, T V,const unsigned char *old,int oldsize)
+{
+ int buckets[256];
+ int i,h,len;
+
+ for(i=0;i<256;i++) buckets[i]=0;
+ for(i=0;i<oldsize;i++) buckets[old[i]]++;
+ for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
+ for(i=255;i>0;i--) buckets[i]=buckets[i-1];
+ buckets[0]=0;
+
+ for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
+ I[0]=oldsize;
+ for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
+ V[oldsize]=0;
+ for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
+ I[0]=-1;
+
+ for(h=1;I[0]!=-(oldsize+1);h+=h) {
+ len=0;
+ for(i=0;i<oldsize+1;) {
+ if(I[i]<0) {
+ len-=I[i];
+ i-=I[i];
+ } else {
+ if(len) I[i-len]=-len;
+ len=V[I[i]]+1-i;
+ split<T>(I,V,i,len,h);
+ i+=len;
+ len=0;
+ };
+ };
+ if(len) I[i-len]=-len;
+ };
+
+ for(i=0;i<oldsize+1;i++) I[V[i]]=i;
+}
+
+static int
+matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
+{
+ int i;
+
+ for(i=0;(i<oldsize)&&(i<newsize);i++)
+ if(old[i]!=newbuf[i]) break;
+
+ return i;
+}
+
+template <class T>
+static int
+search(T I,const unsigned char *old,int oldsize,
+ const unsigned char *newbuf,int newsize,int st,int en,int *pos)
+{
+ int x,y;
+
+ if(en-st<2) {
+ x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
+ y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
+
+ if(x>y) {
+ *pos=I[st];
+ return x;
+ } else {
+ *pos=I[en];
+ return y;
+ }
+ }
+
+ x=st+(en-st)/2;
+ if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
+ return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos);
+ } else {
+ return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos);
+ }
+}
+
+// End of 'verbatim' code.
+// ------------------------------------------------------------------------
+
+} // namespace qsuf
+} // namespace courgette
diff --git a/courgette/third_party/qsufsort_unittest.cc b/courgette/third_party/qsufsort_unittest.cc
new file mode 100644
index 0000000..9eb6720
--- /dev/null
+++ b/courgette/third_party/qsufsort_unittest.cc
@@ -0,0 +1,135 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "courgette/third_party/qsufsort.h"
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "base/macros.h"
+#include "base/memory/scoped_ptr.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(QSufSortTest, Sort) {
+ const char* test_cases[] = {
+ "",
+ "a",
+ "za",
+ "CACAO",
+ "banana",
+ "tobeornottobe",
+ "The quick brown fox jumps over the lazy dog.",
+ "elephantelephantelephantelephantelephant",
+ "-------------------------",
+ "011010011001011010010110011010010",
+ "3141592653589793238462643383279502884197169399375105",
+ "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
+ };
+
+ for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
+ int len = static_cast<int>(::strlen(test_cases[idx]));
+ const unsigned char* s =
+ reinterpret_cast<const unsigned char*>(test_cases[idx]);
+
+ // Generate the suffix array as I.
+ std::vector<int> I(len + 1);
+ std::vector<int> V(len + 1);
+ courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len);
+
+ // Expect that I[] is a permutation of [0, len].
+ std::vector<int> I_sorted(I);
+ std::sort(I_sorted.begin(), I_sorted.end());
+ for (int i = 0; i < len + 1; ++i) {
+ EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]";
+ }
+
+ // First string must be empty string.
+ EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]";
+
+ // Expect that the |len + 1| suffixes are strictly ordered.
+ const unsigned char* end = s + len;
+ for (int i = 0; i < len; ++i) {
+ const unsigned char* suf1 = s + I[i];
+ const unsigned char* suf2 = s + I[i + 1];
+ bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
+ EXPECT_TRUE(is_less) << "test_case[" << idx << "]";
+ }
+ }
+}
+
+TEST(QSufSortTest, Search) {
+ // Initialize main string and the suffix array.
+ // Positions: 00000000001111111111122222222233333333334444
+ // 01234567890123456789012345678901234567890123
+ const char* old_str = "the quick brown fox jumps over the lazy dog.";
+ int old_size = static_cast<int>(::strlen(old_str));
+ const unsigned char* old_buf =
+ reinterpret_cast<const unsigned char*>(old_str);
+ std::vector<int> I(old_size + 1);
+ std::vector<int> V(old_size + 1);
+ courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size);
+
+ // Test queries.
+ const struct {
+ int exp_pos; // -1 means "don't care".
+ int exp_match_len;
+ const char* query_str;
+ } test_cases[] = {
+ // Entire string.
+ {0, 44, "the quick brown fox jumps over the lazy dog."},
+ // Empty string.
+ {-1, 0, ""}, // Current algorithm does not enforce |pos| == 0.
+ // Exact and unique suffix match.
+ {43, 1, "."},
+ {31, 13, "the lazy dog."},
+ // Exact and unique non-suffix match.
+ {4, 5, "quick"},
+ {0, 9, "the quick"}, // Unique prefix.
+ // Entire word match with mutiple results: take lexicographical first.
+ {31, 3, "the"}, // Non-unique prefix: "the l"... < "the q"...
+ {9, 1, " "}, // " brown"... wins.
+ // Partial and unique match of query prefix.
+ {16, 10, "fox jumps with the hosps"},
+ // Partial and multiple match of query prefix: no guarantees on |pos|.
+ // Take lexicographical first for matching portion *only*, so same results:
+ {-1, 4, "the apple"}, // query < "the l"... < "the q"...
+ {-1, 4, "the opera"}, // "the l"... < query < "the q"...
+ {-1, 4, "the zebra"}, // "the l"... < "the q"... < query
+ // Prefix match dominates suffix match.
+ {26, 5, "over quick brown fox"},
+ // No match.
+ {-1, 0, ","},
+ {-1, 0, "1234"},
+ {-1, 0, "THE QUICK BROWN FOX"},
+ {-1, 0, "(the"},
+ };
+
+ for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
+ const auto& test_case = test_cases[idx];
+ int new_size = static_cast<int>(::strlen(test_case.query_str));
+ const unsigned char* new_buf =
+ reinterpret_cast<const unsigned char*>(test_case.query_str);
+
+ // Perform the search.
+ int pos = 0;
+ int match_len = courgette::qsuf::search(
+ &I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos);
+
+ // Check basic properties and match with expected values.
+ EXPECT_GE(match_len, 0) << "test_case[" << idx << "]";
+ EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]";
+ if (match_len > 0) {
+ EXPECT_GE(pos, 0) << "test_case[" << idx << "]";
+ EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]";
+ EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len))
+ << "test_case[" << idx << "]";
+ }
+ if (test_case.exp_pos >= 0) {
+ EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]";
+ }
+ EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]";
+ }
+}