diff options
author | huangs <huangs@chromium.org> | 2015-08-11 13:58:52 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-08-11 20:59:42 +0000 |
commit | c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9 (patch) | |
tree | 413857f35ff531a6c5a0b41b90c3ba5acb034231 | |
parent | 4e6546e538bed9e0758b5566a7604853b9e32f3e (diff) | |
download | chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.zip chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.gz chromium_src-c1de821fbf6dbefdc8eaf8db9e47e1b9ad20d0a9.tar.bz2 |
[Courgette] Extract qsufsort module and add tests.
This is to prepare for a follow-up CL to optimize qsufsort(), which
reduces run time by ~40%, and "courgette -gen" run time by ~10%.
Review URL: https://codereview.chromium.org/1272453003
Cr-Commit-Position: refs/heads/master@{#342883}
-rw-r--r-- | courgette/BUILD.gn | 2 | ||||
-rw-r--r-- | courgette/courgette.gyp | 12 | ||||
-rw-r--r-- | courgette/third_party/README.chromium | 2 | ||||
-rw-r--r-- | courgette/third_party/bsdiff_create.cc | 165 | ||||
-rw-r--r-- | courgette/third_party/qsufsort.h | 185 | ||||
-rw-r--r-- | courgette/third_party/qsufsort_unittest.cc | 135 | ||||
-rwxr-xr-x | tools/checklicenses/checklicenses.py | 3 |
7 files changed, 341 insertions, 163 deletions
diff --git a/courgette/BUILD.gn b/courgette/BUILD.gn index a885678..8826ba2 100644 --- a/courgette/BUILD.gn +++ b/courgette/BUILD.gn @@ -47,6 +47,7 @@ static_library("courgette_lib") { "third_party/bsdiff_apply.cc", "third_party/bsdiff_create.cc", "third_party/paged_array.h", + "third_party/qsufsort.h", "types_elf.h", "types_win_pe.h", ] @@ -104,6 +105,7 @@ test("courgette_unittests") { "memory_allocator_unittest.cc", "streams_unittest.cc", "third_party/paged_array_unittest.cc", + "third_party/qsufsort_unittest.cc", "typedrva_unittest.cc", "versioning_unittest.cc", ] diff --git a/courgette/courgette.gyp b/courgette/courgette.gyp index 1eb669d..8ff08bc 100644 --- a/courgette/courgette.gyp +++ b/courgette/courgette.gyp @@ -11,10 +11,6 @@ 'adjustment_method.h', 'assembly_program.cc', 'assembly_program.h', - 'third_party/bsdiff.h', - 'third_party/bsdiff_apply.cc', - 'third_party/bsdiff_create.cc', - 'third_party/paged_array.h', 'courgette.h', 'crc.cc', 'crc.h', @@ -45,6 +41,11 @@ 'simple_delta.h', 'streams.cc', 'streams.h', + 'third_party/bsdiff.h', + 'third_party/bsdiff_apply.cc', + 'third_party/bsdiff_create.cc', + 'third_party/paged_array.h', + 'third_party/qsufsort.h', 'types_elf.h', 'types_win_pe.h', 'patch_generator_x86_32.h', @@ -109,7 +110,8 @@ 'streams_unittest.cc', 'typedrva_unittest.cc', 'versioning_unittest.cc', - 'third_party/paged_array_unittest.cc' + 'third_party/paged_array_unittest.cc', + 'third_party/qsufsort_unittest.cc', ], 'dependencies': [ 'courgette_lib', diff --git a/courgette/third_party/README.chromium b/courgette/third_party/README.chromium index b3b28bd..3f7f308 100644 --- a/courgette/third_party/README.chromium +++ b/courgette/third_party/README.chromium @@ -21,3 +21,5 @@ List of changes made to original code: - reformatted code to be closer to Google coding standards - renamed variables - added comments + - extracted qsufsort into qsufsort.h in 'courgette::qsuf' namespace + - added unit tests for qsufsort diff --git a/courgette/third_party/bsdiff_create.cc b/courgette/third_party/bsdiff_create.cc index b43b53a..d6ea695 100644 --- a/courgette/third_party/bsdiff_create.cc +++ b/courgette/third_party/bsdiff_create.cc @@ -20,6 +20,8 @@ 2010-05-26 - Use a paged array for V and I. The address space may be too fragmented for these big arrays to be contiguous. --Stephen Adams <sra@chromium.org> + 2015-08-03 - Extract qsufsort portion to a separate file. + --Samuel Huang <huangs@chromium.org> */ #include "courgette/third_party/bsdiff.h" @@ -35,162 +37,10 @@ #include "courgette/crc.h" #include "courgette/streams.h" #include "courgette/third_party/paged_array.h" +#include "courgette/third_party/qsufsort.h" namespace courgette { -// ------------------------------------------------------------------------ -// -// The following code is taken verbatim from 'bsdiff.c'. Please keep all the -// code formatting and variable names. The changes from the original are (1) -// replacing tabs with spaces, (2) indentation, (3) using 'const', and (4) -// changing the V and I parameters from int* to PagedArray<int>&. -// -// The code appears to be a rewritten version of the suffix array algorithm -// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko -// Sadakane, special cased for bytes. - -static void -split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h) -{ - int i,j,k,x,tmp,jj,kk; - - if(len<16) { - for(k=start;k<start+len;k+=j) { - j=1;x=V[I[k]+h]; - for(i=1;k+i<start+len;i++) { - if(V[I[k+i]+h]<x) { - x=V[I[k+i]+h]; - j=0; - }; - if(V[I[k+i]+h]==x) { - tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; - j++; - }; - }; - for(i=0;i<j;i++) V[I[k+i]]=k+j-1; - if(j==1) I[k]=-1; - }; - return; - }; - - x=V[I[start+len/2]+h]; - jj=0;kk=0; - for(i=start;i<start+len;i++) { - if(V[I[i]+h]<x) jj++; - if(V[I[i]+h]==x) kk++; - }; - jj+=start;kk+=jj; - - i=start;j=0;k=0; - while(i<jj) { - if(V[I[i]+h]<x) { - i++; - } else if(V[I[i]+h]==x) { - tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; - j++; - } else { - tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp; - k++; - }; - }; - - while(jj+j<kk) { - if(V[I[jj+j]+h]==x) { - j++; - } else { - tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp; - k++; - }; - }; - - if(jj>start) split(I,V,start,jj-start,h); - - for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1; - if(jj==kk-1) I[jj]=-1; - - if(start+len>kk) split(I,V,kk,start+len-kk,h); -} - -static void -qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int oldsize) -{ - int buckets[256]; - int i,h,len; - - for(i=0;i<256;i++) buckets[i]=0; - for(i=0;i<oldsize;i++) buckets[old[i]]++; - for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; - for(i=255;i>0;i--) buckets[i]=buckets[i-1]; - buckets[0]=0; - - for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; - I[0]=oldsize; - for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; - V[oldsize]=0; - for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; - I[0]=-1; - - for(h=1;I[0]!=-(oldsize+1);h+=h) { - len=0; - for(i=0;i<oldsize+1;) { - if(I[i]<0) { - len-=I[i]; - i-=I[i]; - } else { - if(len) I[i-len]=-len; - len=V[I[i]]+1-i; - split(I,V,i,len,h); - i+=len; - len=0; - }; - }; - if(len) I[i-len]=-len; - }; - - for(i=0;i<oldsize+1;i++) I[V[i]]=i; -} - -static int -matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize) -{ - int i; - - for(i=0;(i<oldsize)&&(i<newsize);i++) - if(old[i]!=newbuf[i]) break; - - return i; -} - -static int -search(PagedArray<int>& I,const unsigned char *old,int oldsize, - const unsigned char *newbuf,int newsize,int st,int en,int *pos) -{ - int x,y; - - if(en-st<2) { - x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); - y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize); - - if(x>y) { - *pos=I[st]; - return x; - } else { - *pos=I[en]; - return y; - } - } - - x=st+(en-st)/2; - if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { - return search(I,old,oldsize,newbuf,newsize,x,en,pos); - } else { - return search(I,old,oldsize,newbuf,newsize,st,x,pos); - } -} - -// End of 'verbatim' code. -// ------------------------------------------------------------------------ - static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) { bool ok = stream->Write(header->tag, sizeof(header->tag)); ok &= stream->WriteVarint32(header->slen); @@ -236,7 +86,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream, } base::Time q_start_time = base::Time::Now(); - qsufsort(I, V, old, oldsize); + qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize); VLOG(1) << " done qsufsort " << (base::Time::Now() - q_start_time).InSecondsF(); V.clear(); @@ -300,9 +150,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream, scan += match_length; for (int scsc = scan; scan < newsize; ++scan) { - match_length = search(I, old, oldsize, - newbuf + scan, newsize - scan, - 0, oldsize, &pos); + match_length = qsuf::search<PagedArray<int>&>( + I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos); for ( ; scsc < scan + match_length ; scsc++) if ((scsc + lastoffset < oldsize) && @@ -451,4 +300,4 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream, return OK; } -} // namespace +} // namespace courgette diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h new file mode 100644 index 0000000..f6abfb73 --- /dev/null +++ b/courgette/third_party/qsufsort.h @@ -0,0 +1,185 @@ +/* + qsufsort.h -- Suffix array generation. + + Copyright 2003 Colin Percival + + For the terms under which this work may be distributed, please see + the adjoining file "LICENSE". + + ChangeLog: + 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit + values throughout. + --Benjamin Smedberg <benjamin@smedbergs.us> + 2010-05-26 - Use a paged array for V and I. The address space may be too + fragmented for these big arrays to be contiguous. + --Stephen Adams <sra@chromium.org> + 2015-08-03 - Extrat qsufsort to a separate file as template. + --Samuel Huang <huangs@chromium.org> +*/ + +#include <algorithm> +#include <cstring> + +namespace courgette { +namespace qsuf { + +// ------------------------------------------------------------------------ +// +// The following code is taken verbatim from 'bsdiff.c'. Please keep all the +// code formatting and variable names. The changes from the original are: +// (1) replacing tabs with spaces, +// (2) indentation, +// (3) using 'const', +// (4) changing the V and I parameters from int* to template <typename T>. +// +// The code appears to be a rewritten version of the suffix array algorithm +// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko +// Sadakane, special cased for bytes. + +template <typename T> +static void +split(T I,T V,int start,int len,int h) +{ + int i,j,k,x,tmp,jj,kk; + + if(len<16) { + for(k=start;k<start+len;k+=j) { + j=1;x=V[I[k]+h]; + for(i=1;k+i<start+len;i++) { + if(V[I[k+i]+h]<x) { + x=V[I[k+i]+h]; + j=0; + }; + if(V[I[k+i]+h]==x) { + tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; + j++; + }; + }; + for(i=0;i<j;i++) V[I[k+i]]=k+j-1; + if(j==1) I[k]=-1; + }; + return; + }; + + x=V[I[start+len/2]+h]; + jj=0;kk=0; + for(i=start;i<start+len;i++) { + if(V[I[i]+h]<x) jj++; + if(V[I[i]+h]==x) kk++; + }; + jj+=start;kk+=jj; + + i=start;j=0;k=0; + while(i<jj) { + if(V[I[i]+h]<x) { + i++; + } else if(V[I[i]+h]==x) { + tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; + j++; + } else { + tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp; + k++; + }; + }; + + while(jj+j<kk) { + if(V[I[jj+j]+h]==x) { + j++; + } else { + tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp; + k++; + }; + }; + + if(jj>start) split<T>(I,V,start,jj-start,h); + + for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1; + if(jj==kk-1) I[jj]=-1; + + if(start+len>kk) split<T>(I,V,kk,start+len-kk,h); +} + +template <class T> +static void +qsufsort(T I, T V,const unsigned char *old,int oldsize) +{ + int buckets[256]; + int i,h,len; + + for(i=0;i<256;i++) buckets[i]=0; + for(i=0;i<oldsize;i++) buckets[old[i]]++; + for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; + for(i=255;i>0;i--) buckets[i]=buckets[i-1]; + buckets[0]=0; + + for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; + I[0]=oldsize; + for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; + V[oldsize]=0; + for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; + I[0]=-1; + + for(h=1;I[0]!=-(oldsize+1);h+=h) { + len=0; + for(i=0;i<oldsize+1;) { + if(I[i]<0) { + len-=I[i]; + i-=I[i]; + } else { + if(len) I[i-len]=-len; + len=V[I[i]]+1-i; + split<T>(I,V,i,len,h); + i+=len; + len=0; + }; + }; + if(len) I[i-len]=-len; + }; + + for(i=0;i<oldsize+1;i++) I[V[i]]=i; +} + +static int +matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize) +{ + int i; + + for(i=0;(i<oldsize)&&(i<newsize);i++) + if(old[i]!=newbuf[i]) break; + + return i; +} + +template <class T> +static int +search(T I,const unsigned char *old,int oldsize, + const unsigned char *newbuf,int newsize,int st,int en,int *pos) +{ + int x,y; + + if(en-st<2) { + x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); + y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize); + + if(x>y) { + *pos=I[st]; + return x; + } else { + *pos=I[en]; + return y; + } + } + + x=st+(en-st)/2; + if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { + return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos); + } else { + return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos); + } +} + +// End of 'verbatim' code. +// ------------------------------------------------------------------------ + +} // namespace qsuf +} // namespace courgette diff --git a/courgette/third_party/qsufsort_unittest.cc b/courgette/third_party/qsufsort_unittest.cc new file mode 100644 index 0000000..9eb6720 --- /dev/null +++ b/courgette/third_party/qsufsort_unittest.cc @@ -0,0 +1,135 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/third_party/qsufsort.h" + +#include <algorithm> +#include <cstring> +#include <string> +#include <vector> + +#include "base/macros.h" +#include "base/memory/scoped_ptr.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(QSufSortTest, Sort) { + const char* test_cases[] = { + "", + "a", + "za", + "CACAO", + "banana", + "tobeornottobe", + "The quick brown fox jumps over the lazy dog.", + "elephantelephantelephantelephantelephant", + "-------------------------", + "011010011001011010010110011010010", + "3141592653589793238462643383279502884197169399375105", + "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", + }; + + for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { + int len = static_cast<int>(::strlen(test_cases[idx])); + const unsigned char* s = + reinterpret_cast<const unsigned char*>(test_cases[idx]); + + // Generate the suffix array as I. + std::vector<int> I(len + 1); + std::vector<int> V(len + 1); + courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len); + + // Expect that I[] is a permutation of [0, len]. + std::vector<int> I_sorted(I); + std::sort(I_sorted.begin(), I_sorted.end()); + for (int i = 0; i < len + 1; ++i) { + EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]"; + } + + // First string must be empty string. + EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]"; + + // Expect that the |len + 1| suffixes are strictly ordered. + const unsigned char* end = s + len; + for (int i = 0; i < len; ++i) { + const unsigned char* suf1 = s + I[i]; + const unsigned char* suf2 = s + I[i + 1]; + bool is_less = std::lexicographical_compare(suf1, end, suf2, end); + EXPECT_TRUE(is_less) << "test_case[" << idx << "]"; + } + } +} + +TEST(QSufSortTest, Search) { + // Initialize main string and the suffix array. + // Positions: 00000000001111111111122222222233333333334444 + // 01234567890123456789012345678901234567890123 + const char* old_str = "the quick brown fox jumps over the lazy dog."; + int old_size = static_cast<int>(::strlen(old_str)); + const unsigned char* old_buf = + reinterpret_cast<const unsigned char*>(old_str); + std::vector<int> I(old_size + 1); + std::vector<int> V(old_size + 1); + courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size); + + // Test queries. + const struct { + int exp_pos; // -1 means "don't care". + int exp_match_len; + const char* query_str; + } test_cases[] = { + // Entire string. + {0, 44, "the quick brown fox jumps over the lazy dog."}, + // Empty string. + {-1, 0, ""}, // Current algorithm does not enforce |pos| == 0. + // Exact and unique suffix match. + {43, 1, "."}, + {31, 13, "the lazy dog."}, + // Exact and unique non-suffix match. + {4, 5, "quick"}, + {0, 9, "the quick"}, // Unique prefix. + // Entire word match with mutiple results: take lexicographical first. + {31, 3, "the"}, // Non-unique prefix: "the l"... < "the q"... + {9, 1, " "}, // " brown"... wins. + // Partial and unique match of query prefix. + {16, 10, "fox jumps with the hosps"}, + // Partial and multiple match of query prefix: no guarantees on |pos|. + // Take lexicographical first for matching portion *only*, so same results: + {-1, 4, "the apple"}, // query < "the l"... < "the q"... + {-1, 4, "the opera"}, // "the l"... < query < "the q"... + {-1, 4, "the zebra"}, // "the l"... < "the q"... < query + // Prefix match dominates suffix match. + {26, 5, "over quick brown fox"}, + // No match. + {-1, 0, ","}, + {-1, 0, "1234"}, + {-1, 0, "THE QUICK BROWN FOX"}, + {-1, 0, "(the"}, + }; + + for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { + const auto& test_case = test_cases[idx]; + int new_size = static_cast<int>(::strlen(test_case.query_str)); + const unsigned char* new_buf = + reinterpret_cast<const unsigned char*>(test_case.query_str); + + // Perform the search. + int pos = 0; + int match_len = courgette::qsuf::search( + &I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos); + + // Check basic properties and match with expected values. + EXPECT_GE(match_len, 0) << "test_case[" << idx << "]"; + EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]"; + if (match_len > 0) { + EXPECT_GE(pos, 0) << "test_case[" << idx << "]"; + EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]"; + EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len)) + << "test_case[" << idx << "]"; + } + if (test_case.exp_pos >= 0) { + EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]"; + } + EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]"; + } +} diff --git a/tools/checklicenses/checklicenses.py b/tools/checklicenses/checklicenses.py index 3f8a81a..1261a27 100755 --- a/tools/checklicenses/checklicenses.py +++ b/tools/checklicenses/checklicenses.py @@ -129,6 +129,9 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = { 'courgette/third_party/bsdiff_create.cc': [ # http://crbug.com/98095 'UNKNOWN', ], + 'courgette/third_party/qsufsort.h': [ # http://crbug.com/98095 + 'UNKNOWN', + ], 'native_client': [ # http://crbug.com/98099 'UNKNOWN', ], |