0

[Courgette] Extract qsufsort module and add tests.

This is to prepare for a follow-up CL to optimize qsufsort(), which
reduces run time by ~40%, and "courgette -gen" run time by ~10%.

Review URL: https://codereview.chromium.org/1272453003

Cr-Commit-Position: refs/heads/master@{#342883}
This commit is contained in:
huangs
2015-08-11 13:58:52 -07:00
committed by Commit bot
parent 4e6546e538
commit c1de821fbf
7 changed files with 341 additions and 163 deletions

@ -47,6 +47,7 @@ static_library("courgette_lib") {
"third_party/bsdiff_apply.cc",
"third_party/bsdiff_create.cc",
"third_party/paged_array.h",
"third_party/qsufsort.h",
"types_elf.h",
"types_win_pe.h",
]
@ -104,6 +105,7 @@ test("courgette_unittests") {
"memory_allocator_unittest.cc",
"streams_unittest.cc",
"third_party/paged_array_unittest.cc",
"third_party/qsufsort_unittest.cc",
"typedrva_unittest.cc",
"versioning_unittest.cc",
]

@ -11,10 +11,6 @@
'adjustment_method.h',
'assembly_program.cc',
'assembly_program.h',
'third_party/bsdiff.h',
'third_party/bsdiff_apply.cc',
'third_party/bsdiff_create.cc',
'third_party/paged_array.h',
'courgette.h',
'crc.cc',
'crc.h',
@ -45,6 +41,11 @@
'simple_delta.h',
'streams.cc',
'streams.h',
'third_party/bsdiff.h',
'third_party/bsdiff_apply.cc',
'third_party/bsdiff_create.cc',
'third_party/paged_array.h',
'third_party/qsufsort.h',
'types_elf.h',
'types_win_pe.h',
'patch_generator_x86_32.h',
@ -109,7 +110,8 @@
'streams_unittest.cc',
'typedrva_unittest.cc',
'versioning_unittest.cc',
'third_party/paged_array_unittest.cc'
'third_party/paged_array_unittest.cc',
'third_party/qsufsort_unittest.cc',
],
'dependencies': [
'courgette_lib',

@ -21,3 +21,5 @@ List of changes made to original code:
- reformatted code to be closer to Google coding standards
- renamed variables
- added comments
- extracted qsufsort into qsufsort.h in 'courgette::qsuf' namespace
- added unit tests for qsufsort

@ -20,6 +20,8 @@
2010-05-26 - Use a paged array for V and I. The address space may be too
fragmented for these big arrays to be contiguous.
--Stephen Adams <sra@chromium.org>
2015-08-03 - Extract qsufsort portion to a separate file.
--Samuel Huang <huangs@chromium.org>
*/
#include "courgette/third_party/bsdiff.h"
@ -35,162 +37,10 @@
#include "courgette/crc.h"
#include "courgette/streams.h"
#include "courgette/third_party/paged_array.h"
#include "courgette/third_party/qsufsort.h"
namespace courgette {
// ------------------------------------------------------------------------
//
// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
// code formatting and variable names. The changes from the original are (1)
// replacing tabs with spaces, (2) indentation, (3) using 'const', and (4)
// changing the V and I parameters from int* to PagedArray<int>&.
//
// The code appears to be a rewritten version of the suffix array algorithm
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
// Sadakane, special cased for bytes.
static void
split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h)
{
int i,j,k,x,tmp,jj,kk;
if(len<16) {
for(k=start;k<start+len;k+=j) {
j=1;x=V[I[k]+h];
for(i=1;k+i<start+len;i++) {
if(V[I[k+i]+h]<x) {
x=V[I[k+i]+h];
j=0;
};
if(V[I[k+i]+h]==x) {
tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
j++;
};
};
for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
if(j==1) I[k]=-1;
};
return;
};
x=V[I[start+len/2]+h];
jj=0;kk=0;
for(i=start;i<start+len;i++) {
if(V[I[i]+h]<x) jj++;
if(V[I[i]+h]==x) kk++;
};
jj+=start;kk+=jj;
i=start;j=0;k=0;
while(i<jj) {
if(V[I[i]+h]<x) {
i++;
} else if(V[I[i]+h]==x) {
tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
j++;
} else {
tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
k++;
};
};
while(jj+j<kk) {
if(V[I[jj+j]+h]==x) {
j++;
} else {
tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
k++;
};
};
if(jj>start) split(I,V,start,jj-start,h);
for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
if(jj==kk-1) I[jj]=-1;
if(start+len>kk) split(I,V,kk,start+len-kk,h);
}
static void
qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int oldsize)
{
int buckets[256];
int i,h,len;
for(i=0;i<256;i++) buckets[i]=0;
for(i=0;i<oldsize;i++) buckets[old[i]]++;
for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
for(i=255;i>0;i--) buckets[i]=buckets[i-1];
buckets[0]=0;
for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
I[0]=oldsize;
for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
V[oldsize]=0;
for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
I[0]=-1;
for(h=1;I[0]!=-(oldsize+1);h+=h) {
len=0;
for(i=0;i<oldsize+1;) {
if(I[i]<0) {
len-=I[i];
i-=I[i];
} else {
if(len) I[i-len]=-len;
len=V[I[i]]+1-i;
split(I,V,i,len,h);
i+=len;
len=0;
};
};
if(len) I[i-len]=-len;
};
for(i=0;i<oldsize+1;i++) I[V[i]]=i;
}
static int
matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
{
int i;
for(i=0;(i<oldsize)&&(i<newsize);i++)
if(old[i]!=newbuf[i]) break;
return i;
}
static int
search(PagedArray<int>& I,const unsigned char *old,int oldsize,
const unsigned char *newbuf,int newsize,int st,int en,int *pos)
{
int x,y;
if(en-st<2) {
x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
if(x>y) {
*pos=I[st];
return x;
} else {
*pos=I[en];
return y;
}
}
x=st+(en-st)/2;
if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
return search(I,old,oldsize,newbuf,newsize,x,en,pos);
} else {
return search(I,old,oldsize,newbuf,newsize,st,x,pos);
}
}
// End of 'verbatim' code.
// ------------------------------------------------------------------------
static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
bool ok = stream->Write(header->tag, sizeof(header->tag));
ok &= stream->WriteVarint32(header->slen);
@ -236,7 +86,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
}
base::Time q_start_time = base::Time::Now();
qsufsort(I, V, old, oldsize);
qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
VLOG(1) << " done qsufsort "
<< (base::Time::Now() - q_start_time).InSecondsF();
V.clear();
@ -300,9 +150,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
scan += match_length;
for (int scsc = scan; scan < newsize; ++scan) {
match_length = search(I, old, oldsize,
newbuf + scan, newsize - scan,
0, oldsize, &pos);
match_length = qsuf::search<PagedArray<int>&>(
I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos);
for ( ; scsc < scan + match_length ; scsc++)
if ((scsc + lastoffset < oldsize) &&
@ -451,4 +300,4 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
return OK;
}
} // namespace
} // namespace courgette

185
courgette/third_party/qsufsort.h vendored Normal file

@ -0,0 +1,185 @@
/*
qsufsort.h -- Suffix array generation.
Copyright 2003 Colin Percival
For the terms under which this work may be distributed, please see
the adjoining file "LICENSE".
ChangeLog:
2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
values throughout.
--Benjamin Smedberg <benjamin@smedbergs.us>
2010-05-26 - Use a paged array for V and I. The address space may be too
fragmented for these big arrays to be contiguous.
--Stephen Adams <sra@chromium.org>
2015-08-03 - Extrat qsufsort to a separate file as template.
--Samuel Huang <huangs@chromium.org>
*/
#include <algorithm>
#include <cstring>
namespace courgette {
namespace qsuf {
// ------------------------------------------------------------------------
//
// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
// code formatting and variable names. The changes from the original are:
// (1) replacing tabs with spaces,
// (2) indentation,
// (3) using 'const',
// (4) changing the V and I parameters from int* to template <typename T>.
//
// The code appears to be a rewritten version of the suffix array algorithm
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
// Sadakane, special cased for bytes.
template <typename T>
static void
split(T I,T V,int start,int len,int h)
{
int i,j,k,x,tmp,jj,kk;
if(len<16) {
for(k=start;k<start+len;k+=j) {
j=1;x=V[I[k]+h];
for(i=1;k+i<start+len;i++) {
if(V[I[k+i]+h]<x) {
x=V[I[k+i]+h];
j=0;
};
if(V[I[k+i]+h]==x) {
tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
j++;
};
};
for(i=0;i<j;i++) V[I[k+i]]=k+j-1;
if(j==1) I[k]=-1;
};
return;
};
x=V[I[start+len/2]+h];
jj=0;kk=0;
for(i=start;i<start+len;i++) {
if(V[I[i]+h]<x) jj++;
if(V[I[i]+h]==x) kk++;
};
jj+=start;kk+=jj;
i=start;j=0;k=0;
while(i<jj) {
if(V[I[i]+h]<x) {
i++;
} else if(V[I[i]+h]==x) {
tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
j++;
} else {
tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
k++;
};
};
while(jj+j<kk) {
if(V[I[jj+j]+h]==x) {
j++;
} else {
tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
k++;
};
};
if(jj>start) split<T>(I,V,start,jj-start,h);
for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
if(jj==kk-1) I[jj]=-1;
if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);
}
template <class T>
static void
qsufsort(T I, T V,const unsigned char *old,int oldsize)
{
int buckets[256];
int i,h,len;
for(i=0;i<256;i++) buckets[i]=0;
for(i=0;i<oldsize;i++) buckets[old[i]]++;
for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
for(i=255;i>0;i--) buckets[i]=buckets[i-1];
buckets[0]=0;
for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
I[0]=oldsize;
for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
V[oldsize]=0;
for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
I[0]=-1;
for(h=1;I[0]!=-(oldsize+1);h+=h) {
len=0;
for(i=0;i<oldsize+1;) {
if(I[i]<0) {
len-=I[i];
i-=I[i];
} else {
if(len) I[i-len]=-len;
len=V[I[i]]+1-i;
split<T>(I,V,i,len,h);
i+=len;
len=0;
};
};
if(len) I[i-len]=-len;
};
for(i=0;i<oldsize+1;i++) I[V[i]]=i;
}
static int
matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
{
int i;
for(i=0;(i<oldsize)&&(i<newsize);i++)
if(old[i]!=newbuf[i]) break;
return i;
}
template <class T>
static int
search(T I,const unsigned char *old,int oldsize,
const unsigned char *newbuf,int newsize,int st,int en,int *pos)
{
int x,y;
if(en-st<2) {
x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);
y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
if(x>y) {
*pos=I[st];
return x;
} else {
*pos=I[en];
return y;
}
}
x=st+(en-st)/2;
if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {
return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos);
} else {
return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos);
}
}
// End of 'verbatim' code.
// ------------------------------------------------------------------------
} // namespace qsuf
} // namespace courgette

@ -0,0 +1,135 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "courgette/third_party/qsufsort.h"
#include <algorithm>
#include <cstring>
#include <string>
#include <vector>
#include "base/macros.h"
#include "base/memory/scoped_ptr.h"
#include "testing/gtest/include/gtest/gtest.h"
TEST(QSufSortTest, Sort) {
const char* test_cases[] = {
"",
"a",
"za",
"CACAO",
"banana",
"tobeornottobe",
"The quick brown fox jumps over the lazy dog.",
"elephantelephantelephantelephantelephant",
"-------------------------",
"011010011001011010010110011010010",
"3141592653589793238462643383279502884197169399375105",
"\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
};
for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
int len = static_cast<int>(::strlen(test_cases[idx]));
const unsigned char* s =
reinterpret_cast<const unsigned char*>(test_cases[idx]);
// Generate the suffix array as I.
std::vector<int> I(len + 1);
std::vector<int> V(len + 1);
courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len);
// Expect that I[] is a permutation of [0, len].
std::vector<int> I_sorted(I);
std::sort(I_sorted.begin(), I_sorted.end());
for (int i = 0; i < len + 1; ++i) {
EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]";
}
// First string must be empty string.
EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]";
// Expect that the |len + 1| suffixes are strictly ordered.
const unsigned char* end = s + len;
for (int i = 0; i < len; ++i) {
const unsigned char* suf1 = s + I[i];
const unsigned char* suf2 = s + I[i + 1];
bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
EXPECT_TRUE(is_less) << "test_case[" << idx << "]";
}
}
}
TEST(QSufSortTest, Search) {
// Initialize main string and the suffix array.
// Positions: 00000000001111111111122222222233333333334444
// 01234567890123456789012345678901234567890123
const char* old_str = "the quick brown fox jumps over the lazy dog.";
int old_size = static_cast<int>(::strlen(old_str));
const unsigned char* old_buf =
reinterpret_cast<const unsigned char*>(old_str);
std::vector<int> I(old_size + 1);
std::vector<int> V(old_size + 1);
courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size);
// Test queries.
const struct {
int exp_pos; // -1 means "don't care".
int exp_match_len;
const char* query_str;
} test_cases[] = {
// Entire string.
{0, 44, "the quick brown fox jumps over the lazy dog."},
// Empty string.
{-1, 0, ""}, // Current algorithm does not enforce |pos| == 0.
// Exact and unique suffix match.
{43, 1, "."},
{31, 13, "the lazy dog."},
// Exact and unique non-suffix match.
{4, 5, "quick"},
{0, 9, "the quick"}, // Unique prefix.
// Entire word match with mutiple results: take lexicographical first.
{31, 3, "the"}, // Non-unique prefix: "the l"... < "the q"...
{9, 1, " "}, // " brown"... wins.
// Partial and unique match of query prefix.
{16, 10, "fox jumps with the hosps"},
// Partial and multiple match of query prefix: no guarantees on |pos|.
// Take lexicographical first for matching portion *only*, so same results:
{-1, 4, "the apple"}, // query < "the l"... < "the q"...
{-1, 4, "the opera"}, // "the l"... < query < "the q"...
{-1, 4, "the zebra"}, // "the l"... < "the q"... < query
// Prefix match dominates suffix match.
{26, 5, "over quick brown fox"},
// No match.
{-1, 0, ","},
{-1, 0, "1234"},
{-1, 0, "THE QUICK BROWN FOX"},
{-1, 0, "(the"},
};
for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
const auto& test_case = test_cases[idx];
int new_size = static_cast<int>(::strlen(test_case.query_str));
const unsigned char* new_buf =
reinterpret_cast<const unsigned char*>(test_case.query_str);
// Perform the search.
int pos = 0;
int match_len = courgette::qsuf::search(
&I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos);
// Check basic properties and match with expected values.
EXPECT_GE(match_len, 0) << "test_case[" << idx << "]";
EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]";
if (match_len > 0) {
EXPECT_GE(pos, 0) << "test_case[" << idx << "]";
EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]";
EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len))
<< "test_case[" << idx << "]";
}
if (test_case.exp_pos >= 0) {
EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]";
}
EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]";
}
}

@ -129,6 +129,9 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {
'courgette/third_party/bsdiff_create.cc': [ # http://crbug.com/98095
'UNKNOWN',
],
'courgette/third_party/qsufsort.h': [ # http://crbug.com/98095
'UNKNOWN',
],
'native_client': [ # http://crbug.com/98099
'UNKNOWN',
],