[Courgette] Extract qsufsort module and add tests.

This is to prepare for a follow-up CL to optimize qsufsort(), which
reduces run time by ~40%, and "courgette -gen" run time by ~10%.

Review URL: https://codereview.chromium.org/1272453003

Cr-Commit-Position: refs/heads/master@{#342883}

This commit is contained in:

huangs

2015-08-11 13:58:52 -07:00

committed by

Commit bot

parent 4e6546e538

commit c1de821fbf

7 changed files with 341 additions and 163 deletions

courgette

BUILD.gn courgette.gyp

third_party

README.chromium bsdiff_create.cc qsufsort.h qsufsort_unittest.cc

tools/checklicenses

checklicenses.py

2

courgette/BUILD.gn

 @ -47,6 +47,7 @@ static_library("courgette_lib") {
     "third_party/bsdiff_apply.cc",
     "third_party/bsdiff_create.cc",
     "third_party/paged_array.h",
     "third_party/qsufsort.h",
     "types_elf.h",
     "types_win_pe.h",
   ]
 @ -104,6 +105,7 @@ test("courgette_unittests") {
     "memory_allocator_unittest.cc",
     "streams_unittest.cc",
     "third_party/paged_array_unittest.cc",
     "third_party/qsufsort_unittest.cc",
     "typedrva_unittest.cc",
     "versioning_unittest.cc",
   ]

12

courgette/courgette.gyp

 @ -11,10 +11,6 @@
       'adjustment_method.h',
       'assembly_program.cc',
       'assembly_program.h',
       'third_party/bsdiff.h',
       'third_party/bsdiff_apply.cc',
       'third_party/bsdiff_create.cc',
       'third_party/paged_array.h',
       'courgette.h',
       'crc.cc',
       'crc.h',
 @ -45,6 +41,11 @@
       'simple_delta.h',
       'streams.cc',
       'streams.h',
       'third_party/bsdiff.h',
       'third_party/bsdiff_apply.cc',
       'third_party/bsdiff_create.cc',
       'third_party/paged_array.h',
       'third_party/qsufsort.h',
       'types_elf.h',
       'types_win_pe.h',
       'patch_generator_x86_32.h',
 @ -109,7 +110,8 @@
         'streams_unittest.cc',
         'typedrva_unittest.cc',
         'versioning_unittest.cc',
         'third_party/paged_array_unittest.cc'
         'third_party/paged_array_unittest.cc',
         'third_party/qsufsort_unittest.cc',
       ],
       'dependencies': [
         'courgette_lib',

2

courgette/third_party/README.chromium vendored

 @ -21,3 +21,5 @@ List of changes made to original code:
   - reformatted code to be closer to Google coding standards
   - renamed variables
   - added comments
   - extracted qsufsort into qsufsort.h in 'courgette::qsuf' namespace
   - added unit tests for qsufsort

									
										165

courgette/third_party/bsdiff_create.cc
									
										vendored
									
				@ -20,6 +20,8 @@

				  2010-05-26 - Use a paged array for V and I. The address space may be too

				               fragmented for these big arrays to be contiguous.

				                 --Stephen Adams <sra@chromium.org>

				  2015-08-03 - Extract qsufsort portion to a separate file.

				                 --Samuel Huang <huangs@chromium.org>

				*/

				#include "courgette/third_party/bsdiff.h"

				@ -35,162 +37,10 @@

				#include "courgette/crc.h"

				#include "courgette/streams.h"

				#include "courgette/third_party/paged_array.h"

				#include "courgette/third_party/qsufsort.h"

				namespace courgette {

				// ------------------------------------------------------------------------

				//

				// The following code is taken verbatim from 'bsdiff.c'. Please keep all the

				// code formatting and variable names.  The changes from the original are (1)

				// replacing tabs with spaces, (2) indentation, (3) using 'const', and (4)

				// changing the V and I parameters from int* to PagedArray<int>&.

				//

				// The code appears to be a rewritten version of the suffix array algorithm

				// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko

				// Sadakane, special cased for bytes.

				static void

				split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h)

				{

				  int i,j,k,x,tmp,jj,kk;

				  if(len<16) {

				    for(k=start;k<start+len;k+=j) {

				      j=1;x=V[I[k]+h];

				      for(i=1;k+i<start+len;i++) {

				        if(V[I[k+i]+h]<x) {

				          x=V[I[k+i]+h];

				          j=0;

				        };

				        if(V[I[k+i]+h]==x) {

				          tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;

				          j++;

				        };

				      };

				      for(i=0;i<j;i++) V[I[k+i]]=k+j-1;

				      if(j==1) I[k]=-1;

				    };

				    return;

				  };

				  x=V[I[start+len/2]+h];

				  jj=0;kk=0;

				  for(i=start;i<start+len;i++) {

				    if(V[I[i]+h]<x) jj++;

				    if(V[I[i]+h]==x) kk++;

				  };

				  jj+=start;kk+=jj;

				  i=start;j=0;k=0;

				  while(i<jj) {

				    if(V[I[i]+h]<x) {

				      i++;

				    } else if(V[I[i]+h]==x) {

				      tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;

				      j++;

				    } else {

				      tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;

				      k++;

				    };

				  };

				  while(jj+j<kk) {

				    if(V[I[jj+j]+h]==x) {

				      j++;

				    } else {

				      tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;

				      k++;

				    };

				  };

				  if(jj>start) split(I,V,start,jj-start,h);

				  for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;

				  if(jj==kk-1) I[jj]=-1;

				  if(start+len>kk) split(I,V,kk,start+len-kk,h);

				}

				static void

				qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int oldsize)

				{

				  int buckets[256];

				  int i,h,len;

				  for(i=0;i<256;i++) buckets[i]=0;

				  for(i=0;i<oldsize;i++) buckets[old[i]]++;

				  for(i=1;i<256;i++) buckets[i]+=buckets[i-1];

				  for(i=255;i>0;i--) buckets[i]=buckets[i-1];

				  buckets[0]=0;

				  for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;

				  I[0]=oldsize;

				  for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];

				  V[oldsize]=0;

				  for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;

				  I[0]=-1;

				  for(h=1;I[0]!=-(oldsize+1);h+=h) {

				    len=0;

				    for(i=0;i<oldsize+1;) {

				      if(I[i]<0) {

				        len-=I[i];

				        i-=I[i];

				      } else {

				        if(len) I[i-len]=-len;

				        len=V[I[i]]+1-i;

				        split(I,V,i,len,h);

				        i+=len;

				        len=0;

				      };

				    };

				    if(len) I[i-len]=-len;

				  };

				  for(i=0;i<oldsize+1;i++) I[V[i]]=i;

				}

				static int

				matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)

				{

				  int i;

				  for(i=0;(i<oldsize)&&(i<newsize);i++)

				    if(old[i]!=newbuf[i]) break;

				  return i;

				}

				static int

				search(PagedArray<int>& I,const unsigned char *old,int oldsize,

				       const unsigned char *newbuf,int newsize,int st,int en,int *pos)

				{

				  int x,y;

				  if(en-st<2) {

				    x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);

				    y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);

				    if(x>y) {

				      *pos=I[st];

				      return x;

				    } else {

				      *pos=I[en];

				      return y;

				    }

				  }

				  x=st+(en-st)/2;

				  if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {

				    return search(I,old,oldsize,newbuf,newsize,x,en,pos);

				  } else {

				    return search(I,old,oldsize,newbuf,newsize,st,x,pos);

				  }

				}

				//  End of 'verbatim' code.

				// ------------------------------------------------------------------------

				static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {

				  bool ok = stream->Write(header->tag, sizeof(header->tag));

				  ok &= stream->WriteVarint32(header->slen);

				@ -236,7 +86,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,

				  }

				  base::Time q_start_time = base::Time::Now();

				  qsufsort(I, V, old, oldsize);

				  qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);

				  VLOG(1) << " done qsufsort "

				          << (base::Time::Now() - q_start_time).InSecondsF();

				  V.clear();

				@ -300,9 +150,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,

				    scan += match_length;

				    for (int scsc = scan;  scan < newsize;  ++scan) {

				      match_length = search(I, old, oldsize,

				                            newbuf + scan, newsize - scan,

				                            0, oldsize, &pos);

				      match_length = qsuf::search<PagedArray<int>&>(

				          I, old, oldsize, newbuf + scan, newsize - scan, 0, oldsize, &pos);

				      for ( ; scsc < scan + match_length ; scsc++)

				        if ((scsc + lastoffset < oldsize) &&

				@ -451,4 +300,4 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,

				  return OK;

				}

				}  // namespace

				}  // namespace courgette

									
										185

courgette/third_party/qsufsort.h
									
										vendored
									
										Normal file
									
				@ -0,0 +1,185 @@

				/*

				  qsufsort.h -- Suffix array generation.

				  Copyright 2003 Colin Percival

				  For the terms under which this work may be distributed, please see

				  the adjoining file "LICENSE".

				  ChangeLog:

				  2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit

				               values throughout.

				                 --Benjamin Smedberg <benjamin@smedbergs.us>

				  2010-05-26 - Use a paged array for V and I. The address space may be too

				               fragmented for these big arrays to be contiguous.

				                 --Stephen Adams <sra@chromium.org>

				  2015-08-03 - Extrat qsufsort to a separate file as template.

				                 --Samuel Huang <huangs@chromium.org>

				*/

				#include <algorithm>

				#include <cstring>

				namespace courgette {

				namespace qsuf {

				// ------------------------------------------------------------------------

				//

				// The following code is taken verbatim from 'bsdiff.c'. Please keep all the

				// code formatting and variable names.  The changes from the original are:

				// (1) replacing tabs with spaces,

				// (2) indentation,

				// (3) using 'const',

				// (4) changing the V and I parameters from int* to template <typename T>.

				//

				// The code appears to be a rewritten version of the suffix array algorithm

				// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko

				// Sadakane, special cased for bytes.

				template <typename T>

				static void

				split(T I,T V,int start,int len,int h)

				{

				  int i,j,k,x,tmp,jj,kk;

				  if(len<16) {

				    for(k=start;k<start+len;k+=j) {

				      j=1;x=V[I[k]+h];

				      for(i=1;k+i<start+len;i++) {

				        if(V[I[k+i]+h]<x) {

				          x=V[I[k+i]+h];

				          j=0;

				        };

				        if(V[I[k+i]+h]==x) {

				          tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;

				          j++;

				        };

				      };

				      for(i=0;i<j;i++) V[I[k+i]]=k+j-1;

				      if(j==1) I[k]=-1;

				    };

				    return;

				  };

				  x=V[I[start+len/2]+h];

				  jj=0;kk=0;

				  for(i=start;i<start+len;i++) {

				    if(V[I[i]+h]<x) jj++;

				    if(V[I[i]+h]==x) kk++;

				  };

				  jj+=start;kk+=jj;

				  i=start;j=0;k=0;

				  while(i<jj) {

				    if(V[I[i]+h]<x) {

				      i++;

				    } else if(V[I[i]+h]==x) {

				      tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;

				      j++;

				    } else {

				      tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;

				      k++;

				    };

				  };

				  while(jj+j<kk) {

				    if(V[I[jj+j]+h]==x) {

				      j++;

				    } else {

				      tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;

				      k++;

				    };

				  };

				  if(jj>start) split<T>(I,V,start,jj-start,h);

				  for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;

				  if(jj==kk-1) I[jj]=-1;

				  if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);

				}

				template <class T>

				static void

				qsufsort(T I, T V,const unsigned char *old,int oldsize)

				{

				  int buckets[256];

				  int i,h,len;

				  for(i=0;i<256;i++) buckets[i]=0;

				  for(i=0;i<oldsize;i++) buckets[old[i]]++;

				  for(i=1;i<256;i++) buckets[i]+=buckets[i-1];

				  for(i=255;i>0;i--) buckets[i]=buckets[i-1];

				  buckets[0]=0;

				  for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;

				  I[0]=oldsize;

				  for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];

				  V[oldsize]=0;

				  for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;

				  I[0]=-1;

				  for(h=1;I[0]!=-(oldsize+1);h+=h) {

				    len=0;

				    for(i=0;i<oldsize+1;) {

				      if(I[i]<0) {

				        len-=I[i];

				        i-=I[i];

				      } else {

				        if(len) I[i-len]=-len;

				        len=V[I[i]]+1-i;

				        split<T>(I,V,i,len,h);

				        i+=len;

				        len=0;

				      };

				    };

				    if(len) I[i-len]=-len;

				  };

				  for(i=0;i<oldsize+1;i++) I[V[i]]=i;

				}

				static int

				matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)

				{

				  int i;

				  for(i=0;(i<oldsize)&&(i<newsize);i++)

				    if(old[i]!=newbuf[i]) break;

				  return i;

				}

				template <class T>

				static int

				search(T I,const unsigned char *old,int oldsize,

				       const unsigned char *newbuf,int newsize,int st,int en,int *pos)

				{

				  int x,y;

				  if(en-st<2) {

				    x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);

				    y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);

				    if(x>y) {

				      *pos=I[st];

				      return x;

				    } else {

				      *pos=I[en];

				      return y;

				    }

				  }

				  x=st+(en-st)/2;

				  if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {

				    return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos);

				  } else {

				    return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos);

				  }

				}

				//  End of 'verbatim' code.

				// ------------------------------------------------------------------------

				}  // namespace qsuf

				}  // namespace courgette

									
										135

courgette/third_party/qsufsort_unittest.cc
									
										vendored
									
										Normal file
									
				@ -0,0 +1,135 @@

				// Copyright 2015 The Chromium Authors. All rights reserved.

				// Use of this source code is governed by a BSD-style license that can be

				// found in the LICENSE file.

				#include "courgette/third_party/qsufsort.h"

				#include <algorithm>

				#include <cstring>

				#include <string>

				#include <vector>

				#include "base/macros.h"

				#include "base/memory/scoped_ptr.h"

				#include "testing/gtest/include/gtest/gtest.h"

				TEST(QSufSortTest, Sort) {

				  const char* test_cases[] = {

				    "",

				    "a",

				    "za",

				    "CACAO",

				    "banana",

				    "tobeornottobe",

				    "The quick brown fox jumps over the lazy dog.",

				    "elephantelephantelephantelephantelephant",

				    "-------------------------",

				    "011010011001011010010110011010010",

				    "3141592653589793238462643383279502884197169399375105",

				    "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",

				  };

				  for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {

				    int len = static_cast<int>(::strlen(test_cases[idx]));

				    const unsigned char* s =

				        reinterpret_cast<const unsigned char*>(test_cases[idx]);

				    // Generate the suffix array as I.

				    std::vector<int> I(len + 1);

				    std::vector<int> V(len + 1);

				    courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len);

				    // Expect that I[] is a permutation of [0, len].

				    std::vector<int> I_sorted(I);

				    std::sort(I_sorted.begin(), I_sorted.end());

				    for (int i = 0; i < len + 1; ++i) {

				      EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]";

				    }

				    // First string must be empty string.

				    EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]";

				    // Expect that the |len + 1| suffixes are strictly ordered.

				    const unsigned char* end = s + len;

				    for (int i = 0; i < len; ++i) {

				      const unsigned char* suf1 = s + I[i];

				      const unsigned char* suf2 = s + I[i + 1];

				      bool is_less = std::lexicographical_compare(suf1, end, suf2, end);

				      EXPECT_TRUE(is_less) << "test_case[" << idx << "]";

				    }

				  }

				}

				TEST(QSufSortTest, Search) {

				  // Initialize main string and the suffix array.

				  // Positions:          00000000001111111111122222222233333333334444

				  //                     01234567890123456789012345678901234567890123

				  const char* old_str = "the quick brown fox jumps over the lazy dog.";

				  int old_size = static_cast<int>(::strlen(old_str));

				  const unsigned char* old_buf =

				        reinterpret_cast<const unsigned char*>(old_str);

				  std::vector<int> I(old_size + 1);

				  std::vector<int> V(old_size + 1);

				  courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size);

				  // Test queries.

				  const struct {

				    int exp_pos;  // -1 means "don't care".

				    int exp_match_len;

				    const char* query_str;

				  } test_cases[] = {

				    // Entire string.

				    {0, 44, "the quick brown fox jumps over the lazy dog."},

				    // Empty string.

				    {-1, 0, ""},  // Current algorithm does not enforce |pos| == 0.

				    // Exact and unique suffix match.

				    {43, 1, "."},

				    {31, 13, "the lazy dog."},

				    // Exact and unique non-suffix match.

				    {4, 5, "quick"},

				    {0, 9, "the quick"},  // Unique prefix.

				    // Entire word match with mutiple results: take lexicographical first.

				    {31, 3, "the"},  // Non-unique prefix: "the l"... < "the q"...

				    {9, 1, " "},  // " brown"... wins.

				    // Partial and unique match of query prefix.

				    {16, 10, "fox jumps with the hosps"},

				    // Partial and multiple match of query prefix: no guarantees on |pos|.

				    // Take lexicographical first for matching portion *only*, so same results:

				    {-1, 4, "the apple"},  // query      < "the l"... < "the q"...

				    {-1, 4, "the opera"},  // "the l"... < query      < "the q"...

				    {-1, 4, "the zebra"},  // "the l"... < "the q"... < query

				    // Prefix match dominates suffix match.

				    {26, 5, "over quick brown fox"},

				    // No match.

				    {-1, 0, ","},

				    {-1, 0, "1234"},

				    {-1, 0, "THE QUICK BROWN FOX"},

				    {-1, 0, "(the"},

				  };

				  for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {

				    const auto& test_case = test_cases[idx];

				    int new_size = static_cast<int>(::strlen(test_case.query_str));

				    const unsigned char* new_buf =

				        reinterpret_cast<const unsigned char*>(test_case.query_str);

				    // Perform the search.

				    int pos = 0;

				    int match_len = courgette::qsuf::search(

				        &I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos);

				    // Check basic properties and match with expected values.

				    EXPECT_GE(match_len, 0) << "test_case[" << idx << "]";

				    EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]";

				    if (match_len > 0) {

				      EXPECT_GE(pos, 0) << "test_case[" << idx << "]";

				      EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]";

				      EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len))

				          << "test_case[" << idx << "]";

				    }

				    if (test_case.exp_pos >= 0) {

				      EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]";

				    }

				    EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]";

				  }

				}

									
										3

tools/checklicenses/checklicenses.py
									
				@ -129,6 +129,9 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = {

				    'courgette/third_party/bsdiff_create.cc': [  # http://crbug.com/98095

				        'UNKNOWN',

				    ],

				    'courgette/third_party/qsufsort.h': [  # http://crbug.com/98095

				        'UNKNOWN',

				    ],

				    'native_client': [  # http://crbug.com/98099

				        'UNKNOWN',

				    ],

[Courgette] Extract qsufsort module and add tests.

2 courgette/BUILD.gn

12 courgette/courgette.gyp

2 courgette/third_party/README.chromium vendored

165 courgette/third_party/bsdiff_create.cc vendored

185 courgette/third_party/qsufsort.h vendored Normal file

135 courgette/third_party/qsufsort_unittest.cc vendored Normal file

3 tools/checklicenses/checklicenses.py

2

courgette/BUILD.gn

12

courgette/courgette.gyp

2

courgette/third_party/README.chromium vendored

165

courgette/third_party/bsdiff_create.cc vendored

185

courgette/third_party/qsufsort.h vendored Normal file

135

courgette/third_party/qsufsort_unittest.cc vendored Normal file

3

tools/checklicenses/checklicenses.py