0

[Courgette] Replace QSufSort with libdivsufsort.

Last step of the effort to get Courgette to use libdivsufsort.

BUG=608885

Review-Url: https://codereview.chromium.org/2187953003
Cr-Commit-Position: refs/heads/master@{#409326}
This commit is contained in:
huangs
2016-08-02 14:28:12 -07:00
committed by Commit bot
parent bf7ab24213
commit 4efc71b655
8 changed files with 27 additions and 332 deletions

@ -62,7 +62,6 @@ static_library("courgette_lib") {
"third_party/bsdiff/bsdiff_create.cc",
"third_party/bsdiff/bsdiff_search.h",
"third_party/bsdiff/paged_array.h",
"third_party/bsdiff/qsufsort.h",
"third_party/divsufsort/divsufsort.cc",
"third_party/divsufsort/divsufsort.h",
"third_party/divsufsort/divsufsort_private.h",
@ -178,7 +177,6 @@ test("courgette_unittests") {
"streams_unittest.cc",
"third_party/bsdiff/bsdiff_search_unittest.cc",
"third_party/bsdiff/paged_array_unittest.cc",
"third_party/bsdiff/qsufsort_unittest.cc",
"third_party/divsufsort/divsufsort_unittest.cc",
"typedrva_unittest.cc",
"versioning_unittest.cc",

@ -62,7 +62,6 @@
'third_party/bsdiff/bsdiff_create.cc',
'third_party/bsdiff/bsdiff_search.h',
'third_party/bsdiff/paged_array.h',
'third_party/bsdiff/qsufsort.h',
'third_party/divsufsort/divsufsort.cc',
'third_party/divsufsort/divsufsort.h',
'third_party/divsufsort/divsufsort_private.h',
@ -137,7 +136,6 @@
'versioning_unittest.cc',
'third_party/bsdiff/bsdiff_search_unittest.cc',
'third_party/bsdiff/paged_array_unittest.cc',
'third_party/bsdiff/qsufsort_unittest.cc',
'third_party/divsufsort/divsufsort_unittest.cc',
],
'dependencies': [

@ -22,12 +22,10 @@ List of changes made to original code:
- Wrapped functions in 'bsdiff' namespace.
- Renamed .c files to .cc files.
- Added bsdiff.h and bsdiff_search.h header files.
- Changed the code to use streams.h from courgette.
- Changed the code to use streams.h from Courgette.
- Changed the encoding of numbers to use the 'varint' encoding.
- Reformatted code to be closer to Google coding standards.
- Renamed variables.
- Added comments.
- Extracted qsufsort into qsufsort.h in 'qsuf' namespace.
- Added unit tests for qsufsort.
- Fixed qsufsort pivoting issue: http://crbug.com/605565.
- Fixed search() comparison issue: http://crbug.com/620867.
- Replaced QSufSort with modified version of libdivsufsort.

@ -25,7 +25,7 @@
// For the terms under which this work may be distributed, please see
// the adjoining file "LICENSE".
//
// Changelog:
// ChangeLog:
// 2009-03-31 - Change to use Streams. Move CRC code to crc.{h,cc}
// --Stephen Adams <sra@chromium.org>
// 2013-04-10 - Add wrapper method to apply a patch to files directly.

@ -38,11 +38,13 @@
// --Stephen Adams <sra@chromium.org>
// 2010-05-26 - Use a paged array for V and I. The address space may be too
// fragmented for these big arrays to be contiguous.
// --Stephen Adams <sra@chromium.org>
// --Stephen Adams <sra@chromium.org>
// 2015-08-03 - Extract qsufsort portion to a separate file.
// --Samuel Huang <huangs@chromium.org>
// 2015-08-12 - Interface change to search().
// --Samuel Huang <huangs@chromium.org>
// 2016-07-29 - Replacing qsufsort with divsufsort.
// --Samuel Huang <huangs@chromium.org>
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
@ -53,6 +55,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include "base/logging.h"
@ -63,7 +66,7 @@
#include "courgette/streams.h"
#include "courgette/third_party/bsdiff/bsdiff_search.h"
#include "courgette/third_party/bsdiff/paged_array.h"
#include "courgette/third_party/bsdiff/qsufsort.h"
#include "courgette/third_party/divsufsort/divsufsort.h"
namespace {
@ -106,8 +109,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
uint32_t pending_diff_zeros = 0;
PagedArray<int> I;
PagedArray<int> V;
PagedArray<divsuf::saidx_t> I;
if (!I.Allocate(oldsize + 1)) {
LOG(ERROR) << "Could not allocate I[], " << ((oldsize + 1) * sizeof(int))
@ -115,17 +117,13 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
return MEM_ERROR;
}
if (!V.Allocate(oldsize + 1)) {
LOG(ERROR) << "Could not allocate V[], " << ((oldsize + 1) * sizeof(int))
<< " bytes";
return MEM_ERROR;
}
base::Time q_start_time = base::Time::Now();
qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);
VLOG(1) << " done qsufsort "
divsuf::saint_t result = divsuf::divsufsort_include_empty(
old, I.begin(), oldsize);
VLOG(1) << " done divsufsort "
<< (base::Time::Now() - q_start_time).InSecondsF();
V.clear();
if (result != 0)
return UNEXPECTED_ERROR;
const uint8_t* newbuf = new_stream->Buffer();
const int newsize = static_cast<int>(new_stream->Remaining());
@ -185,7 +183,7 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
scan += match.size;
for (int scsc = scan; scan < newsize; ++scan) {
match = search<PagedArray<int>&>(
match = search<PagedArray<divsuf::saidx_t>&>(
I, old, oldsize, newbuf + scan, newsize - scan);
for (; scsc < scan + match.size; scsc++)

@ -5,10 +5,10 @@
#include "courgette/third_party/bsdiff/bsdiff_search.h"
#include <cstring>
#include <vector>
#include "base/macros.h"
#include "courgette/third_party/bsdiff/qsufsort.h"
#include "courgette/third_party/bsdiff/paged_array.h"
#include "courgette/third_party/divsufsort/divsufsort.h"
#include "testing/gtest/include/gtest/gtest.h"
TEST(BSDiffSearchTest, Search) {
@ -18,9 +18,9 @@ TEST(BSDiffSearchTest, Search) {
const char* str = "the quick brown fox jumps over the lazy dog.";
int size = static_cast<int>(::strlen(str));
const unsigned char* buf = reinterpret_cast<const unsigned char*>(str);
std::vector<int> I(size + 1);
std::vector<int> V(size + 1);
qsuf::qsufsort<int*>(&I[0], &V[0], buf, size);
courgette::PagedArray<divsuf::saidx_t> I;
ASSERT_TRUE(I.Allocate(size + 1));
divsuf::divsufsort_include_empty(buf, I.begin(), size);
// Specific queries.
const struct {
@ -65,7 +65,8 @@ TEST(BSDiffSearchTest, Search) {
// Perform the search.
bsdiff::SearchResult match =
bsdiff::search(&I[0], buf, size, query_buf, query_size);
bsdiff::search<courgette::PagedArray<divsuf::saidx_t>&>(
I, buf, size, query_buf, query_size);
// Check basic properties and match with expected values.
EXPECT_GE(match.size, 0);
@ -100,9 +101,9 @@ TEST(BSDiffSearchTest, SearchExact) {
int size = static_cast<int>(::strlen(test_cases[idx]));
const unsigned char* buf =
reinterpret_cast<const unsigned char*>(test_cases[idx]);
std::vector<int> I(size + 1);
std::vector<int> V(size + 1);
qsuf::qsufsort<int*>(&I[0], &V[0], buf, size);
courgette::PagedArray<divsuf::saidx_t> I;
ASSERT_TRUE(I.Allocate(size + 1));
divsuf::divsufsort_include_empty(buf, I.begin(), size);
// Test exact matches for every non-empty substring.
for (int lo = 0; lo < size; ++lo) {
@ -113,7 +114,8 @@ TEST(BSDiffSearchTest, SearchExact) {
const unsigned char* query_buf =
reinterpret_cast<const unsigned char*>(query.c_str());
bsdiff::SearchResult match =
bsdiff::search(&I[0], buf, size, query_buf, query_size);
bsdiff::search<courgette::PagedArray<divsuf::saidx_t>&>(
I, buf, size, query_buf, query_size);
EXPECT_EQ(query_size, match.size);
EXPECT_GE(match.pos, 0);

@ -1,225 +0,0 @@
// Copyright 2003, 2004 Colin Percival
// All rights reserved
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted providing that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
// IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// For the terms under which this work may be distributed, please see
// the adjoining file "LICENSE".
//
// ChangeLog:
// 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
// values throughout.
// --Benjamin Smedberg <benjamin@smedbergs.us>
// 2010-05-26 - Use a paged array for V and I. The address space may be too
// fragmented for these big arrays to be contiguous.
// --Stephen Adams <sra@chromium.org>
// 2015-08-03 - Extract QSufSort to a separate file as template.
// --Samuel Huang <huangs@chromium.org>
// 2015-08-19 - Optimize split(), add comments.
// --Samuel Huang <huangs@chromium.org>
// 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection
// algorithm, which QSufSort originally used. Reference:
// http://www.larsson.dogma.net/qsufsort.c
// --Samuel Huang <huangs@chromium.org>
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_
#define COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_
namespace qsuf {

// ------------------------------------------------------------------------
//
// The following code is taken verbatim from 'bsdiff.c'. Please keep all the
// code formatting and variable names. The changes from the original are:
// (1) replacing tabs with spaces,
// (2) indentation and spacing,
// (3) using 'const',
// (4) changing the V and I parameters from int* to template <typename T>.
// (5) optimizing split(); fix styles.
// (6) moving matchlen() and search() to a separate file.
// (7) keeping helpers out of unnamed namespaces and not marking templates
//     'static': this is a header, and an external-linkage template (split())
//     that calls an internal-linkage helper has a different definition in
//     every translation unit that includes it, which violates the ODR.
//
// The code appears to be a rewritten version of the suffix array algorithm
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
// Sadakane, special cased for bytes.

// Returns the median of |a|, |b| and |c|. Only needs operator< and operator>.
// Implementation detail of split(); not meant to be called from outside.
template <typename T>
T median3(const T& a, const T& b, const T& c) {
  if (a < b)
    return b < c ? b : (a < c ? c : a);
  return b > c ? b : (a > c ? c : a);
}

// Refines the order of the entries I[start, end) using V[I[x] + h] as the
// sort key of entry x. |T| is any type indexable with operator[] that yields
// assignable ints (callers use int* and PagedArray<int>&).
//
// On return the entries are arranged by ascending key. Entries that share a
// key form a group, and V[] of every member is set to the index of the last
// slot of its group. A group of size one is fully sorted: its V[] is set to
// its own index and its slot in I[] is overwritten with -1.
template <typename T>
void split(T I, T V, int start, int end, int h) {
  // For small interval, apply selection sort.
  if (end - start < 16) {
    for (int i = start; i < end;) {
      // |skip| counts how many entries share the smallest key found so far;
      // they are gathered in I[i, i + skip).
      int skip = 1;
      int best = V[I[i] + h];
      for (int j = i + 1; j < end; j++) {
        int cur = V[I[j] + h];
        if (best > cur) {
          // New strict minimum: move it to the front and restart the tie run.
          best = cur;
          int tmp = I[i];
          I[i] = I[j];
          I[j] = tmp;
          skip = 1;
        } else if (best == cur) {
          // Tie with the current minimum: append it to the tie run.
          int tmp = I[i + skip];
          I[i + skip] = I[j];
          I[j] = tmp;
          ++skip;
        }
      }
      if (skip == 1) {
        // Singleton group: record its final position and mark it as sorted.
        V[I[i]] = i;
        I[i] = -1;
      } else {
        // Every member of the group is labelled with the group's last index.
        for (int j = i, jend = i + skip; j < jend; j++)
          V[I[j]] = jend - 1;
      }
      i += skip;
    }
    return;
  }

  // Select pivot, algorithm by Bentley & McIlroy.
  int n = end - start;
  int mid = start + (n >> 1);
  int pivot = V[I[mid] + h];
  int p1 = V[I[start] + h];
  int p2 = V[I[end - 1] + h];
  if (n > 40) {  // Big array: Pseudomedian of 9.
    int s = n >> 3;
    pivot = median3(pivot, V[I[mid - s] + h], V[I[mid + s] + h]);
    p1 = median3(p1, V[I[start + s] + h], V[I[start + s + s] + h]);
    p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]);
  }  // Else medium array: Pseudomedian of 3.
  pivot = median3(pivot, p1, p2);

  // Split [start, end) into 3 intervals:
  //   [start, j) with secondary keys < pivot,
  //   [j, k) with secondary keys == pivot,
  //   [k, end) with secondary keys > pivot.
  int j = start;
  int k = end;
  for (int i = start; i < k;) {
    int cur = V[I[i] + h];
    if (cur < pivot) {
      if (i != j) {
        int tmp = I[i];
        I[i] = I[j];
        I[j] = tmp;
      }
      ++i;
      ++j;
    } else if (cur > pivot) {
      // The entry swapped in from slot k has not been examined yet, so |i|
      // is deliberately not advanced here.
      --k;
      int tmp = I[i];
      I[i] = I[k];
      I[k] = tmp;
    } else {
      ++i;
    }
  }

  // Recurse on the "< pivot" piece.
  if (start < j)
    split<T>(I, V, start, j, h);

  // Update the "== pivot" piece.
  if (j == k - 1) {
    V[I[j]] = j;
    I[j] = -1;
  } else {
    for (int i = j; i < k; ++i)
      V[I[i]] = k - 1;
  }

  // Recurse on the "> pivot" piece.
  if (k < end)
    split<T>(I, V, k, end, h);
}

// Computes the suffix array of |old|[0, |oldsize|), including the empty
// suffix. |I| and |V| must each provide |oldsize| + 1 int slots. On return
// I[] lists suffix start offsets in ascending lexicographical order (so
// I[0] == |oldsize|, the empty suffix) and V[] is the inverse permutation
// of I[], i.e. V[I[i]] == i.
template <class T>
void qsufsort(T I, T V, const unsigned char* old, int oldsize) {
  int buckets[256] = {};
  int i;

  // Bucket sort on the first byte. After the shifting below, buckets[c] is
  // the number of bytes smaller than c; slot 0 of I[] is reserved for the
  // empty suffix, hence the pre-increment when placing suffixes.
  for (i = 0; i < oldsize; i++)
    buckets[old[i]]++;
  for (i = 1; i < 256; i++)
    buckets[i] += buckets[i - 1];
  for (i = 255; i > 0; i--)
    buckets[i] = buckets[i - 1];
  buckets[0] = 0;

  for (i = 0; i < oldsize; i++)
    I[++buckets[old[i]]] = i;
  I[0] = oldsize;
  // buckets[c] now holds the last index of bucket c, which is the initial
  // group label of every suffix starting with c.
  for (i = 0; i < oldsize; i++)
    V[i] = buckets[old[i]];
  V[oldsize] = 0;
  // Buckets holding a single suffix are already sorted; mark them with -1.
  for (i = 1; i < 256; i++)
    if (buckets[i] == buckets[i - 1] + 1)
      I[buckets[i]] = -1;
  I[0] = -1;

  // Each pass doubles the compared prefix length |h|. A negative I[i] encodes
  // the length of a run of already sorted slots starting at i; the sort is
  // complete once a single run covers all |oldsize| + 1 slots.
  for (int h = 1; I[0] != -(oldsize + 1); h += h) {
    int len = 0;
    for (i = 0; i < oldsize + 1;) {
      if (I[i] < 0) {
        // Skip a sorted run, accumulating its length for merging.
        len -= I[i];
        i -= I[i];
      } else {
        // Flush the merged length of the sorted runs that preceded this
        // unsorted group, then refine the group.
        if (len)
          I[i - len] = -len;
        len = V[I[i]] + 1 - i;
        split<T>(I, V, i, i + len, h);
        i += len;
        len = 0;
      }
    }
    if (len)
      I[i - len] = -len;
  }

  // I[] was consumed as scratch space above; rebuild it as the inverse of V[].
  for (i = 0; i < oldsize + 1; i++)
    I[V[i]] = i;
}

// End of 'verbatim' code.
// ------------------------------------------------------------------------

}  // namespace qsuf
#endif // COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_

@ -1,74 +0,0 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "courgette/third_party/bsdiff/qsufsort.h"
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>
#include "base/macros.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace {
// Returns true iff |v| contains every integer in [0, v.size()) exactly once.
// |v| is taken by reference; the only copy made is the one that gets sorted.
bool IsPermutation(const std::vector<int>& v) {
  std::vector<int> v_sorted(v);
  std::sort(v_sorted.begin(), v_sorted.end());
  // A sorted permutation of [0, n) is exactly 0, 1, ..., n - 1.
  for (size_t i = 0; i < v_sorted.size(); ++i) {
    if (v_sorted[i] != static_cast<int>(i))
      return false;
  }
  return true;
}
} // namespace
// Runs qsufsort() over a set of sample strings and checks that the result is
// a valid suffix array (with the empty suffix first) together with its
// inverse permutation.
TEST(QSufSortTest, Sort) {
  const char* test_cases[] = {
      "",
      "a",
      "za",
      "CACAO",
      "banana",
      "tobeornottobe",
      "The quick brown fox jumps over the lazy dog.",
      "elephantelephantelephantelephantelephant",
      "-------------------------",
      "011010011001011010010110011010010",
      "3141592653589793238462643383279502884197169399375105",
      "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
  };

  for (const char* text : test_cases) {
    const int size = static_cast<int>(::strlen(text));
    const unsigned char* buf = reinterpret_cast<const unsigned char*>(text);
    const unsigned char* end = buf + size;

    // Build the suffix array in I, with V as the companion array.
    std::vector<int> I(size + 1);
    std::vector<int> V(size + 1);
    qsuf::qsufsort<int*>(&I[0], &V[0], buf, size);

    // Both arrays must be permutations of [0, size].
    EXPECT_TRUE(IsPermutation(I));
    EXPECT_TRUE(IsPermutation(V));

    // V[] must be the inverse of I[].
    for (int i = 0; i <= size; ++i)
      EXPECT_EQ(i, V[I[i]]);

    // The smallest suffix is the empty one, which starts at offset |size|.
    EXPECT_EQ(size, I[0]);

    // Each suffix must compare strictly less than its successor.
    for (int i = 1; i <= size; ++i) {
      const unsigned char* suf1 = buf + I[i - 1];
      const unsigned char* suf2 = buf + I[i];
      bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
      EXPECT_TRUE(is_less);
    }
  }
}