0

Improvements to Courgette's version of bsdiff

* Store 'diff' bytes by run-length encoding zeros. This reduces the memory
  needed to store the zeros by ~30MB for chrome.7z.
* Store the control tuple elements in separate streams.  The 'extra_bytes'
  counts are often zero so this brings all the zeros together.

The uncompressed patch file is much smaller due to the run-length encoded
zeros.  It is slightly smaller (3-8%) after compression with lzma.

Review URL: http://codereview.chromium.org/115435

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@16343 0039d316-1c4b-4281-b951-d872f2087c98
This commit is contained in:
sra@chromium.org
2009-05-18 23:47:01 +00:00
parent d9b46d768e
commit d576712f07
4 changed files with 64 additions and 66 deletions

@ -26,7 +26,7 @@ enum Status {
C_BAD_ENSEMBLE_MAGIC = 5, // Ensemble patch has bad magic.
C_BAD_ENSEMBLE_VERSION = 6, // Ensemble patch has wrong version.
C_BAD_ENSEMBLE_HEADER = 7, // Ensemble patch has corrupt header.
C_BAD_ENSEMBLE_CRC = 8, // Ensemble patch has corrupt header.
C_BAD_ENSEMBLE_CRC = 8, // Ensemble patch has corrupt data.
C_BAD_TRANSFORM = 12, // Transform mis-specified.
C_BAD_BASE = 13, // Base for transform malformed.

@ -74,9 +74,6 @@ typedef struct MBSPatchHeader_ {
uint32 slen; // Length of the file to be patched.
uint32 scrc32; // CRC32 of the file to be patched.
uint32 dlen; // Length of the result file.
uint32 cblen; // Length of the control block in bytes.
uint32 difflen; // Length of the diff block in bytes.
uint32 extralen; // Length of the extra block in bytes.
} MBSPatchHeader;
// This is the value for the tag field. Must match length exactly, not counting

@ -44,9 +44,6 @@ BSDiffStatus MBS_ReadHeader(SourceStream* stream, MBSPatchHeader* header) {
if (!stream->ReadVarint32(&header->slen)) return READ_ERROR;
if (!stream->ReadVarint32(&header->scrc32)) return READ_ERROR;
if (!stream->ReadVarint32(&header->dlen)) return READ_ERROR;
if (!stream->ReadVarint32(&header->cblen)) return READ_ERROR;
if (!stream->ReadVarint32(&header->difflen)) return READ_ERROR;
if (!stream->ReadVarint32(&header->extralen)) return READ_ERROR;
// The string will have a NUL terminator that we don't use, hence '-1'.
COMPILE_ASSERT(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header->tag),
@ -54,12 +51,6 @@ BSDiffStatus MBS_ReadHeader(SourceStream* stream, MBSPatchHeader* header) {
if (memcmp(header->tag, MBS_PATCH_HEADER_TAG, 8) != 0)
return UNEXPECTED_ERROR;
size_t bytes_remaining = stream->Remaining();
if (header->cblen +
header->difflen +
header->extralen != bytes_remaining)
return UNEXPECTED_ERROR;
return OK;
}
@ -69,35 +60,37 @@ BSDiffStatus MBS_ApplyPatch(const MBSPatchHeader *header,
SinkStream* new_stream) {
const uint8* old_end = old_start + old_size;
SourceStream control_stream;
const uint8* control_start = patch_stream->Buffer();
if (!patch_stream->ReadSubstream(header->cblen, &control_stream))
return READ_ERROR;
if (!patch_stream->Skip(header->difflen + header->extralen))
return READ_ERROR;
if (!patch_stream->Empty())
SourceStreamSet patch_streams;
if (!patch_streams.Init(patch_stream))
return READ_ERROR;
const uint8* diff_start = control_start + header->cblen;
const uint8* diff_end = diff_start + header->difflen;
const uint8* extra_start = diff_end;
const uint8* extra_end = extra_start + header->extralen;
SourceStream* control_stream_copy_counts = patch_streams.stream(0);
SourceStream* control_stream_extra_counts = patch_streams.stream(1);
SourceStream* control_stream_seeks = patch_streams.stream(2);
SourceStream* diff_skips = patch_streams.stream(3);
SourceStream* diff_bytes = patch_streams.stream(4);
SourceStream* extra_bytes = patch_streams.stream(5);
const uint8* extra_start = extra_bytes->Buffer();
const uint8* extra_end = extra_start + extra_bytes->Remaining();
const uint8* extra_position = extra_start;
const uint8* old_position = old_start;
const uint8* diff_position = diff_start;
const uint8* extra_position = extra_start;
new_stream->Reserve(header->dlen);
while (!control_stream.Empty()) {
uint32 pending_diff_zeros = 0;
if (!diff_skips->ReadVarint32(&pending_diff_zeros))
return UNEXPECTED_ERROR;
while (!control_stream_copy_counts->Empty()) {
uint32 copy_count, extra_count;
int32 seek_adjustment;
if (!control_stream.ReadVarint32(&copy_count))
if (!control_stream_copy_counts->ReadVarint32(&copy_count))
return UNEXPECTED_ERROR;
if (!control_stream.ReadVarint32(&extra_count))
if (!control_stream_extra_counts->ReadVarint32(&extra_count))
return UNEXPECTED_ERROR;
if (!control_stream.ReadVarint32Signed(&seek_adjustment))
if (!control_stream_seeks->ReadVarint32Signed(&seek_adjustment))
return UNEXPECTED_ERROR;
#ifdef DEBUG_bsmedberg
@ -108,16 +101,22 @@ BSDiffStatus MBS_ApplyPatch(const MBSPatchHeader *header,
// block.
if (copy_count > static_cast<size_t>(old_end - old_position))
return UNEXPECTED_ERROR;
if (copy_count > static_cast<size_t>(diff_end - diff_position))
return UNEXPECTED_ERROR;
// Add together bytes from the 'old' file and the 'diff' stream.
for (size_t i = 0; i < copy_count; ++i) {
uint8 byte = old_position[i] + diff_position[i];
uint8 diff_byte = 0;
if (pending_diff_zeros) {
--pending_diff_zeros;
} else {
if (!diff_skips->ReadVarint32(&pending_diff_zeros))
return UNEXPECTED_ERROR;
if (!diff_bytes->Read(&diff_byte, 1))
return UNEXPECTED_ERROR;
}
uint8 byte = old_position[i] + diff_byte;
new_stream->Write(&byte, 1);
}
old_position += copy_count;
diff_position += copy_count;
// Copy bytes from the extra block.
if (extra_count > static_cast<size_t>(extra_end - extra_position))
@ -134,9 +133,12 @@ BSDiffStatus MBS_ApplyPatch(const MBSPatchHeader *header,
old_position += seek_adjustment;
}
if (diff_position != diff_end)
return UNEXPECTED_ERROR;
if (extra_position != extra_end)
if (!control_stream_copy_counts->Empty() ||
!control_stream_extra_counts->Empty() ||
!control_stream_seeks->Empty() ||
!diff_skips->Empty() ||
!diff_bytes->Empty() ||
!extra_bytes->Empty())
return UNEXPECTED_ERROR;
return OK;

@ -42,7 +42,7 @@ namespace courgette {
//
// The code appears to be a rewritten version of the suffix array algorithm
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
// Sadakane, special-cased for bytes.
// Sadakane, special cased for bytes.
static void
split(int *I,int *V,int start,int len,int h)
@ -191,9 +191,6 @@ static void WriteHeader(SinkStream* stream, MBSPatchHeader* header) {
stream->WriteVarint32(header->slen);
stream->WriteVarint32(header->scrc32);
stream->WriteVarint32(header->dlen);
stream->WriteVarint32(header->cblen);
stream->WriteVarint32(header->difflen);
stream->WriteVarint32(header->extralen);
}
BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
@ -204,9 +201,19 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
LOG(INFO) << "Start bsdiff";
size_t initial_patch_stream_length = patch_stream->Length();
SinkStreamSet patch_streams;
SinkStream* control_stream_copy_counts = patch_streams.stream(0);
SinkStream* control_stream_extra_counts = patch_streams.stream(1);
SinkStream* control_stream_seeks = patch_streams.stream(2);
SinkStream* diff_skips = patch_streams.stream(3);
SinkStream* diff_bytes = patch_streams.stream(4);
SinkStream* extra_bytes = patch_streams.stream(5);
const uint8* old = old_stream->Buffer();
const int oldsize = old_stream->Remaining();
uint32 pending_diff_zeros = 0;
scoped_array<int> I(new(std::nothrow) int[oldsize + 1]);
scoped_array<int> V(new(std::nothrow) int[oldsize + 1]);
if (I == NULL || V == NULL)
@ -221,24 +228,12 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
const uint8* newbuf = new_stream->Buffer();
const int newsize = new_stream->Remaining();
// Allocate newsize+1 bytes instead of newsize bytes to ensure that we never
// try to malloc(0) and get a NULL pointer.
scoped_array<uint8> diff_bytes_array(new(std::nothrow) uint8[newsize + 1]);
scoped_array<uint8> extra_bytes_array(new(std::nothrow) uint8[newsize + 1]);
if (diff_bytes_array == NULL || extra_bytes_array == NULL)
return MEM_ERROR;
uint8* diff_bytes = diff_bytes_array.get();
uint8* extra_bytes = extra_bytes_array.get();
int control_length = 0;
int diff_bytes_length = 0;
int diff_bytes_nonzero = 0;
int extra_bytes_length = 0;
int eblen = 0;
SinkStream control_stream;
// The patch format is a sequence of triples <copy,extra,seek> where 'copy' is
// the number of bytes to copy from the old file (possibly with mistakes),
// 'extra' is the number of bytes to copy from a stream of fresh bytes, and
@ -364,13 +359,18 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
for (int i = 0; i < lenf; i++) {
uint8 diff_byte = newbuf[lastscan + i] - old[lastpos + i];
diff_bytes[diff_bytes_length + i] = diff_byte;
if (diff_byte)
if (diff_byte) {
++diff_bytes_nonzero;
diff_skips->WriteVarint32(pending_diff_zeros);
pending_diff_zeros = 0;
diff_bytes->Write(&diff_byte, 1);
} else {
++pending_diff_zeros;
}
}
int gap = (scan - lenb) - (lastscan + lenf);
for (int i = 0; i < gap; i++)
extra_bytes[extra_bytes_length + i] = newbuf[lastscan + lenf + i];
extra_bytes->Write(&newbuf[lastscan + lenf + i], 1);
diff_bytes_length += lenf;
extra_bytes_length += gap;
@ -379,9 +379,9 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
uint32 extra_count = gap;
int32 seek_adjustment = ((pos - lenb) - (lastpos + lenf));
control_stream.WriteVarint32(copy_count);
control_stream.WriteVarint32(extra_count);
control_stream.WriteVarint32Signed(seek_adjustment);
control_stream_copy_counts->WriteVarint32(copy_count);
control_stream_extra_counts->WriteVarint32(extra_count);
control_stream_seeks->WriteVarint32Signed(seek_adjustment);
++control_length;
#ifdef DEBUG_bsmedberg
LOG(INFO) << StringPrintf(
@ -395,6 +395,8 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
}
}
diff_skips->WriteVarint32(pending_diff_zeros);
I.reset();
MBSPatchHeader header;
@ -405,19 +407,16 @@ BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,
header.slen = oldsize;
header.scrc32 = CalculateCrc(old, oldsize);
header.dlen = newsize;
header.cblen = control_stream.Length();
header.difflen = diff_bytes_length;
header.extralen = extra_bytes_length;
WriteHeader(patch_stream, &header);
patch_stream->Append(&control_stream);
patch_stream->Write(diff_bytes, diff_bytes_length);
patch_stream->Write(extra_bytes, extra_bytes_length);
size_t diff_skips_length = diff_skips->Length();
patch_streams.CopyTo(patch_stream);
LOG(INFO) << "Control tuples: " << control_length
<< " copy bytes: " << diff_bytes_length
<< " mistakes: " << diff_bytes_nonzero
<< " (skips: " << diff_skips_length << ")"
<< " extra bytes: " << extra_bytes_length;
LOG(INFO) << "Uncompressed bsdiff patch size "