From 338de4e58391e241a29e0bd1d71bce97f03759b3 Mon Sep 17 00:00:00 2001 From: Benjamin Kalman Date: Thu, 3 Dec 2015 14:46:53 -0800 Subject: [PATCH] Correctly distinguish between chunking window size and buzhash window size. Previously the buzhash boundary checker used a single value for the window size, both as the buzhash buffer size when constructing a hash object, and reported as its window size to the boundary checker interface. This was wrong because we don't always pass single byte values to the hasher, for example refs are 20 bytes. The compound list chunking compensated for this by only passing the first byte of each list leaf's ref rather than the full ref. This is bad because there is obviously less entropy in 1 byte vs 20 bytes. The meta sequence chunking compensated for this by multiplying the chunking window size by 20, but this also had the effect of unnecessarily considering 20 times more chunked elements than would fit in the buzhash buffer. --- types/blob.go | 6 ++---- types/buz_hash_boundary_checker.go | 31 ++++++++++++++++++++++++++++ types/compound_list.go | 11 +++++----- types/compound_list_test.go | 2 +- types/meta_sequence.go | 33 ++++-------------------------- 5 files changed, 43 insertions(+), 40 deletions(-) create mode 100644 types/buz_hash_boundary_checker.go diff --git a/types/blob.go b/types/blob.go index 13a6ebb986..c16ee2f8ca 100644 --- a/types/blob.go +++ b/types/blob.go @@ -3,7 +3,6 @@ package types import ( "io" - "github.com/attic-labs/noms/Godeps/_workspace/src/github.com/attic-labs/buzhash" "github.com/attic-labs/noms/chunks" "github.com/attic-labs/noms/d" ) @@ -36,9 +35,8 @@ func NewMemoryBlob(r io.Reader) Blob { } func newBlobLeafBoundaryChecker() boundaryChecker { - return newBuzHashBoundaryChecker(blobWindowSize, func(h *buzhash.BuzHash, item sequenceItem) bool { - b := item.(byte) - return h.HashByte(b)&blobPattern == blobPattern + return newBuzHashBoundaryChecker(blobWindowSize, 1, blobPattern, func(item sequenceItem) []byte { + return []byte{item.(byte)} }) } diff --git a/types/buz_hash_boundary_checker.go b/types/buz_hash_boundary_checker.go new file mode 100644 index 0000000000..0053faa656 --- /dev/null +++ b/types/buz_hash_boundary_checker.go @@ -0,0 +1,31 @@ +package types + +import ( + "github.com/attic-labs/noms/Godeps/_workspace/src/github.com/attic-labs/buzhash" + "github.com/attic-labs/noms/d" +) + +type buzHashBoundaryChecker struct { + h *buzhash.BuzHash + windowSize, valueSize int + pattern uint32 + getBytes getBytesFn +} + +type getBytesFn func(item sequenceItem) []byte + +func newBuzHashBoundaryChecker(windowSize, valueSize int, pattern uint32, getBytes getBytesFn) boundaryChecker { + return &buzHashBoundaryChecker{buzhash.NewBuzHash(uint32(windowSize * valueSize)), windowSize, valueSize, pattern, getBytes} +} + +func (b *buzHashBoundaryChecker) Write(item sequenceItem) bool { + bytes := b.getBytes(item) + d.Chk.Equal(b.valueSize, len(bytes)) + _, err := b.h.Write(bytes) + d.Chk.NoError(err) + return b.h.Sum32()&b.pattern == b.pattern +} + +func (b *buzHashBoundaryChecker) WindowSize() int { + return b.windowSize +} diff --git a/types/compound_list.go b/types/compound_list.go index 1d99e6cc05..12534e5df6 100644 --- a/types/compound_list.go +++ b/types/compound_list.go @@ -1,7 +1,8 @@ package types import ( - "github.com/attic-labs/noms/Godeps/_workspace/src/github.com/attic-labs/buzhash" + "crypto/sha1" + "github.com/attic-labs/noms/chunks" "github.com/attic-labs/noms/d" "github.com/attic-labs/noms/ref" @@ -128,11 +129,9 @@ func (cl compoundList) IterAll(f listIterAllFunc) { } func newListLeafBoundaryChecker() boundaryChecker { - return newBuzHashBoundaryChecker(listWindowSize, func(h *buzhash.BuzHash, item sequenceItem) bool { - v := item.(Value) - digest := v.Ref().Digest() - b := digest[0] - return h.HashByte(b)&listPattern == listPattern + return newBuzHashBoundaryChecker(listWindowSize, sha1.Size, listPattern, func(item sequenceItem) []byte { + digest := item.(Value).Ref().Digest() + return digest[:] }) } diff --git a/types/compound_list_test.go b/types/compound_list_test.go index 3f9e14196f..87f66e4dde 100644 --- a/types/compound_list_test.go +++ b/types/compound_list_test.go @@ -15,7 +15,7 @@ func (tsl testSimpleList) Get(idx uint64) Value { } func getTestSimpleListLen() int { - return int(listPattern * 16) + return int(listPattern * 50) } func getTestSimpleList() testSimpleList { diff --git a/types/meta_sequence.go b/types/meta_sequence.go index e1183b9461..6bed91da9e 100644 --- a/types/meta_sequence.go +++ b/types/meta_sequence.go @@ -3,14 +3,12 @@ package types import ( "crypto/sha1" - "github.com/attic-labs/noms/Godeps/_workspace/src/github.com/attic-labs/buzhash" "github.com/attic-labs/noms/chunks" - "github.com/attic-labs/noms/d" "github.com/attic-labs/noms/ref" ) const ( - objectWindowSize = 8 * sha1.Size + objectWindowSize = 8 objectPattern = uint32(1<<6 - 1) // Average size of 64 elements ) @@ -113,33 +111,10 @@ func getDataFromMetaSequence(v Value) metaSequenceData { panic("not reachable") } -type checkHashFn func(h *buzhash.BuzHash, item sequenceItem) bool - -type buzHashBoundaryChecker struct { - h *buzhash.BuzHash - windowSize int - checkHash checkHashFn -} - -func newBuzHashBoundaryChecker(windowSize int, checkHash checkHashFn) boundaryChecker { - return &buzHashBoundaryChecker{buzhash.NewBuzHash(uint32(windowSize)), windowSize, checkHash} -} - -func (b *buzHashBoundaryChecker) Write(item sequenceItem) bool { - return b.checkHash(b.h, item) -} - -func (b *buzHashBoundaryChecker) WindowSize() int { - return b.windowSize -} - func newMetaSequenceBoundaryChecker() boundaryChecker { - return newBuzHashBoundaryChecker(objectWindowSize, func(h *buzhash.BuzHash, item sequenceItem) bool { - mt := item.(metaTuple) - digest := mt.ref.Digest() - _, err := h.Write(digest[:]) - d.Chk.NoError(err) - return h.Sum32()&objectPattern == objectPattern + return newBuzHashBoundaryChecker(objectWindowSize, sha1.Size, objectPattern, func(item sequenceItem) []byte { + digest := item.(metaTuple).ref.Digest() + return digest[:] }) }