Add initial version of the new chunked Set implementation.

So far: - It does not support Put. It can only be constructed with a SetBuilder, then queried, e.g. with Has. - It only supports ref.Refs as values. - I'm not convinced about the naming. Perhaps SetChunker rather than SetBuilder, FlatLevel rather than FlatSet, etc. I expect they will change. - The test is fairly simplistic but there isn't much point testing this exhaustively, since once we support mutation, the testing abstraction will probably be with an Iterator.
2026-05-13 03:10:03 -05:00 · 2015-10-26 11:15:27 -07:00
parent 20b2839812
commit f08056555a
13 changed files with 542 additions and 0 deletions
@@ -0,0 +1,3 @@
+This is a work-in-progress implementation of a multi-tiered chunked set.
+
+It's not used in noms, but the idea is to gradually make it efficient, then implement all set operations on it, then replace the noms set implementation.
@@ -0,0 +1,35 @@
+package newset
+
+import (
+	"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/kch42/buzhash"
+	"github.com/attic-labs/noms/ref"
+)
+
+const (
+	// buzPattern is the mask applied to the rolling hash sum in Add. A chunk
+	// boundary is declared when every masked bit of the sum is set, i.e. when
+	// the low 6 bits are all ones.
+	buzPattern = uint32(1<<6 - 1) // Average size of 64 elements
+)
+
+// buzChunker is a Chunker that finds chunk boundaries with a buzhash rolling
+// hash over the digests of the refs it is given.
+type buzChunker struct {
+	// h is the rolling hash for the chunk currently being accumulated; Add
+	// replaces it with a fresh hash after each boundary.
+	h *buzhash.BuzHash
+}
+
+// newBuzChunker returns a buzChunker backed by a freshly created rolling hash.
+func newBuzChunker() *buzChunker {
+	c := new(buzChunker)
+	c.h = newBuzHash()
+	return c
+}
+
+// Add feeds the digest bytes of r into the rolling hash and reports whether the
+// resulting sum marks a chunk boundary (all bits of buzPattern set). On a
+// boundary the hash is replaced with a fresh one before returning.
+func (c *buzChunker) Add(r ref.Ref) bool {
+	c.h.Write(r.DigestSlice())
+	if c.h.Sum32()&buzPattern != buzPattern {
+		return false
+	}
+	// Boundary reached: start the next chunk from a clean hash state.
+	c.h = newBuzHash()
+	return true
+}
+
+// New returns a fresh, independent buzChunker. It does not read or copy any
+// state from the receiver.
+func (c *buzChunker) New() Chunker {
+	return newBuzChunker()
+}
+
+// newBuzHash creates a rolling hash, passing eight times the block size of the
+// hash returned by ref.NewHash as its size argument.
+func newBuzHash() *buzhash.BuzHash {
+	blockSize := ref.NewHash().BlockSize()
+	return buzhash.NewBuzHash(uint32(8 * blockSize))
+}
@@ -0,0 +1,35 @@
+package newset
+
+import (
+	"testing"
+
+	"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
+	"github.com/attic-labs/noms/ref"
+	"github.com/attic-labs/noms/types"
+)
+
+// TestNumMatches feeds the refs of the Int32 values 0..999 through a single
+// chunker and pins the number of chunk boundaries it reports.
+func TestNumMatches(t *testing.T) {
+	assert := assert.New(t)
+	c := newBuzChunker()
+
+	boundaries := 0
+	for n := 0; n < 1000; n++ {
+		if !c.Add(getRef(n)) {
+			continue
+		}
+		boundaries++
+	}
+
+	// 20 was experimentally determined by calling Add 1000 times.
+	assert.Equal(20, boundaries)
+}
+
+// TestThing checks that a fresh chunker reports a boundary on the very first
+// ref it is given, for one specific known ref.
+func TestThing(t *testing.T) {
+	assert := assert.New(t)
+	// This ref has been experimentally determined to be immediately chunked.
+	immediate := ref.Parse("sha1-00000000000000000000000000000000000f422f")
+	chunker := newBuzChunker()
+	assert.True(chunker.Add(immediate))
+}
+
+// getRef returns the ref of the noms Int32 value i, giving the tests a
+// deterministic ref per integer.
+func getRef(i int) ref.Ref {
+	return types.Int32(i).Ref()
+}
@@ -0,0 +1,75 @@
+package newset
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/attic-labs/noms/ref"
+)
+
+// chunkedSet is a Set made up of child Sets. Lookups binary-search the
+// children by their start ref and delegate to the matching child.
+type chunkedSet struct {
+	children entrySlice // sorted
+}
+
+// chunkedSetEntry is one child of a chunkedSet.
+type chunkedSetEntry struct {
+	// start is the child's first() ref as recorded when the entry was built;
+	// chunkedSet.Has uses it to pick the child to search.
+	start ref.Ref
+	// set is the child set itself.
+	set Set
+}
+
+// entrySlice is a slice of chunkedSetEntry whose Len, Less and Swap methods
+// satisfy sort.Interface, ordering entries by their start ref.
+type entrySlice []chunkedSetEntry
+
+// Len returns the number of entries.
+func (es entrySlice) Len() int {
+	return len(es)
+}
+
+// Less reports whether the start ref of entry i orders before that of entry j.
+func (es entrySlice) Less(i, j int) bool {
+	left, right := es[i].start, es[j].start
+	return ref.Less(left, right)
+}
+
+// Swap exchanges the entries at indices i and j.
+func (es entrySlice) Swap(i, j int) {
+	tmp := es[i]
+	es[i] = es[j]
+	es[j] = tmp
+}
+
+// Len returns the total number of items, computed by summing Len over every
+// child set.
+func (set chunkedSet) Len() (length uint64) {
+	for i := range set.children {
+		length += set.children[i].set.Len()
+	}
+	return length
+}
+
+// first returns the start ref of the first child. It indexes children[0]
+// unconditionally, so it panics when the set has no children.
+func (set chunkedSet) first() ref.Ref {
+	return set.children[0].start
+}
+
+// Has reports whether r is in the set. It binary-searches for the last child
+// whose start ref is not greater than r and asks that child; if every child
+// starts after r (or there are no children) the answer is false.
+func (set chunkedSet) Has(r ref.Ref) bool {
+	// upper is the index of the first child that starts strictly after r.
+	upper := sort.Search(len(set.children), func(i int) bool {
+		return ref.Greater(set.children[i].start, r)
+	})
+	if upper > 0 {
+		return set.children[upper-1].set.Has(r)
+	}
+	return false
+}
+
+// Ref hashes together the digests of each child set's Ref, in child order, and
+// returns the ref derived from that hash.
+func (set chunkedSet) Ref() ref.Ref {
+	// Eventually when chunked sets use noms Values this will need to be derived from the serialization of a chunked set, not simply a hash of all items' refs.
+	hasher := ref.NewHash()
+	for i := range set.children {
+		childRef := set.children[i].set.Ref()
+		hasher.Write(childRef.DigestSlice())
+	}
+	return ref.FromHash(hasher)
+}
+
+// fmt renders the set as an indented, human-readable tree: a header line with
+// the chunk count followed by one section per child, each child rendered four
+// spaces deeper. indent is the number of leading spaces for this level.
+func (set chunkedSet) fmt(indent int) string {
+	pad := strings.Repeat(" ", indent)
+	count := len(set.children)
+	if count == 0 {
+		return fmt.Sprintf("%s(empty chunked set)", pad)
+	}
+	parts := []string{fmt.Sprintf("%s(chunked with %d chunks)\n", pad, count)}
+	for i, entry := range set.children {
+		parts = append(parts, fmt.Sprintf("%schunk %d (start %s)\n%s\n", pad, i, fmtRef(entry.start), entry.set.fmt(indent+4)))
+	}
+	return strings.Join(parts, "")
+}
@@ -0,0 +1,10 @@
+package newset
+
+import "github.com/attic-labs/noms/ref"
+
+// Chunker decides where chunk boundaries fall in a sequence of refs. The set
+// builders call Add once per item and close the current chunk whenever it
+// returns true.
+type Chunker interface {
+	// Adds a ref to the chunker, and returns whether it results in a chunk boundary.
+	Add(r ref.Ref) bool
+	// Returns a new instance of this chunker's type. This is really a factory method hiding on an instance which is a bit icky.
+	New() Chunker
+}
@@ -0,0 +1,58 @@
+package newset
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/attic-labs/noms/ref"
+)
+
+// flatSet is a leaf-level Set that holds its refs directly in a slice.
+type flatSet struct {
+	d ref.RefSlice // sorted
+	// r, when non-nil, is returned by Ref() instead of hashing d.
+	r *ref.Ref
+}
+
+// Len returns the number of refs held in the set.
+func (s flatSet) Len() uint64 {
+	return uint64(len(s.d))
+}
+
+// Has reports whether r is in the set, using a binary search over the sorted
+// refs.
+func (s flatSet) Has(r ref.Ref) bool {
+	i := s.searchForIndex(r)
+	if i == len(s.d) {
+		// Every stored ref orders before r.
+		return false
+	}
+	return s.d[i] == r
+}
+
+// first returns the ref at index 0 of the sorted slice. It indexes d[0]
+// unconditionally, so it panics on an empty set.
+func (s flatSet) first() ref.Ref {
+	return s.d[0]
+}
+
+// Ref returns the ref identifying this set. If s.r is already populated it is
+// returned as is; otherwise the ref is derived by hashing the digests of all
+// members in order.
+//
+// NOTE: the result is deliberately not stored back into s.r. The receiver is a
+// value, so such a store would only modify this call's private copy and never
+// be seen by the caller. Real caching needs a pointer receiver (or a cell
+// allocated when the flatSet is constructed), which would change how flatSet
+// values satisfy Set.
+func (s flatSet) Ref() ref.Ref {
+	if s.r != nil {
+		return *s.r
+	}
+	h := ref.NewHash()
+	for _, r := range s.d {
+		h.Write(r.DigestSlice())
+	}
+	return ref.FromHash(h)
+}
+
+// fmt renders the set as a single human-readable line prefixed by indent
+// spaces. An empty set is reported as such (rather than panicking on d[0]), a
+// single-element set shows that element, and larger sets show the first and
+// last refs with a count of the elements in between.
+func (s flatSet) fmt(indent int) string {
+	indentStr := strings.Repeat(" ", indent)
+	switch len(s.d) {
+	case 0:
+		// Mirrors the empty case of chunkedSet.fmt.
+		return fmt.Sprintf("%s(empty flat set)", indentStr)
+	case 1:
+		return fmt.Sprintf("%sflat %s", indentStr, fmtRef(s.d[0]))
+	}
+	return fmt.Sprintf("%sflat{%s...(%d more)...%s}", indentStr, fmtRef(s.d[0]), len(s.d)-2, fmtRef(s.d[len(s.d)-1]))
+}
+
+// searchForIndex returns the index of the first stored ref that does not
+// order before r, or len(s.d) if every stored ref orders before r.
+func (s flatSet) searchForIndex(r ref.Ref) int {
+	notBefore := func(i int) bool {
+		return !ref.Less(s.d[i], r)
+	}
+	return sort.Search(len(s.d), notBefore)
+}
+
+// fmtRef abbreviates a ref for display by keeping only the last 8 characters
+// of its string form.
+func fmtRef(r ref.Ref) string {
+	full := r.String()
+	tail := len(full) - 8
+	return full[tail:]
+}
@@ -0,0 +1,28 @@
+package newset
+
+import (
+	"math/big"
+
+	"github.com/attic-labs/noms/ref"
+)
+
+// Generates fake ascending ref.Ref-s.
+type referrator struct {
+	// count is the value the next ref will encode. It is a pointer, so the
+	// value-receiver Next method can still advance it.
+	count *big.Int
+}
+
+// newReferrator returns a referrator whose counter starts at zero.
+func newReferrator() referrator {
+	start := big.NewInt(0)
+	return referrator{count: start}
+}
+
+// Next returns a ref whose digest holds the current counter value,
+// right-aligned (big-endian) in an otherwise zero digest, and then increments
+// the counter.
+func (r referrator) Next() ref.Ref {
+	digest := ref.Sha1Digest{}
+	raw := r.count.Bytes()
+	// Copy the counter bytes into the tail of the digest, walking both from
+	// their last byte backwards.
+	for src, dst := len(raw)-1, len(digest)-1; src >= 0; src, dst = src-1, dst-1 {
+		digest[dst] = raw[src]
+	}
+
+	next := ref.New(digest)
+	r.count.Add(r.count, big.NewInt(1))
+	return next
+}
@@ -0,0 +1,19 @@
+package newset
+
+import (
+	"testing"
+
+	"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
+)
+
+// TestReferrator checks that successive refs from a referrator encode 0, 1,
+// and (after skipping ahead) 0x200 in the low bytes of the digest.
+func TestReferrator(t *testing.T) {
+	assert := assert.New(t)
+
+	gen := newReferrator()
+	assert.Equal("sha1-0000000000000000000000000000000000000000", gen.Next().String())
+	assert.Equal("sha1-0000000000000000000000000000000000000001", gen.Next().String())
+	// Two refs were consumed above; 510 more leave the counter at 512 (0x200).
+	for skipped := 0; skipped < 510; skipped++ {
+		gen.Next()
+	}
+	assert.Equal("sha1-0000000000000000000000000000000000000200", gen.Next().String())
+}
@@ -0,0 +1,13 @@
+package newset
+
+import (
+	"github.com/attic-labs/noms/ref"
+)
+
+// Set is the read-only interface shared by the set representations in this
+// package (flatSet and chunkedSet).
+type Set interface {
+	// first returns the ref at the front of the set's sorted contents. Both
+	// implementations panic if the set is empty.
+	first() ref.Ref
+	// Len returns the number of refs in the set.
+	Len() uint64
+	// Has reports whether r is a member of the set.
+	Has(r ref.Ref) bool
+	// Ref returns a ref derived by hashing the set's contents.
+	Ref() ref.Ref
+	// fmt returns a human-readable rendering, prefixed by indent spaces.
+	fmt(indent int) string
+}
@@ -0,0 +1,87 @@
+package newset
+
+import (
+	"github.com/attic-labs/noms/d"
+	"github.com/attic-labs/noms/ref"
+)
+
+// This file is a giant copy-paste, but the architecture of chunking will likely be written in terms of iteration, so deal with it then.
+
+// SetBuilder accumulates refs one at a time and then produces a Set from them.
+type SetBuilder interface {
+	// AddItem appends r to the set under construction.
+	AddItem(r ref.Ref)
+	// Build returns the Set assembled from the items added so far.
+	Build() Set
+}
+
+// leafSetBuilder is the SetBuilder implementation: it groups incoming refs
+// into flatSet chunks at the boundaries reported by its chunker.
+type leafSetBuilder struct {
+	// current is the chunk still being filled.
+	current flatSet
+	// chunks holds the chunks already closed by a boundary.
+	chunks []flatSet
+	// chunker reports, for each added ref, whether it ends the current chunk.
+	chunker Chunker
+}
+
+// NewSetBuilder returns a SetBuilder that chunks with a new buzChunker.
+func NewSetBuilder() SetBuilder {
+	return NewSetBuilderWithChunker(newBuzChunker())
+}
+
+// NewSetBuilderWithChunker returns a SetBuilder that uses chunker to decide
+// chunk boundaries.
+func NewSetBuilderWithChunker(chunker Chunker) SetBuilder {
+	b := new(leafSetBuilder)
+	b.chunker = chunker
+	return b
+}
+
+// AddItem appends r to the chunk being filled. If the chunker reports a
+// boundary at r, that chunk is closed and a new empty one is started.
+func (builder *leafSetBuilder) AddItem(r ref.Ref) {
+	builder.current.d = append(builder.current.d, r)
+	if !builder.chunker.Add(r) {
+		return
+	}
+	builder.chunks = append(builder.chunks, builder.current)
+	builder.current = flatSet{}
+}
+
+// Build returns the Set assembled from the items added so far.
+//
+//   - With no items it returns an empty flatSet. Previously this case fell
+//     through to the meta-chunk builder, which recursed without end.
+//   - With a single chunk it returns that flatSet directly.
+//   - Otherwise the chunks are fed through a chunkedSetBuilder, using a fresh
+//     chunker obtained from the builder's chunker, to form the upper levels.
+//
+// The partially filled chunk is flushed into chunks and then reset, so calling
+// Build again does not append the same chunk twice.
+func (builder *leafSetBuilder) Build() Set {
+	if builder.current.Len() > 0 {
+		builder.chunks = append(builder.chunks, builder.current)
+		builder.current = flatSet{}
+	}
+
+	switch len(builder.chunks) {
+	case 0:
+		return flatSet{}
+	case 1:
+		// Compare against a uint64 zero so the check matches Len()'s type.
+		d.Chk.NotEqual(uint64(0), builder.chunks[0].Len())
+		return builder.chunks[0]
+	}
+
+	mcb := newMetaChunkBuilder(builder.chunker.New())
+	for _, c := range builder.chunks {
+		mcb.AddItem(c)
+	}
+
+	return mcb.Build()
+}
+
+// chunkedSetBuilder groups Sets into chunkedSets at the boundaries reported by
+// its chunker, building one level of the tree above its inputs.
+type chunkedSetBuilder struct {
+	// current is the chunkedSet still being filled.
+	current chunkedSet
+	// sets holds the chunkedSets already closed by a boundary.
+	sets []chunkedSet
+	// chunker reports, for each added set's Ref, whether it ends the current
+	// chunkedSet.
+	chunker Chunker
+}
+
+// newMetaChunkBuilder returns an empty chunkedSetBuilder that uses chunker to
+// decide boundaries. The builder is returned by value; its methods have
+// pointer receivers, so callers must hold it in an addressable variable.
+func newMetaChunkBuilder(chunker Chunker) chunkedSetBuilder {
+	return chunkedSetBuilder{chunker: chunker}
+}
+
+// AddItem appends s as a child of the chunkedSet being filled, keyed by
+// s.first(). If the chunker reports a boundary at s.Ref(), that chunkedSet is
+// closed and a new empty one is started.
+func (mcb *chunkedSetBuilder) AddItem(s Set) {
+	entry := chunkedSetEntry{start: s.first(), set: s}
+	mcb.current.children = append(mcb.current.children, entry)
+	if !mcb.chunker.Add(s.Ref()) {
+		return
+	}
+	mcb.sets = append(mcb.sets, mcb.current)
+	mcb.current = chunkedSet{}
+}
+
+// Build returns the chunkedSet assembled from the sets added so far.
+//
+//   - With nothing added it returns an empty chunkedSet. Previously this case
+//     built another empty builder and recursed without end.
+//   - With a single chunkedSet it returns that set as the root.
+//   - Otherwise the chunkedSets are fed through another chunkedSetBuilder,
+//     using a fresh chunker, to build the next level up, and so on until a
+//     single root remains.
+//
+// The partially filled chunkedSet is flushed into sets and then reset, so
+// calling Build again does not append it twice.
+func (mcb *chunkedSetBuilder) Build() chunkedSet {
+	// Checking the child count avoids walking the whole subtree via Len()
+	// just to learn whether anything is pending.
+	if len(mcb.current.children) > 0 {
+		mcb.sets = append(mcb.sets, mcb.current)
+		mcb.current = chunkedSet{}
+	}
+
+	switch len(mcb.sets) {
+	case 0:
+		return chunkedSet{}
+	case 1:
+		// Compare against a uint64 zero so the check matches Len()'s type.
+		d.Chk.NotEqual(uint64(0), mcb.sets[0].Len())
+		return mcb.sets[0]
+	}
+
+	b := newMetaChunkBuilder(mcb.chunker.New())
+	for _, s := range mcb.sets {
+		b.AddItem(s)
+	}
+	return b.Build()
+}
@@ -34,6 +34,10 @@ func (r Ref) IsEmpty() bool {
 	return r.digest == emptyRef.digest
 }

+// DigestSlice returns the digest bytes as a slice. Because the receiver is a
+// value, the slice is backed by this call's copy of the digest, so writes
+// through it do not affect the Ref it was called on.
+func (r Ref) DigestSlice() []byte {
+	return r.digest[:]
+}
+
 func (r Ref) String() string {
 	return fmt.Sprintf("sha1-%s", hex.EncodeToString(r.digest[:]))
 }
@@ -92,3 +96,7 @@ func Less(r1, r2 Ref) bool {
 	}
 	return false
 }
+
+// Greater reports whether r1 orders strictly after r2: the two refs are not
+// equal and r1 is not Less than r2.
+func Greater(r1, r2 Ref) bool {
+	if r1 == r2 {
+		return false
+	}
+	return !Less(r1, r2)
+}
@@ -62,6 +62,15 @@ func TestDigest(t *testing.T) {
 	assert.NotEqual(t, r.Digest(), d)
 }

+// TestDigestSlice checks that mutating the slice returned by DigestSlice is
+// not visible through a later DigestSlice call on the same Ref.
+func TestDigestSlice(t *testing.T) {
+	original := New(Sha1Digest{})
+	slice := original.DigestSlice()
+	assert.Equal(t, original.DigestSlice(), slice)
+	// DigestSlice() must return a copy otherwise things get weird.
+	slice[0] = 0x01
+	assert.NotEqual(t, original.DigestSlice(), slice)
+}
+
 func TestFromHash(t *testing.T) {
 	h := sha1.New()
 	h.Write([]byte("abc"))
@@ -79,3 +88,37 @@ func TestIsEmpty(t *testing.T) {
 	r3 := Parse("sha1-a9993e364706816aba3e25717850c26c9cd0d89d")
 	assert.False(t, r3.IsEmpty())
 }
+
+// TestLess checks Less on equal refs, on ordered and reversed pairs, and
+// against the zero-value Ref.
+func TestLess(t *testing.T) {
+	assert := assert.New(t)
+
+	zero := Ref{}
+	one := Parse("sha1-0000000000000000000000000000000000000001")
+	two := Parse("sha1-0000000000000000000000000000000000000002")
+
+	cases := []struct {
+		a, b Ref
+		want bool
+	}{
+		{one, one, false},
+		{one, two, true},
+		{two, one, false},
+		{two, two, false},
+		{zero, zero, false},
+		{zero, two, true},
+		{two, zero, false},
+	}
+	for _, c := range cases {
+		assert.Equal(c.want, Less(c.a, c.b))
+	}
+}
+
+// TestGreater checks Greater on equal refs, on ordered and reversed pairs, and
+// against the zero-value Ref.
+func TestGreater(t *testing.T) {
+	assert := assert.New(t)
+
+	zero := Ref{}
+	one := Parse("sha1-0000000000000000000000000000000000000001")
+	two := Parse("sha1-0000000000000000000000000000000000000002")
+
+	cases := []struct {
+		a, b Ref
+		want bool
+	}{
+		{one, one, false},
+		{one, two, false},
+		{two, one, true},
+		{two, two, false},
+		{zero, zero, false},
+		{zero, two, false},
+		{two, zero, true},
+	}
+	for _, c := range cases {
+		assert.Equal(c.want, Greater(c.a, c.b))
+	}
+}