Add initial version of the new chunked Set implementation.

So far:
- It does not support Put. It can only be constructed with a SetBuilder,
  then queried e.g. Has.
- It only supports ref.Refs as values.
- I'm not convinced about the naming. Perhaps SetChunker rather than
  SetBuilder, FlatLevel rather than FlatSet, etc. I expect they will change.
- The test is fairly simplistic but there isn't much point testing this
  exhaustively, since once we support mutation, the testing abstraction
  will probably be with an Iterator.
This commit is contained in:
Benjamin Kalman
2015-10-26 11:15:27 -07:00
parent 20b2839812
commit f08056555a
13 changed files with 542 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
This is a work-in-progress implementation of a multi-tiered chunked set.
It's not used in noms, but the idea is to gradually make it efficient, then implement all set operations on it, then replace the noms set implementation.
+35
View File
@@ -0,0 +1,35 @@
package newset
import (
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/kch42/buzhash"
"github.com/attic-labs/noms/ref"
)
const (
// buzPattern is the bit mask compared against the hash sum in buzChunker.Add:
// a boundary is reported when every bit of the mask is set in the sum.
// The mask covers the low 6 bits (value 63).
buzPattern = uint32(1<<6 - 1) // Average size of 64 elements
)
// buzChunker decides chunk boundaries from a buzhash.BuzHash that is fed the
// digest of every ref passed to Add. *buzChunker is returned as a Chunker by New.
type buzChunker struct {
h *buzhash.BuzHash // current hash state; replaced by a fresh hash after each boundary
}
// newBuzChunker returns a buzChunker whose hash state is freshly created.
func newBuzChunker() *buzChunker {
	c := new(buzChunker)
	c.h = newBuzHash()
	return c
}
// Add writes the digest of r into the hash and reports whether the resulting
// sum matches buzPattern, i.e. whether r ends the current chunk. On a boundary
// the hash state is discarded so the next chunk starts from a fresh hash.
func (c *buzChunker) Add(r ref.Ref) bool {
	c.h.Write(r.DigestSlice())
	if c.h.Sum32()&buzPattern != buzPattern {
		return false
	}
	// Boundary reached: start the following chunk with a new hash.
	c.h = newBuzHash()
	return true
}
// New returns a brand new buzChunker as a Chunker; it shares no state with c.
func (c *buzChunker) New() Chunker {
	var fresh Chunker = newBuzChunker()
	return fresh
}
// newBuzHash creates the BuzHash used by buzChunker. The size handed to
// buzhash.NewBuzHash is eight times the block size of the hash from ref.NewHash.
func newBuzHash() *buzhash.BuzHash {
	size := 8 * ref.NewHash().BlockSize()
	return buzhash.NewBuzHash(uint32(size))
}
+35
View File
@@ -0,0 +1,35 @@
package newset
import (
"testing"
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
"github.com/attic-labs/noms/ref"
"github.com/attic-labs/noms/types"
)
// TestNumMatches pins the number of chunk boundaries reported by a single
// buzChunker over the refs of the Int32 values 0..999.
func TestNumMatches(t *testing.T) {
	assert := assert.New(t)
	chunker := newBuzChunker()

	boundaries := 0
	for i := 0; i < 1000; i++ {
		if !chunker.Add(getRef(i)) {
			continue
		}
		boundaries++
	}

	// 20 was experimentally determined by calling Add 1000 times.
	assert.Equal(20, boundaries)
}
// TestThing checks that a fresh chunker reports a boundary on the very first Add
// for one specific ref.
func TestThing(t *testing.T) {
	assert := assert.New(t)
	chunker := newBuzChunker()
	// This ref has been experimentally determined to be immediately chunked.
	boundaryRef := ref.Parse("sha1-00000000000000000000000000000000000f422f")
	assert.True(chunker.Add(boundaryRef))
}
// getRef returns the ref of i wrapped as a types.Int32.
func getRef(i int) ref.Ref {
	v := types.Int32(i)
	return v.Ref()
}
+75
View File
@@ -0,0 +1,75 @@
package newset
import (
"fmt"
"sort"
"strings"
"github.com/attic-labs/noms/ref"
)
// chunkedSet is a Set made of child Sets. Len, Has and Ref are answered by
// delegating to the children.
type chunkedSet struct {
children entrySlice // sorted by each entry's start ref; Has binary-searches this slice
}
// chunkedSetEntry is one child of a chunkedSet together with the key used to
// locate it during lookups.
type chunkedSetEntry struct {
start ref.Ref // the child's first() ref, recorded when the entry is built
set Set // the child set itself
}
// entrySlice is a slice of chunkedSetEntry with Len, Less and Swap methods that order entries by start ref.
type entrySlice []chunkedSetEntry
// Len returns the number of entries in the slice.
func (es entrySlice) Len() int {
	n := len(es)
	return n
}
// Less reports whether the start ref of entry i orders before that of entry j.
func (es entrySlice) Less(i, j int) bool {
	left, right := es[i].start, es[j].start
	return ref.Less(left, right)
}
// Swap exchanges the entries at positions i and j.
func (es entrySlice) Swap(i, j int) {
	tmp := es[i]
	es[i] = es[j]
	es[j] = tmp
}
// Len returns the total number of items, computed as the sum of the lengths of
// all child sets.
func (set chunkedSet) Len() (length uint64) {
	for i := range set.children {
		length += set.children[i].set.Len()
	}
	return length
}
// first returns the start ref of the first child. It indexes children[0]
// directly, so it panics when the set has no children.
func (set chunkedSet) first() ref.Ref {
	head := set.children[0]
	return head.start
}
// Has reports whether r is in the set. It binary-searches for the first child
// whose start ref is strictly greater than r; the child just before that one is
// the only child that could contain r, so the lookup is delegated to it.
func (set chunkedSet) Has(r ref.Ref) bool {
	kids := set.children
	firstAfter := sort.Search(len(kids), func(i int) bool {
		return ref.Greater(kids[i].start, r)
	})
	if firstAfter > 0 {
		return kids[firstAfter-1].set.Has(r)
	}
	// Every child starts after r (or there are no children at all).
	return false
}
// Ref returns a hash over the digests of the children's refs, in child order.
func (set chunkedSet) Ref() ref.Ref {
	// Eventually when chunked sets use noms Values this will need to be derived from the serialization of a chunked set, not simply a hash of all items' refs.
	hasher := ref.NewHash()
	for i := range set.children {
		childRef := set.children[i].set.Ref()
		hasher.Write(childRef.DigestSlice())
	}
	return ref.FromHash(hasher)
}
// fmt renders the set as a multi-line debug string. Every line produced at this
// level is prefixed with indent spaces, and each child is rendered with four
// additional spaces of indentation.
func (set chunkedSet) fmt(indent int) string {
	pad := strings.Repeat(" ", indent)
	count := len(set.children)
	if count == 0 {
		return fmt.Sprintf("%s(empty chunked set)", pad)
	}
	parts := []string{fmt.Sprintf("%s(chunked with %d chunks)\n", pad, count)}
	for idx := range set.children {
		child := set.children[idx]
		parts = append(parts, fmt.Sprintf("%schunk %d (start %s)\n%s\n", pad, idx, fmtRef(child.start), child.set.fmt(indent+4)))
	}
	return strings.Join(parts, "")
}
+10
View File
@@ -0,0 +1,10 @@
package newset
import "github.com/attic-labs/noms/ref"
// Chunker decides where chunk boundaries fall in a sequence of refs.
type Chunker interface {
// Add adds a ref to the chunker and reports whether it results in a chunk boundary.
Add(r ref.Ref) bool
// New returns a new instance of this chunker's type. This is really a factory method hiding on an instance, which is a bit icky.
New() Chunker
}
+58
View File
@@ -0,0 +1,58 @@
package newset
import (
"fmt"
"sort"
"strings"
"github.com/attic-labs/noms/ref"
)
// flatSet is a Set stored as a single slice of refs.
type flatSet struct {
d ref.RefSlice // sorted; Has binary-searches this slice
r *ref.Ref // optional precomputed ref; Ref returns it when non-nil
}
// Len returns the number of refs held by the set.
func (s flatSet) Len() uint64 {
	count := len(s.d)
	return uint64(count)
}
// Has reports whether r is in the set, using a binary search for the first
// element that is not less than r and comparing that element with r.
func (s flatSet) Has(r ref.Ref) bool {
	pos := s.searchForIndex(r)
	if pos == len(s.d) {
		// Every element is less than r.
		return false
	}
	return s.d[pos] == r
}
// first returns the leading element of the sorted slice. It indexes d[0]
// directly, so it panics when the set is empty.
func (s flatSet) first() ref.Ref {
	head := s.d[0]
	return head
}
// Ref returns the set's ref: the precomputed value in s.r when one is present,
// otherwise a hash over the digests of all elements in slice order.
//
// The method has a value receiver, so storing the computed ref in s.r here
// would only modify a copy and be lost when the call returns; the result is
// therefore returned directly rather than pretending to cache it.
func (s flatSet) Ref() ref.Ref {
	if s.r != nil {
		return *s.r
	}
	h := ref.NewHash()
	for _, r := range s.d {
		h.Write(r.DigestSlice())
	}
	return ref.FromHash(h)
}
// fmt renders the set as a single-line debug string prefixed with indent
// spaces. An empty set and a single-element set have dedicated forms; any
// larger set shows its first and last elements and how many lie in between.
func (s flatSet) fmt(indent int) string {
	indentStr := strings.Repeat(" ", indent)
	switch len(s.d) {
	case 0:
		// Guard against indexing s.d[0] on an empty set.
		return fmt.Sprintf("%s(empty flat set)", indentStr)
	case 1:
		return fmt.Sprintf("%sflat %s", indentStr, fmtRef(s.d[0]))
	}
	return fmt.Sprintf("%sflat{%s...(%d more)...%s}", indentStr, fmtRef(s.d[0]), len(s.d)-2, fmtRef(s.d[len(s.d)-1]))
}
// searchForIndex returns the index of the first element that is not less than
// r, or len(s.d) when every element is less than r.
func (s flatSet) searchForIndex(r ref.Ref) int {
	notLess := func(i int) bool {
		return !ref.Less(s.d[i], r)
	}
	return sort.Search(len(s.d), notLess)
}
// fmtRef returns the last 8 characters of r's string form, as a short label
// for debug output.
func fmtRef(r ref.Ref) string {
	full := r.String()
	tailStart := len(full) - 8
	return full[tailStart:]
}
+28
View File
@@ -0,0 +1,28 @@
package newset
import (
"math/big"
"github.com/attic-labs/noms/ref"
)
// referrator generates fake ascending ref.Refs: each call to Next returns a ref
// whose digest encodes the current counter value, then increments the counter.
type referrator struct {
count *big.Int // counter behind the next ref; a pointer, so value copies of referrator share it
}
// newReferrator returns a referrator whose counter starts at zero.
func newReferrator() referrator {
	start := big.NewInt(0)
	return referrator{count: start}
}
// Next returns a ref whose digest holds the counter's big-endian bytes
// right-aligned in an otherwise zero digest, then advances the counter by one.
func (r referrator) Next() ref.Ref {
	var digest ref.Sha1Digest
	raw := r.count.Bytes()
	// Right-align the counter bytes so the least significant byte lands last.
	copy(digest[len(digest)-len(raw):], raw)
	next := ref.New(digest)
	one := big.NewInt(1)
	r.count.Add(r.count, one)
	return next
}
+19
View File
@@ -0,0 +1,19 @@
package newset
import (
"testing"
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
)
// TestReferrator checks the first two generated refs and the 513th one
// (counter value 0x200).
func TestReferrator(t *testing.T) {
	assert := assert.New(t)
	ator := newReferrator()

	first := ator.Next().String()
	assert.Equal("sha1-0000000000000000000000000000000000000000", first)
	second := ator.Next().String()
	assert.Equal("sha1-0000000000000000000000000000000000000001", second)

	// Skip ahead so that the counter sits at 512 for the next call.
	for skipped := 0; skipped != 510; skipped++ {
		ator.Next()
	}
	assert.Equal("sha1-0000000000000000000000000000000000000200", ator.Next().String())
}
+13
View File
@@ -0,0 +1,13 @@
package newset
import (
"github.com/attic-labs/noms/ref"
)
// Set is a read-only set of refs. The implementations in this package are
// flatSet and chunkedSet.
type Set interface {
// first returns the leading ref of the set's sorted contents. Both
// implementations index element 0 without checking for emptiness.
first() ref.Ref
// Len returns the number of refs in the set.
Len() uint64
// Has reports whether r is in the set.
Has(r ref.Ref) bool
// Ref returns a ref identifying the set, derived by hashing its contents.
Ref() ref.Ref
// fmt returns a debug rendering of the set prefixed with indent spaces.
fmt(indent int) string
}
+87
View File
@@ -0,0 +1,87 @@
package newset
import (
"github.com/attic-labs/noms/d"
"github.com/attic-labs/noms/ref"
)
// This file is a giant copy-paste, but the architecture of chunking will likely be written in terms of iteration, so deal with it then.
// SetBuilder accumulates refs and produces a Set from them.
type SetBuilder interface {
// AddItem adds r to the set under construction.
// NOTE(review): the builder appends items in call order and the resulting
// sets use binary search, so items presumably must be added in ascending
// ref order — confirm against callers.
AddItem(r ref.Ref)
// Build returns the Set containing every item added so far.
Build() Set
}
// leafSetBuilder is the SetBuilder returned by NewSetBuilderWithChunker. It
// groups added refs into flatSets, closing a flatSet whenever the chunker
// reports a boundary.
type leafSetBuilder struct {
current flatSet // the flatSet currently being filled
chunks []flatSet // flatSets already closed at a chunk boundary
chunker Chunker // decides where one flatSet ends and the next begins
}
// NewSetBuilder returns a SetBuilder that chunks with a new buzChunker.
func NewSetBuilder() SetBuilder {
	defaultChunker := newBuzChunker()
	return NewSetBuilderWithChunker(defaultChunker)
}
// NewSetBuilderWithChunker returns a SetBuilder that uses chunker to decide
// chunk boundaries.
func NewSetBuilderWithChunker(chunker Chunker) SetBuilder {
	b := new(leafSetBuilder)
	b.chunker = chunker
	return b
}
// AddItem appends r to the flatSet being filled. When the chunker reports a
// boundary at r, that flatSet is closed and a new empty one is started.
func (builder *leafSetBuilder) AddItem(r ref.Ref) {
	builder.current.d = append(builder.current.d, r)
	if !builder.chunker.Add(r) {
		return
	}
	builder.chunks = append(builder.chunks, builder.current)
	builder.current = flatSet{}
}
// Build returns the Set holding every item added so far.
//
//   - With no items it returns an empty chunkedSet. Handing zero chunks to the
//     meta builder would otherwise never reach a terminating case.
//   - With exactly one chunk it returns that flatSet directly.
//   - Otherwise the chunks are combined into a chunkedSet by a meta builder
//     that uses a new chunker of the same type.
func (builder *leafSetBuilder) Build() Set {
	if builder.current.Len() > 0 {
		builder.chunks = append(builder.chunks, builder.current)
		// Reset so that calling Build again does not append the same chunk twice.
		builder.current = flatSet{}
	}
	switch len(builder.chunks) {
	case 0:
		return chunkedSet{}
	case 1:
		// Compare against a uint64 zero so both operands have the type Len returns.
		d.Chk.NotEqual(uint64(0), builder.chunks[0].Len())
		return builder.chunks[0]
	}
	mcb := newMetaChunkBuilder(builder.chunker.New())
	for _, c := range builder.chunks {
		mcb.AddItem(c)
	}
	return mcb.Build()
}
// chunkedSetBuilder groups Sets into chunkedSets, closing a chunkedSet
// whenever the chunker reports a boundary on an added Set's ref.
type chunkedSetBuilder struct {
current chunkedSet // the chunkedSet currently being filled
sets []chunkedSet // chunkedSets already closed at a chunk boundary
chunker Chunker // decides where one chunkedSet ends and the next begins
}
// newMetaChunkBuilder returns an empty chunkedSetBuilder that uses chunker.
func newMetaChunkBuilder(chunker Chunker) chunkedSetBuilder {
	var b chunkedSetBuilder
	b.chunker = chunker
	return b
}
// AddItem appends s, keyed by its first ref, to the chunkedSet being filled.
// When the chunker reports a boundary on s's ref, that chunkedSet is closed
// and a new empty one is started.
func (mcb *chunkedSetBuilder) AddItem(s Set) {
	entry := chunkedSetEntry{start: s.first(), set: s}
	mcb.current.children = append(mcb.current.children, entry)
	if !mcb.chunker.Add(s.Ref()) {
		return
	}
	mcb.sets = append(mcb.sets, mcb.current)
	mcb.current = chunkedSet{}
}
// Build returns the chunkedSet holding every Set added so far.
//
//   - With nothing added it returns an empty chunkedSet. Without this case the
//     method would keep creating empty builders and calling Build on them
//     forever.
//   - With exactly one chunkedSet it returns that one.
//   - Otherwise the chunkedSets are themselves chunked one level up, using a
//     new chunker of the same type.
func (mcb *chunkedSetBuilder) Build() chunkedSet {
	if mcb.current.Len() > 0 {
		mcb.sets = append(mcb.sets, mcb.current)
		// Reset so that calling Build again does not append the same set twice.
		mcb.current = chunkedSet{}
	}
	switch len(mcb.sets) {
	case 0:
		return chunkedSet{}
	case 1:
		// Compare against a uint64 zero so both operands have the type Len returns.
		d.Chk.NotEqual(uint64(0), mcb.sets[0].Len())
		return mcb.sets[0]
	}
	b := newMetaChunkBuilder(mcb.chunker.New())
	for _, s := range mcb.sets {
		b.AddItem(s)
	}
	return b.Build()
}
File diff suppressed because one or more lines are too long
+8
View File
@@ -34,6 +34,10 @@ func (r Ref) IsEmpty() bool {
return r.digest == emptyRef.digest
}
// DigestSlice returns the digest as a byte slice. The slice is taken from a
// copy of the ref's digest, not from the caller's Ref.
func (r Ref) DigestSlice() []byte {
	digest := r.digest
	return digest[:]
}
// String returns the ref as "sha1-" followed by the hex encoding of its digest.
func (r Ref) String() string {
	encoded := hex.EncodeToString(r.digest[:])
	return fmt.Sprintf("sha1-%s", encoded)
}
@@ -92,3 +96,7 @@ func Less(r1, r2 Ref) bool {
}
return false
}
// Greater reports whether r1 orders strictly after r2, i.e. the two refs
// differ and r1 is not Less than r2.
func Greater(r1, r2 Ref) bool {
	if r1 == r2 {
		return false
	}
	return !Less(r1, r2)
}
+43
View File
@@ -62,6 +62,15 @@ func TestDigest(t *testing.T) {
assert.NotEqual(t, r.Digest(), d)
}
// TestDigestSlice checks that DigestSlice matches the ref's digest and that
// writing to the returned slice does not alter the ref.
func TestDigestSlice(t *testing.T) {
	r := New(Sha1Digest{})
	got := r.DigestSlice()
	assert.Equal(t, r.DigestSlice(), got)

	// DigestSlice() must return a copy otherwise things get weird.
	got[0] = 1
	assert.NotEqual(t, r.DigestSlice(), got)
}
func TestFromHash(t *testing.T) {
h := sha1.New()
h.Write([]byte("abc"))
@@ -79,3 +88,37 @@ func TestIsEmpty(t *testing.T) {
r3 := Parse("sha1-a9993e364706816aba3e25717850c26c9cd0d89d")
assert.False(t, r3.IsEmpty())
}
// TestLess checks Less on equal, ascending and descending pairs, including the
// zero Ref.
func TestLess(t *testing.T) {
	assert := assert.New(t)

	r0 := Ref{}
	r1 := Parse("sha1-0000000000000000000000000000000000000001")
	r2 := Parse("sha1-0000000000000000000000000000000000000002")

	cases := []struct {
		a, b Ref
		want bool
	}{
		{r1, r1, false},
		{r1, r2, true},
		{r2, r1, false},
		{r2, r2, false},
		{r0, r0, false},
		{r0, r2, true},
		{r2, r0, false},
	}
	for _, c := range cases {
		assert.Equal(c.want, Less(c.a, c.b))
	}
}
// TestGreater checks Greater on equal, ascending and descending pairs,
// including the zero Ref.
func TestGreater(t *testing.T) {
	assert := assert.New(t)

	r0 := Ref{}
	r1 := Parse("sha1-0000000000000000000000000000000000000001")
	r2 := Parse("sha1-0000000000000000000000000000000000000002")

	cases := []struct {
		a, b Ref
		want bool
	}{
		{r1, r1, false},
		{r1, r2, false},
		{r2, r1, true},
		{r2, r2, false},
		{r0, r0, false},
		{r0, r2, false},
		{r2, r0, true},
	}
	for _, c := range cases {
		assert.Equal(c.want, Greater(c.a, c.b))
	}
}