mirror of
https://github.com/dolthub/dolt.git
synced 2026-05-13 03:10:03 -05:00
Add initial version of the new chunked Set implementation.
So far: - It does not support Put. It can only be constructed with a SetBuilder, then queried e.g. Has. - It only supports ref.Refs as values. - I'm not convinced about the naming. Perhaps SetChunker rather than SetBuilder, FlatLevel rather than FlatSet, etc. I expect they will change. - The test is fairly simplistic but there isn't much point testing this exhaustively, since once we support mutation, the testing abstraction will probably be with an Iterator.
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
This is a work-in-progress implementation of a multi-tiered chunked set.
|
||||
|
||||
It's not used in noms, but the idea is to gradually make it efficient, then implement all set operations on it, then replace the noms set implementation.
|
||||
@@ -0,0 +1,35 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/kch42/buzhash"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
// buzPattern is the mask a hash must match in full for buzChunker.Add to
// report a chunk boundary. It has the low 6 bits set, which targets an
// average chunk size of 64 elements.
const buzPattern = uint32(1<<6 - 1)
|
||||
|
||||
type buzChunker struct {
|
||||
h *buzhash.BuzHash
|
||||
}
|
||||
|
||||
func newBuzChunker() *buzChunker {
|
||||
return &buzChunker{newBuzHash()}
|
||||
}
|
||||
|
||||
func (c *buzChunker) Add(r ref.Ref) bool {
|
||||
c.h.Write(r.DigestSlice())
|
||||
isBoundary := c.h.Sum32()&buzPattern == buzPattern
|
||||
if isBoundary {
|
||||
c.h = newBuzHash()
|
||||
}
|
||||
return isBoundary
|
||||
}
|
||||
|
||||
func (c *buzChunker) New() Chunker {
|
||||
return newBuzChunker()
|
||||
}
|
||||
|
||||
func newBuzHash() *buzhash.BuzHash {
|
||||
return buzhash.NewBuzHash(uint32(8 * ref.NewHash().BlockSize()))
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
"github.com/attic-labs/noms/types"
|
||||
)
|
||||
|
||||
func TestNumMatches(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
chunker := newBuzChunker()
|
||||
|
||||
numMatches := 0
|
||||
for i := 0; i < 1000; i++ {
|
||||
if chunker.Add(getRef(i)) {
|
||||
numMatches++
|
||||
}
|
||||
}
|
||||
|
||||
// 20 was experimentally determined by calling Add 1000 times.
|
||||
assert.Equal(20, numMatches)
|
||||
}
|
||||
|
||||
func TestThing(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
// This ref has been experimentally determined to be immediately chunked.
|
||||
r := ref.Parse("sha1-00000000000000000000000000000000000f422f")
|
||||
assert.True(newBuzChunker().Add(r))
|
||||
}
|
||||
|
||||
func getRef(i int) ref.Ref {
|
||||
return types.Int32(i).Ref()
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
type chunkedSet struct {
|
||||
children entrySlice // sorted
|
||||
}
|
||||
|
||||
type chunkedSetEntry struct {
|
||||
start ref.Ref
|
||||
set Set
|
||||
}
|
||||
|
||||
type entrySlice []chunkedSetEntry
|
||||
|
||||
func (es entrySlice) Len() int {
|
||||
return len(es)
|
||||
}
|
||||
|
||||
func (es entrySlice) Less(i, j int) bool {
|
||||
return ref.Less(es[i].start, es[j].start)
|
||||
}
|
||||
|
||||
func (es entrySlice) Swap(i, j int) {
|
||||
es[i], es[j] = es[j], es[i]
|
||||
}
|
||||
|
||||
func (set chunkedSet) Len() (length uint64) {
|
||||
for _, entry := range set.children {
|
||||
length += entry.set.Len()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (set chunkedSet) first() ref.Ref {
|
||||
return set.children[0].start
|
||||
}
|
||||
|
||||
func (set chunkedSet) Has(r ref.Ref) bool {
|
||||
searchIndex := sort.Search(len(set.children), func(i int) bool {
|
||||
return ref.Greater(set.children[i].start, r)
|
||||
})
|
||||
if searchIndex == 0 {
|
||||
return false
|
||||
}
|
||||
searchIndex--
|
||||
return set.children[searchIndex].set.Has(r)
|
||||
}
|
||||
|
||||
func (set chunkedSet) Ref() ref.Ref {
|
||||
// Eventually when chunked sets use noms Values this will need to be derived from the serialization of a chunked set, not simply a hash of all items' refs.
|
||||
h := ref.NewHash()
|
||||
for _, entry := range set.children {
|
||||
h.Write(entry.set.Ref().DigestSlice())
|
||||
}
|
||||
return ref.FromHash(h)
|
||||
}
|
||||
|
||||
func (set chunkedSet) fmt(indent int) string {
|
||||
indentStr := strings.Repeat(" ", indent)
|
||||
if len(set.children) == 0 {
|
||||
return fmt.Sprintf("%s(empty chunked set)", indentStr)
|
||||
}
|
||||
s := fmt.Sprintf("%s(chunked with %d chunks)\n", indentStr, len(set.children))
|
||||
for i, entry := range set.children {
|
||||
s += fmt.Sprintf("%schunk %d (start %s)\n%s\n", indentStr, i, fmtRef(entry.start), entry.set.fmt(indent+4))
|
||||
}
|
||||
return s
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
package newset
|
||||
|
||||
import "github.com/attic-labs/noms/ref"
|
||||
|
||||
type Chunker interface {
|
||||
// Adds a ref to the chunker, and returns whether it results in a chunk boundary.
|
||||
Add(r ref.Ref) bool
|
||||
// Returns a new instance of this chunker's type. This is really a factory method hiding on an instance which is a bit icky.
|
||||
New() Chunker
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
type flatSet struct {
|
||||
d ref.RefSlice // sorted
|
||||
r *ref.Ref
|
||||
}
|
||||
|
||||
func (s flatSet) Len() uint64 {
|
||||
return uint64(len(s.d))
|
||||
}
|
||||
|
||||
func (s flatSet) Has(r ref.Ref) bool {
|
||||
idx := s.searchForIndex(r)
|
||||
return idx != len(s.d) && s.d[idx] == r
|
||||
}
|
||||
|
||||
func (s flatSet) first() ref.Ref {
|
||||
return s.d[0]
|
||||
}
|
||||
|
||||
func (s flatSet) Ref() ref.Ref {
|
||||
if s.r == nil {
|
||||
h := ref.NewHash()
|
||||
for _, r := range s.d {
|
||||
h.Write(r.DigestSlice())
|
||||
}
|
||||
r := ref.FromHash(h)
|
||||
s.r = &r
|
||||
}
|
||||
return *s.r
|
||||
}
|
||||
|
||||
func (s flatSet) fmt(indent int) string {
|
||||
indentStr := strings.Repeat(" ", indent)
|
||||
if len(s.d) == 1 {
|
||||
return fmt.Sprintf("%sflat %s", indentStr, fmtRef(s.d[0]))
|
||||
}
|
||||
return fmt.Sprintf("%sflat{%s...(%d more)...%s}", indentStr, fmtRef(s.d[0]), len(s.d)-2, fmtRef(s.d[len(s.d)-1]))
|
||||
}
|
||||
|
||||
func (s flatSet) searchForIndex(r ref.Ref) int {
|
||||
return sort.Search(len(s.d), func(i int) bool {
|
||||
return !ref.Less(s.d[i], r)
|
||||
})
|
||||
}
|
||||
|
||||
func fmtRef(r ref.Ref) string {
|
||||
str := r.String()
|
||||
return str[len(str)-8:]
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"math/big"
|
||||
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
// Generates fake ascending ref.Ref-s.
|
||||
// referrator generates fake ascending ref.Ref-s.
type referrator struct {
	// count is the value encoded into the next ref. It is a pointer so that
	// Next, which has a value receiver, can still advance it.
	count *big.Int
}
|
||||
|
||||
func newReferrator() referrator {
|
||||
return referrator{big.NewInt(int64(0))}
|
||||
}
|
||||
|
||||
func (r referrator) Next() ref.Ref {
|
||||
digest := ref.Sha1Digest{}
|
||||
bytes := r.count.Bytes()
|
||||
for i := 0; i < len(bytes); i++ {
|
||||
digest[len(digest)-i-1] = bytes[len(bytes)-i-1]
|
||||
}
|
||||
|
||||
result := ref.New(digest)
|
||||
r.count.Add(r.count, big.NewInt(int64(1)))
|
||||
return result
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestReferrator(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
ator := newReferrator()
|
||||
assert.Equal("sha1-0000000000000000000000000000000000000000", ator.Next().String())
|
||||
assert.Equal("sha1-0000000000000000000000000000000000000001", ator.Next().String())
|
||||
for i := 0; i < 510; i++ {
|
||||
ator.Next()
|
||||
}
|
||||
assert.Equal("sha1-0000000000000000000000000000000000000200", ator.Next().String())
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
type Set interface {
|
||||
first() ref.Ref
|
||||
Len() uint64
|
||||
Has(r ref.Ref) bool
|
||||
Ref() ref.Ref
|
||||
fmt(indent int) string
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
package newset
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/d"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
// This file is a giant copy-paste, but the architecture of chunking will likely be written in terms of iteration, so deal with it then.
|
||||
type SetBuilder interface {
|
||||
AddItem(r ref.Ref)
|
||||
Build() Set
|
||||
}
|
||||
|
||||
type leafSetBuilder struct {
|
||||
current flatSet
|
||||
chunks []flatSet
|
||||
chunker Chunker
|
||||
}
|
||||
|
||||
func NewSetBuilder() SetBuilder {
|
||||
return NewSetBuilderWithChunker(newBuzChunker())
|
||||
}
|
||||
|
||||
func NewSetBuilderWithChunker(chunker Chunker) SetBuilder {
|
||||
return &leafSetBuilder{chunker: chunker}
|
||||
}
|
||||
|
||||
func (builder *leafSetBuilder) AddItem(r ref.Ref) {
|
||||
builder.current.d = append(builder.current.d, r)
|
||||
if builder.chunker.Add(r) {
|
||||
builder.chunks = append(builder.chunks, builder.current)
|
||||
builder.current = flatSet{}
|
||||
}
|
||||
}
|
||||
|
||||
func (builder *leafSetBuilder) Build() Set {
|
||||
if builder.current.Len() > uint64(0) {
|
||||
builder.chunks = append(builder.chunks, builder.current)
|
||||
}
|
||||
|
||||
if len(builder.chunks) == 1 {
|
||||
d.Chk.NotEqual(0, builder.chunks[0].Len())
|
||||
return builder.chunks[0]
|
||||
}
|
||||
|
||||
mcb := newMetaChunkBuilder(builder.chunker.New())
|
||||
for _, c := range builder.chunks {
|
||||
mcb.AddItem(c)
|
||||
}
|
||||
|
||||
return mcb.Build()
|
||||
}
|
||||
|
||||
type chunkedSetBuilder struct {
|
||||
current chunkedSet
|
||||
sets []chunkedSet
|
||||
chunker Chunker
|
||||
}
|
||||
|
||||
func newMetaChunkBuilder(chunker Chunker) chunkedSetBuilder {
|
||||
return chunkedSetBuilder{chunker: chunker}
|
||||
}
|
||||
|
||||
func (mcb *chunkedSetBuilder) AddItem(s Set) {
|
||||
mcb.current.children = append(mcb.current.children, chunkedSetEntry{s.first(), s})
|
||||
if mcb.chunker.Add(s.Ref()) {
|
||||
mcb.sets = append(mcb.sets, mcb.current)
|
||||
mcb.current = chunkedSet{}
|
||||
}
|
||||
}
|
||||
|
||||
func (mcb *chunkedSetBuilder) Build() chunkedSet {
|
||||
if mcb.current.Len() > 0 {
|
||||
mcb.sets = append(mcb.sets, mcb.current)
|
||||
}
|
||||
|
||||
if len(mcb.sets) == 1 {
|
||||
d.Chk.NotEqual(0, mcb.sets[0].Len())
|
||||
return mcb.sets[0]
|
||||
}
|
||||
|
||||
b := newMetaChunkBuilder(mcb.chunker.New())
|
||||
for _, s := range mcb.sets {
|
||||
b.AddItem(s)
|
||||
}
|
||||
return b.Build()
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -34,6 +34,10 @@ func (r Ref) IsEmpty() bool {
|
||||
return r.digest == emptyRef.digest
|
||||
}
|
||||
|
||||
func (r Ref) DigestSlice() []byte {
|
||||
return r.digest[:]
|
||||
}
|
||||
|
||||
func (r Ref) String() string {
|
||||
return fmt.Sprintf("sha1-%s", hex.EncodeToString(r.digest[:]))
|
||||
}
|
||||
@@ -92,3 +96,7 @@ func Less(r1, r2 Ref) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func Greater(r1, r2 Ref) bool {
|
||||
return !Less(r1, r2) && r1 != r2
|
||||
}
|
||||
|
||||
@@ -62,6 +62,15 @@ func TestDigest(t *testing.T) {
|
||||
assert.NotEqual(t, r.Digest(), d)
|
||||
}
|
||||
|
||||
func TestDigestSlice(t *testing.T) {
|
||||
r := New(Sha1Digest{})
|
||||
d := r.DigestSlice()
|
||||
assert.Equal(t, r.DigestSlice(), d)
|
||||
// DigestSlice() must return a copy otherwise things get weird.
|
||||
d[0] = 0x01
|
||||
assert.NotEqual(t, r.DigestSlice(), d)
|
||||
}
|
||||
|
||||
func TestFromHash(t *testing.T) {
|
||||
h := sha1.New()
|
||||
h.Write([]byte("abc"))
|
||||
@@ -79,3 +88,37 @@ func TestIsEmpty(t *testing.T) {
|
||||
r3 := Parse("sha1-a9993e364706816aba3e25717850c26c9cd0d89d")
|
||||
assert.False(t, r3.IsEmpty())
|
||||
}
|
||||
|
||||
func TestLess(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
r1 := Parse("sha1-0000000000000000000000000000000000000001")
|
||||
r2 := Parse("sha1-0000000000000000000000000000000000000002")
|
||||
|
||||
assert.False(Less(r1, r1))
|
||||
assert.True(Less(r1, r2))
|
||||
assert.False(Less(r2, r1))
|
||||
assert.False(Less(r2, r2))
|
||||
|
||||
r0 := Ref{}
|
||||
assert.False(Less(r0, r0))
|
||||
assert.True(Less(r0, r2))
|
||||
assert.False(Less(r2, r0))
|
||||
}
|
||||
|
||||
func TestGreater(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
r1 := Parse("sha1-0000000000000000000000000000000000000001")
|
||||
r2 := Parse("sha1-0000000000000000000000000000000000000002")
|
||||
|
||||
assert.False(Greater(r1, r1))
|
||||
assert.False(Greater(r1, r2))
|
||||
assert.True(Greater(r2, r1))
|
||||
assert.False(Greater(r2, r2))
|
||||
|
||||
r0 := Ref{}
|
||||
assert.False(Greater(r0, r0))
|
||||
assert.False(Greater(r0, r2))
|
||||
assert.True(Greater(r2, r0))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user