mirror of
https://github.com/dolthub/dolt.git
synced 2026-05-04 11:30:14 -05:00
Chunking: Multi level chunking for blobs
After a compound blob is created, we try to chunk it again, similar to how we chunk Lists. We take the refs of the sub-blobs and compute a rolling hash over them. If the hash matches a pattern, we split the existing compound blob into a new compound blob whose sub-blobs are slices of the original compound blob. Issue #17
This commit is contained in:
+4
-1
@@ -55,7 +55,10 @@ func NewBlob(r io.Reader) (Blob, error) {
|
||||
if len(blobs) == 1 {
|
||||
return blob, nil
|
||||
}
|
||||
return compoundBlob{offsets, blobs, &ref.Ref{}, nil}, nil
|
||||
|
||||
co := compoundObject{offsets, blobs, &ref.Ref{}, nil}
|
||||
co = splitCompoundObject(co, compoundObjectToBlobFuture)
|
||||
return compoundBlob{co}, nil
|
||||
}
|
||||
|
||||
func BlobFromVal(v Value) Blob {
|
||||
|
||||
+8
-21
@@ -12,10 +12,11 @@ import (
|
||||
// compoundBlob represents a list of Blobs.
|
||||
// It implements the Blob interface.
|
||||
type compoundBlob struct {
|
||||
offsets []uint64 // The offsets of the end of the related blobs.
|
||||
blobs []Future
|
||||
ref *ref.Ref
|
||||
cs chunks.ChunkSource
|
||||
compoundObject
|
||||
}
|
||||
|
||||
func newCompoundBlob(offsets []uint64, futures []Future, cs chunks.ChunkSource) compoundBlob {
|
||||
return compoundBlob{compoundObject{offsets, futures, &ref.Ref{}, cs}}
|
||||
}
|
||||
|
||||
// Reader implements the Blob interface
|
||||
@@ -31,7 +32,7 @@ type compoundBlobReader struct {
|
||||
}
|
||||
|
||||
func (cbr *compoundBlobReader) Read(p []byte) (n int, err error) {
|
||||
for cbr.currentBlobIndex < len(cbr.cb.blobs) {
|
||||
for cbr.currentBlobIndex < len(cbr.cb.futures) {
|
||||
if cbr.currentReader == nil {
|
||||
if err = cbr.updateReader(); err != nil {
|
||||
return
|
||||
@@ -97,8 +98,8 @@ func (cbr *compoundBlobReader) findBlobOffset(abs uint64) int {
|
||||
}
|
||||
|
||||
func (cbr *compoundBlobReader) updateReader() error {
|
||||
if cbr.currentBlobIndex < len(cbr.cb.blobs) {
|
||||
v := cbr.cb.blobs[cbr.currentBlobIndex].Deref(cbr.cb.cs)
|
||||
if cbr.currentBlobIndex < len(cbr.cb.futures) {
|
||||
v := cbr.cb.futures[cbr.currentBlobIndex].Deref(cbr.cb.cs)
|
||||
cbr.currentReader = v.(Blob).Reader()
|
||||
} else {
|
||||
cbr.currentReader = nil
|
||||
@@ -106,11 +107,6 @@ func (cbr *compoundBlobReader) updateReader() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Len implements the Blob interface
|
||||
func (cb compoundBlob) Len() uint64 {
|
||||
return cb.offsets[len(cb.offsets)-1]
|
||||
}
|
||||
|
||||
func (cb compoundBlob) Ref() ref.Ref {
|
||||
return ensureRef(cb.ref, cb)
|
||||
}
|
||||
@@ -121,12 +117,3 @@ func (cb compoundBlob) Equals(other Value) bool {
|
||||
}
|
||||
return cb.Ref() == other.Ref()
|
||||
}
|
||||
|
||||
func (cb compoundBlob) Chunks() (futures []Future) {
|
||||
for _, f := range cb.blobs {
|
||||
if f, ok := f.(*unresolvedFuture); ok {
|
||||
futures = append(futures, f)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
+97
-43
@@ -2,14 +2,15 @@ package types
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"math/rand"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
|
||||
"github.com/attic-labs/noms/chunks"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
func getTestCompoundBlob(datas ...string) compoundBlob {
|
||||
@@ -22,23 +23,45 @@ func getTestCompoundBlob(datas ...string) compoundBlob {
|
||||
length += uint64(len(s))
|
||||
offsets[i] = length
|
||||
}
|
||||
return compoundBlob{offsets, blobs, &ref.Ref{}, nil}
|
||||
return newCompoundBlob(offsets, blobs, nil)
|
||||
}
|
||||
|
||||
func getAliceBlob(t *testing.T) compoundBlob {
|
||||
assert := assert.New(t)
|
||||
f, err := os.Open("alice-short.txt")
|
||||
assert.NoError(err)
|
||||
defer f.Close()
|
||||
// randReader is a test helper that yields a bounded stream of pseudo-random
// bytes drawn from a rand.Source.
//
// Field order matters: getRandomReader builds the struct positionally.
type randReader struct {
	s       rand.Source // source the bytes are drawn from
	i, size int         // i counts bytes produced so far; size is the total to produce
}
|
||||
|
||||
b, err := NewBlob(f)
|
||||
assert.NoError(err)
|
||||
cb, ok := b.(compoundBlob)
|
||||
assert.True(ok)
|
||||
return cb
|
||||
func (r *randReader) Read(p []byte) (n int, err error) {
|
||||
start := r.i
|
||||
for i := range p {
|
||||
if r.i == r.size {
|
||||
return r.i - start, io.EOF
|
||||
}
|
||||
p[i] = byte(r.s.Int63() & 0xff)
|
||||
r.i++
|
||||
}
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
func getRandomReader() io.Reader {
|
||||
return &randReader{rand.NewSource(42), 0, 5e5}
|
||||
}
|
||||
|
||||
func getRandomBlob(t *testing.T) compoundBlob {
|
||||
if testing.Short() {
|
||||
t.Skip("Skipping test in short mode.")
|
||||
}
|
||||
r := getRandomReader()
|
||||
b, err := NewBlob(r)
|
||||
assert.NoError(t, err)
|
||||
return b.(compoundBlob)
|
||||
}
|
||||
|
||||
func TestCompoundBlobReader(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("Skipping test in short mode.")
|
||||
}
|
||||
assert := assert.New(t)
|
||||
cs := &chunks.MemoryStore{}
|
||||
|
||||
@@ -47,13 +70,11 @@ func TestCompoundBlobReader(t *testing.T) {
|
||||
assert.NoError(err)
|
||||
assert.Equal("helloworld", string(bs))
|
||||
|
||||
ab := getAliceBlob(t)
|
||||
ab := getRandomBlob(t)
|
||||
bs, err = ioutil.ReadAll(ab.Reader())
|
||||
assert.NoError(err)
|
||||
f, err := os.Open("alice-short.txt")
|
||||
assert.NoError(err)
|
||||
defer f.Close()
|
||||
bs2, err := ioutil.ReadAll(f)
|
||||
r := getRandomReader()
|
||||
bs2, err := ioutil.ReadAll(r)
|
||||
assert.Equal(bs2, bs)
|
||||
|
||||
ref := WriteValue(cb, cs)
|
||||
@@ -85,7 +106,7 @@ func TestCompoundBlobReaderLazy(t *testing.T) {
|
||||
b2 := newBlobLeaf([]byte("bye"))
|
||||
tb2 := &testBlob{b2, &readCount2}
|
||||
|
||||
cb := compoundBlob{[]uint64{2, 5}, []Future{futureFromValue(tb1), futureFromValue(tb2)}, &ref.Ref{}, nil}
|
||||
cb := newCompoundBlob([]uint64{2, 5}, []Future{futureFromValue(tb1), futureFromValue(tb2)}, nil)
|
||||
|
||||
r := cb.Reader()
|
||||
assert.Equal(0, readCount1)
|
||||
@@ -128,7 +149,7 @@ func TestCompoundBlobReaderLazySeek(t *testing.T) {
|
||||
b2 := newBlobLeaf([]byte("bye"))
|
||||
tb2 := &testBlob{b2, &readCount2}
|
||||
|
||||
cb := compoundBlob{[]uint64{2, 5}, []Future{futureFromValue(tb1), futureFromValue(tb2)}, &ref.Ref{}, nil}
|
||||
cb := newCompoundBlob([]uint64{2, 5}, []Future{futureFromValue(tb1), futureFromValue(tb2)}, nil)
|
||||
|
||||
r := cb.Reader()
|
||||
|
||||
@@ -217,8 +238,8 @@ func TestCompoundBlobLen(t *testing.T) {
|
||||
cb := getTestCompoundBlob("hello", "world")
|
||||
assert.Equal(uint64(10), cb.Len())
|
||||
|
||||
ab := getAliceBlob(t)
|
||||
assert.Equal(uint64(30157), ab.Len())
|
||||
ab := getRandomBlob(t)
|
||||
assert.Equal(uint64(5e5), ab.Len())
|
||||
}
|
||||
|
||||
func TestCompoundBlobChunks(t *testing.T) {
|
||||
@@ -231,54 +252,87 @@ func TestCompoundBlobChunks(t *testing.T) {
|
||||
bl1 := newBlobLeaf([]byte("hello"))
|
||||
blr1 := bl1.Ref()
|
||||
bl2 := newBlobLeaf([]byte("world"))
|
||||
cb = compoundBlob{[]uint64{5, 10}, []Future{futureFromRef(blr1), futureFromValue(bl2)}, &ref.Ref{}, cs}
|
||||
cb = newCompoundBlob([]uint64{5, 10}, []Future{futureFromRef(blr1), futureFromValue(bl2)}, cs)
|
||||
assert.Equal(1, len(cb.Chunks()))
|
||||
}
|
||||
|
||||
func TestCompoundBlobSameChunksWithPrefix(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
cb1 := getAliceBlob(t)
|
||||
cb1 := getRandomBlob(t)
|
||||
|
||||
// Load same file again but prepend some data... all but the first chunk should stay the same
|
||||
f, err := os.Open("alice-short.txt")
|
||||
assert.NoError(err)
|
||||
defer f.Close()
|
||||
rr := getRandomReader()
|
||||
buf := bytes.NewBufferString("prefix")
|
||||
r := io.MultiReader(buf, f)
|
||||
r := io.MultiReader(buf, rr)
|
||||
|
||||
b, err := NewBlob(r)
|
||||
assert.NoError(err)
|
||||
cb2 := b.(compoundBlob)
|
||||
|
||||
// cb1: chunks 2
|
||||
// chunks 21 - only first chunk is different
|
||||
// chunks 31
|
||||
// cb2: chunks 2
|
||||
// chunks 21
|
||||
// chunks 31
|
||||
|
||||
assert.Equal(cb2.Len(), cb1.Len()+uint64(6))
|
||||
assert.Equal(3, len(cb1.blobs))
|
||||
assert.Equal(len(cb1.blobs), len(cb2.blobs))
|
||||
assert.NotEqual(cb1.blobs[0].Ref(), cb2.blobs[0].Ref())
|
||||
assert.Equal(cb1.blobs[1].Ref(), cb2.blobs[1].Ref())
|
||||
assert.Equal(cb1.blobs[2].Ref(), cb2.blobs[2].Ref())
|
||||
assert.Equal(2, len(cb1.futures))
|
||||
assert.Equal(2, len(cb2.futures))
|
||||
assert.NotEqual(cb1.futures[0].Ref(), cb2.futures[0].Ref())
|
||||
assert.Equal(cb1.futures[1].Ref(), cb2.futures[1].Ref())
|
||||
|
||||
futures1 := cb1.futures[0].Deref(nil).(compoundBlob).futures
|
||||
futures2 := cb2.futures[0].Deref(nil).(compoundBlob).futures
|
||||
assert.NotEqual(futures1[0].Ref(), futures2[0].Ref())
|
||||
assert.Equal(futures1[1].Ref(), futures2[1].Ref())
|
||||
}
|
||||
|
||||
func TestCompoundBlobSameChunksWithSuffix(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
cb1 := getAliceBlob(t)
|
||||
cb1 := getRandomBlob(t)
|
||||
|
||||
// Load same file again but append some data... all but the last chunk should stay the same
|
||||
f, err := os.Open("alice-short.txt")
|
||||
assert.NoError(err)
|
||||
defer f.Close()
|
||||
rr := getRandomReader()
|
||||
buf := bytes.NewBufferString("suffix")
|
||||
r := io.MultiReader(f, buf)
|
||||
r := io.MultiReader(rr, buf)
|
||||
|
||||
b, err := NewBlob(r)
|
||||
assert.NoError(err)
|
||||
cb2 := b.(compoundBlob)
|
||||
|
||||
// cb1: chunks 2
|
||||
// chunks 21
|
||||
// chunks 31
|
||||
// cb2: chunks 2
|
||||
// chunks 21
|
||||
// chunks 31 - only last chunk is different
|
||||
|
||||
assert.Equal(cb2.Len(), cb1.Len()+uint64(6))
|
||||
assert.Equal(3, len(cb1.blobs))
|
||||
assert.Equal(len(cb1.blobs), len(cb2.blobs))
|
||||
assert.Equal(cb1.blobs[0].Ref(), cb2.blobs[0].Ref())
|
||||
assert.Equal(cb1.blobs[1].Ref(), cb2.blobs[1].Ref())
|
||||
assert.NotEqual(cb1.blobs[2].Ref(), cb2.blobs[2].Ref())
|
||||
assert.Equal(2, len(cb1.futures))
|
||||
assert.Equal(len(cb1.futures), len(cb2.futures))
|
||||
assert.Equal(cb1.futures[0].Ref(), cb2.futures[0].Ref())
|
||||
assert.NotEqual(cb1.futures[1].Ref(), cb2.futures[1].Ref())
|
||||
|
||||
futures1 := cb1.futures[1].Deref(nil).(compoundBlob).futures
|
||||
futures2 := cb2.futures[1].Deref(nil).(compoundBlob).futures
|
||||
assert.Equal(futures1[0].Ref(), futures2[0].Ref())
|
||||
assert.Equal(futures1[len(futures1)-2].Ref(), futures2[len(futures2)-2].Ref())
|
||||
assert.NotEqual(futures1[len(futures1)-1].Ref(), futures2[len(futures2)-1].Ref())
|
||||
}
|
||||
|
||||
func printBlob(b Blob, indent int) {
|
||||
indentString := strings.Repeat("| ", indent)
|
||||
switch b := b.(type) {
|
||||
case blobLeaf:
|
||||
fmt.Printf("%sblobLeaf, len: %d\n", indentString, b.Len())
|
||||
case compoundBlob:
|
||||
fmt.Printf("%scompoundBlob, len: %d, chunks: %d\n", indentString, b.Len(), len(b.offsets))
|
||||
indent++
|
||||
for _, sb := range b.futures {
|
||||
printBlob(sb.Deref(b.cs).(Blob), indent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+23
-35
@@ -19,10 +19,7 @@ const (
|
||||
// compoundList implements the List interface
|
||||
// compoundList implements the Value interface
|
||||
type compoundList struct {
|
||||
offsets []uint64 // The offsets are the end offsets between child lists
|
||||
lists []Future
|
||||
ref *ref.Ref
|
||||
cs chunks.ChunkSource
|
||||
compoundObject
|
||||
}
|
||||
|
||||
// listChunker is used to create a compoundList or a listLeaf.
|
||||
@@ -31,7 +28,7 @@ type compoundList struct {
|
||||
// we split the list at that point.
|
||||
type listChunker struct {
|
||||
h *buzhash.BuzHash
|
||||
lists []Future
|
||||
futures []Future
|
||||
offsets []uint64
|
||||
currentList []Future // Accumulated Futures as the list is built.
|
||||
cs chunks.ChunkSource
|
||||
@@ -48,15 +45,15 @@ func newListChunker(cs chunks.ChunkSource) *listChunker {
|
||||
func newListChunkerFromList(l compoundList, startIdx uint64) *listChunker {
|
||||
lc := newListChunker(l.cs)
|
||||
si := findSubIndex(startIdx, l.offsets)
|
||||
lc.lists = make([]Future, si)
|
||||
copy(lc.lists, l.lists)
|
||||
lc.futures = make([]Future, si)
|
||||
copy(lc.futures, l.futures)
|
||||
lc.offsets = make([]uint64, si)
|
||||
copy(lc.offsets, l.offsets)
|
||||
offset := uint64(0)
|
||||
if si > 0 {
|
||||
offset += l.offsets[si-1]
|
||||
}
|
||||
lastList := l.lists[si].Deref(l.cs).(List)
|
||||
lastList := l.futures[si].Deref(l.cs).(List)
|
||||
it := newListIterator(lastList)
|
||||
for i := uint64(0); i < startIdx-offset; i++ {
|
||||
f, done := it.next()
|
||||
@@ -88,7 +85,7 @@ func (lc *listChunker) writeFuture(f Future) (split bool) {
|
||||
|
||||
func (lc *listChunker) addChunk() {
|
||||
list := listLeafFromFutures(lc.currentList, lc.cs)
|
||||
lc.lists = append(lc.lists, futureFromValue(list))
|
||||
lc.futures = append(lc.futures, futureFromValue(list))
|
||||
offset := uint64(len(lc.currentList))
|
||||
if len(lc.offsets) > 0 {
|
||||
offset += lc.offsets[len(lc.offsets)-1]
|
||||
@@ -116,7 +113,7 @@ func (lc *listChunker) writeTail(cl compoundList, idx, added uint64) {
|
||||
if lc.writeFuture(f) {
|
||||
// if cl has a split at this index then the rest can be copied.
|
||||
if sc, si := cl.startsChunk(i - added + 1); sc {
|
||||
lc.lists = append(lc.lists, cl.lists[si:]...)
|
||||
lc.futures = append(lc.futures, cl.futures[si:]...)
|
||||
lc.offsets = append(lc.offsets, cl.offsets[si:]...)
|
||||
break
|
||||
}
|
||||
@@ -125,21 +122,17 @@ func (lc *listChunker) writeTail(cl compoundList, idx, added uint64) {
|
||||
}
|
||||
|
||||
func (lc *listChunker) makeList() List {
|
||||
if len(lc.lists) == 0 {
|
||||
if len(lc.futures) == 0 {
|
||||
return listLeafFromFutures(lc.currentList, lc.cs)
|
||||
}
|
||||
if len(lc.currentList) > 0 {
|
||||
lc.addChunk()
|
||||
}
|
||||
// In case we get a single child list just return that instead.
|
||||
if len(lc.lists) == 1 {
|
||||
return lc.lists[0].Deref(lc.cs).(List)
|
||||
if len(lc.futures) == 1 {
|
||||
return lc.futures[0].Deref(lc.cs).(List)
|
||||
}
|
||||
return compoundList{lc.offsets, lc.lists, &ref.Ref{}, lc.cs}
|
||||
}
|
||||
|
||||
func (cl compoundList) Len() uint64 {
|
||||
return cl.offsets[len(cl.offsets)-1]
|
||||
return newCompoundList(lc.offsets, lc.futures, lc.cs)
|
||||
}
|
||||
|
||||
func (cl compoundList) Empty() bool {
|
||||
@@ -158,7 +151,7 @@ func (cl compoundList) Get(idx uint64) Value {
|
||||
|
||||
func (cl compoundList) getFuture(idx uint64) Future {
|
||||
si := findSubIndex(idx, cl.offsets)
|
||||
f := cl.lists[si]
|
||||
f := cl.futures[si]
|
||||
l := f.Deref(cl.cs).(List)
|
||||
if si > 0 {
|
||||
idx -= cl.offsets[si-1]
|
||||
@@ -188,18 +181,18 @@ func (cl compoundList) Set(idx uint64, v Value) List {
|
||||
func (cl compoundList) Append(vs ...Value) List {
|
||||
// Redo chunking from last chunk.
|
||||
d.Chk.False(cl.Empty())
|
||||
d.Chk.True(len(cl.lists) > 1)
|
||||
d.Chk.True(len(cl.futures) > 1)
|
||||
|
||||
l := len(cl.offsets)
|
||||
offsets := make([]uint64, l-1, l)
|
||||
copy(offsets, cl.offsets)
|
||||
l = len(cl.lists)
|
||||
l = len(cl.futures)
|
||||
lists := make([]Future, l-1, l)
|
||||
copy(lists, cl.lists)
|
||||
lastList := cl.lists[l-1].Deref(cl.cs).(List)
|
||||
copy(lists, cl.futures)
|
||||
lastList := cl.futures[l-1].Deref(cl.cs).(List)
|
||||
|
||||
lc := newListChunker(cl.cs)
|
||||
lc.lists = lists
|
||||
lc.futures = lists
|
||||
lc.offsets = offsets
|
||||
|
||||
// Append elements from last list again.
|
||||
@@ -246,7 +239,7 @@ func (cl compoundList) Ref() ref.Ref {
|
||||
}
|
||||
|
||||
func (cl compoundList) Release() {
|
||||
for _, f := range cl.lists {
|
||||
for _, f := range cl.futures {
|
||||
f.Release()
|
||||
}
|
||||
}
|
||||
@@ -258,15 +251,6 @@ func (cl compoundList) Equals(other Value) bool {
|
||||
return cl.Ref() == other.Ref()
|
||||
}
|
||||
|
||||
func (cl compoundList) Chunks() (futures []Future) {
|
||||
for _, f := range cl.lists {
|
||||
if f, ok := f.(*unresolvedFuture); ok {
|
||||
futures = append(futures, f)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// startsChunk determines if idx refers to the first element in one of cl's chunks.
|
||||
// If so, it also returns the index of the chunk into which idx points.
|
||||
func (cl compoundList) startsChunk(idx uint64) (bool, uint64) {
|
||||
@@ -278,7 +262,11 @@ func (cl compoundList) startsChunk(idx uint64) (bool, uint64) {
|
||||
return offset == idx, uint64(si)
|
||||
}
|
||||
|
||||
func newCompoundList(vs []Value, cs chunks.ChunkSource) List {
|
||||
func newCompoundList(offsets []uint64, futures []Future, cs chunks.ChunkSource) compoundList {
|
||||
return compoundList{compoundObject{offsets, futures, &ref.Ref{}, cs}}
|
||||
}
|
||||
|
||||
func newCompoundListFromValues(vs []Value, cs chunks.ChunkSource) List {
|
||||
l := uint64(len(vs))
|
||||
// Always use a list leaf for empty and single element lists.
|
||||
if l < 2 {
|
||||
|
||||
+10
-11
@@ -8,7 +8,6 @@ import (
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
|
||||
"github.com/attic-labs/noms/chunks"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
func getFakeCompoundList(datas ...string) compoundList {
|
||||
@@ -24,7 +23,7 @@ func getFakeCompoundList(datas ...string) compoundList {
|
||||
length += l.Len()
|
||||
offsets[i] = length
|
||||
}
|
||||
return compoundList{offsets, futures, &ref.Ref{}, nil}
|
||||
return newCompoundList(offsets, futures, nil)
|
||||
}
|
||||
|
||||
func getTestCompoundList(t *testing.T) List {
|
||||
@@ -57,7 +56,7 @@ func getWordsInAlice(t *testing.T) []Value {
|
||||
}
|
||||
|
||||
func getAliceList(t *testing.T) compoundList {
|
||||
return newCompoundList(getWordsInAlice(t), nil).(compoundList)
|
||||
return newCompoundListFromValues(getWordsInAlice(t), nil).(compoundList)
|
||||
}
|
||||
|
||||
func TestCompoundListLen(t *testing.T) {
|
||||
@@ -80,7 +79,7 @@ func TestCompoundListChunks(t *testing.T) {
|
||||
ll1 := NewList(NewString("h"), NewString("i"))
|
||||
llr1 := ll1.Ref()
|
||||
ll2 := NewList(NewString("b"), NewString("y"), NewString("e"))
|
||||
cl = compoundList{[]uint64{2, 5}, []Future{futureFromRef(llr1), futureFromValue(ll2)}, &ref.Ref{}, cs}
|
||||
cl = newCompoundList([]uint64{2, 5}, []Future{futureFromRef(llr1), futureFromValue(ll2)}, cs)
|
||||
assert.Equal(1, len(cl.Chunks()))
|
||||
}
|
||||
|
||||
@@ -106,16 +105,16 @@ func TestCompoundListReadWriteValue(t *testing.T) {
|
||||
assert.True(v.Equals(cl))
|
||||
}
|
||||
|
||||
func TestNewCompoundList(t *testing.T) {
|
||||
func TestnewCompoundListFromValues(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
vs := newCompoundList([]Value{}, nil)
|
||||
vs := newCompoundListFromValues([]Value{}, nil)
|
||||
assert.Equal(uint64(0), vs.Len())
|
||||
|
||||
vs = newCompoundList([]Value{NewString("a")}, nil)
|
||||
vs = newCompoundListFromValues([]Value{NewString("a")}, nil)
|
||||
assert.Equal(uint64(1), vs.Len())
|
||||
|
||||
vs = newCompoundList([]Value{NewString("h"), NewString("i")}, nil)
|
||||
vs = newCompoundListFromValues([]Value{NewString("h"), NewString("i")}, nil)
|
||||
assert.Equal(uint64(2), vs.Len())
|
||||
}
|
||||
|
||||
@@ -135,12 +134,12 @@ func TestCompoundListAppend(t *testing.T) {
|
||||
|
||||
cl2, ok := l2.(compoundList)
|
||||
assert.True(ok)
|
||||
assert.Equal(2, len(cl2.lists))
|
||||
assert.Equal(2, len(cl2.futures))
|
||||
|
||||
// It should not matter how the list was made
|
||||
words := getWordsInAlice(t)
|
||||
al1 := newCompoundList(words, nil)
|
||||
al2 := newCompoundList(words[0:len(words)/2], nil)
|
||||
al1 := newCompoundListFromValues(words, nil)
|
||||
al2 := newCompoundListFromValues(words[0:len(words)/2], nil)
|
||||
al2 = al2.Append(words[len(words)/2:]...)
|
||||
assert.True(al1.Equals(al2))
|
||||
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
package types
|
||||
|
||||
import (
|
||||
"crypto/sha1"
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/kch42/buzhash"
|
||||
"github.com/attic-labs/noms/chunks"
|
||||
"github.com/attic-labs/noms/d"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
// Rolling-hash parameters used by splitCompoundObject when it re-chunks the
// children of a compound object.
const (
	// objectWindowSize is the buzhash window size: eight times the size of a
	// SHA-1 digest (presumably eight child refs; confirm against ref.Ref).
	objectWindowSize = 8 * sha1.Size

	// objectPattern is the mask compared against the rolling hash; a chunk
	// boundary is declared when all six low bits are set.
	// Average size of 64 elements.
	objectPattern = uint32(1<<6 - 1)
)
|
||||
|
||||
type compoundObject struct {
|
||||
offsets []uint64
|
||||
futures []Future
|
||||
ref *ref.Ref
|
||||
cs chunks.ChunkSource
|
||||
}
|
||||
|
||||
func (co compoundObject) Len() uint64 {
|
||||
return co.offsets[len(co.offsets)-1]
|
||||
}
|
||||
|
||||
func (co compoundObject) Chunks() (futures []Future) {
|
||||
for _, f := range co.futures {
|
||||
if f, ok := f.(*unresolvedFuture); ok {
|
||||
futures = append(futures, f)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
type compoundObjectToFuture func(co compoundObject) Future
|
||||
|
||||
func compoundObjectToBlobFuture(co compoundObject) Future {
|
||||
return futureFromValue(compoundBlob{co})
|
||||
}
|
||||
|
||||
// splitCompoundObject chunks a compound list/blob into smaller compound
|
||||
// lists/blobs. If no split was made the same compoundObject is returned.
|
||||
func splitCompoundObject(co compoundObject, toFuture compoundObjectToFuture) compoundObject {
|
||||
offsets := []uint64{}
|
||||
futures := []Future{}
|
||||
|
||||
startIndex := uint64(0)
|
||||
h := buzhash.NewBuzHash(objectWindowSize)
|
||||
|
||||
for i := 0; i < len(co.offsets); i++ {
|
||||
future := co.futures[i]
|
||||
digest := future.Ref().Digest()
|
||||
_, err := h.Write(digest[:])
|
||||
d.Chk.NoError(err)
|
||||
if h.Sum32()&objectPattern == objectPattern {
|
||||
h = buzhash.NewBuzHash(objectWindowSize)
|
||||
future := makeSubObject(co, startIndex, uint64(i)+1, toFuture)
|
||||
startIndex = uint64(i) + 1
|
||||
offsets = append(offsets, co.offsets[i])
|
||||
futures = append(futures, future)
|
||||
}
|
||||
}
|
||||
|
||||
// No split, use original.
|
||||
if startIndex == 0 {
|
||||
return co
|
||||
}
|
||||
|
||||
// Add remaining.
|
||||
if startIndex != uint64(len(co.offsets)) {
|
||||
future := makeSubObject(co, startIndex, uint64(len(co.offsets)), toFuture)
|
||||
offsets = append(offsets, co.offsets[len(co.offsets)-1])
|
||||
futures = append(futures, future)
|
||||
}
|
||||
|
||||
// Single chunk, use original.
|
||||
if len(offsets) == 1 {
|
||||
return co
|
||||
}
|
||||
|
||||
// It is possible that the splitting the object produces the exact same
|
||||
// compound object.
|
||||
if len(offsets) == len(co.offsets) {
|
||||
return co
|
||||
}
|
||||
|
||||
// Split again.
|
||||
return splitCompoundObject(compoundObject{offsets, futures, &ref.Ref{}, co.cs}, toFuture)
|
||||
}
|
||||
|
||||
func makeSubObject(co compoundObject, startIndex, endIndex uint64, toFuture compoundObjectToFuture) Future {
|
||||
d.Chk.True(endIndex-startIndex > 0)
|
||||
if endIndex-startIndex == 1 {
|
||||
return co.futures[startIndex]
|
||||
}
|
||||
|
||||
futures := make([]Future, endIndex-startIndex)
|
||||
copy(futures, co.futures[startIndex:endIndex])
|
||||
offsets := make([]uint64, endIndex-startIndex)
|
||||
startOffset := uint64(0)
|
||||
if startIndex > 0 {
|
||||
startOffset = co.offsets[startIndex-1]
|
||||
}
|
||||
for i := startIndex; i < endIndex; i++ {
|
||||
offsets[i-startIndex] = co.offsets[i] - startOffset
|
||||
}
|
||||
return toFuture(compoundObject{offsets, futures, &ref.Ref{}, co.cs})
|
||||
}
|
||||
@@ -5,7 +5,6 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/attic-labs/noms/Godeps/_workspace/src/github.com/stretchr/testify/assert"
|
||||
"github.com/attic-labs/noms/ref"
|
||||
)
|
||||
|
||||
func TestPrimitiveEquals(t *testing.T) {
|
||||
@@ -59,7 +58,7 @@ func TestPrimitiveEquals(t *testing.T) {
|
||||
func() Value {
|
||||
b1, _ := NewBlob(bytes.NewBufferString("hi"))
|
||||
b2, _ := NewBlob(bytes.NewBufferString("bye"))
|
||||
return compoundBlob{[]uint64{2, 5}, []Future{futureFromValue(b1), futureFromValue(b2)}, &ref.Ref{}, nil}
|
||||
return newCompoundBlob([]uint64{2, 5}, []Future{futureFromValue(b1), futureFromValue(b2)}, nil)
|
||||
},
|
||||
func() Value { return NewList() },
|
||||
func() Value { return NewList(NewString("foo")) },
|
||||
|
||||
@@ -42,7 +42,7 @@ func TestEnsureRef(t *testing.T) {
|
||||
}()
|
||||
|
||||
bl := newBlobLeaf([]byte("hi"))
|
||||
cb := compoundBlob{[]uint64{2}, []Future{futureFromValue(bl)}, &ref.Ref{}, cs}
|
||||
cb := newCompoundBlob([]uint64{2}, []Future{futureFromValue(bl)}, cs)
|
||||
|
||||
values := []Value{
|
||||
newBlobLeaf([]byte{}),
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@ type List interface {
|
||||
}
|
||||
|
||||
func NewList(v ...Value) List {
|
||||
return newCompoundList(v, nil)
|
||||
return newCompoundListFromValues(v, nil)
|
||||
}
|
||||
|
||||
func valuesToFutures(list []Value) []Future {
|
||||
|
||||
@@ -10,7 +10,7 @@ func newListIterator(l List) listIterator {
|
||||
case listLeaf:
|
||||
return &listLeafIterator{l, 0}
|
||||
case compoundList:
|
||||
return &compoundListIterator{l, newListIterator(l.lists[0].Deref(l.cs).(List)), 0}
|
||||
return &compoundListIterator{l, newListIterator(l.futures[0].Deref(l.cs).(List)), 0}
|
||||
}
|
||||
panic("Unreachable")
|
||||
}
|
||||
@@ -24,7 +24,7 @@ func newListIteratorAt(l List, idx uint64) listIterator {
|
||||
if si > 0 {
|
||||
idx -= l.offsets[si-1]
|
||||
}
|
||||
return &compoundListIterator{l, newListIteratorAt(l.lists[si].Deref(l.cs).(List), idx), uint64(si)}
|
||||
return &compoundListIterator{l, newListIteratorAt(l.futures[si].Deref(l.cs).(List), idx), uint64(si)}
|
||||
}
|
||||
panic("Unreachable")
|
||||
}
|
||||
@@ -55,9 +55,9 @@ type compoundListIterator struct {
|
||||
|
||||
func (it *compoundListIterator) next() (f Future, done bool) {
|
||||
f, done = it.it.next()
|
||||
if done && it.si < uint64(len(it.list.lists))-1 {
|
||||
if done && it.si < uint64(len(it.list.futures))-1 {
|
||||
it.si++
|
||||
it.it = newListIterator(it.list.lists[it.si].Deref(it.list.cs).(List))
|
||||
it.it = newListIterator(it.list.futures[it.si].Deref(it.list.cs).(List))
|
||||
f, done = it.it.next()
|
||||
}
|
||||
return
|
||||
|
||||
+2
-2
@@ -68,14 +68,14 @@ func fromEncodeable(i interface{}, cs chunks.ChunkSource) Future {
|
||||
for idx, blobRef := range i.Blobs {
|
||||
blobs[idx] = fromEncodeable(blobRef, cs)
|
||||
}
|
||||
cb := compoundBlob{i.Offsets, blobs, &ref.Ref{}, cs}
|
||||
cb := newCompoundBlob(i.Offsets, blobs, cs)
|
||||
return futureFromValue(cb)
|
||||
case enc.CompoundList:
|
||||
lists := make([]Future, len(i.Lists))
|
||||
for idx, listRef := range i.Lists {
|
||||
lists[idx] = fromEncodeable(listRef, cs)
|
||||
}
|
||||
cl := compoundList{i.Offsets, lists, &ref.Ref{}, cs}
|
||||
cl := newCompoundList(i.Offsets, lists, cs)
|
||||
return futureFromValue(cl)
|
||||
default:
|
||||
d.Exp.Fail(fmt.Sprintf("Unknown encodeable", "%+v", i))
|
||||
|
||||
+1
-1
@@ -38,7 +38,7 @@ func doTreeWalk2(f Future, cs chunks.ChunkSource, cb SomeCallback, skip bool) {
|
||||
|
||||
switch v := v.(type) {
|
||||
case compoundList:
|
||||
for _, f := range v.lists {
|
||||
for _, f := range v.futures {
|
||||
doTreeWalk2(f, cs, cb, true)
|
||||
}
|
||||
case listLeaf:
|
||||
|
||||
@@ -46,8 +46,8 @@ func toEncodeable(v Value, cs chunks.ChunkSink) interface{} {
|
||||
}
|
||||
|
||||
func encCompoundBlobFromCompoundBlob(cb compoundBlob, cs chunks.ChunkSink) interface{} {
|
||||
refs := make([]ref.Ref, len(cb.blobs))
|
||||
for idx, f := range cb.blobs {
|
||||
refs := make([]ref.Ref, len(cb.futures))
|
||||
for idx, f := range cb.futures {
|
||||
i := processChild(f, cs)
|
||||
// All children of compoundBlob must be Blobs, which get encoded and reffed by processChild.
|
||||
refs[idx] = i.(ref.Ref)
|
||||
@@ -56,8 +56,8 @@ func encCompoundBlobFromCompoundBlob(cb compoundBlob, cs chunks.ChunkSink) inter
|
||||
}
|
||||
|
||||
func encCompoundListFromCompoundList(cl compoundList, cs chunks.ChunkSink) interface{} {
|
||||
refs := make([]ref.Ref, len(cl.lists))
|
||||
for idx, f := range cl.lists {
|
||||
refs := make([]ref.Ref, len(cl.futures))
|
||||
for idx, f := range cl.futures {
|
||||
i := processChild(f, cs)
|
||||
// All children of compoundList must be Lists, which get encoded and reffed by processChild.
|
||||
refs[idx] = i.(ref.Ref)
|
||||
|
||||
Reference in New Issue
Block a user