Introduce List.Concat to the Go API (#2550)

It exploits the chunked structure of Lists to allow concatenating
arbitrarily large Lists.

I've added both functional and perf tests for Concat, and the perf tests
only made sense with tests for building and reading Lists, so now we
have those too.
This commit is contained in:
Ben Kalman
2016-09-14 14:15:14 -07:00
committed by GitHub
parent 55caa0f519
commit da336c3aab
9 changed files with 248 additions and 31 deletions

View File

@@ -83,6 +83,7 @@ import (
"time"
"github.com/attic-labs/noms/go/chunks"
"github.com/attic-labs/noms/go/d"
"github.com/attic-labs/noms/go/datas"
"github.com/attic-labs/noms/go/dataset"
"github.com/attic-labs/noms/go/marshal"
@@ -339,16 +340,14 @@ func (suite *PerfSuite) Pause(fn func()) {
suite.paused += time.Since(start)
}
func callSafe(name string, fun reflect.Value, args ...interface{}) (err interface{}) {
defer func() {
err = recover()
}()
func callSafe(name string, fun reflect.Value, args ...interface{}) error {
funArgs := make([]reflect.Value, len(args))
for i, arg := range args {
funArgs[i] = reflect.ValueOf(arg)
}
fun.Call(funArgs)
return
return d.Try(func() {
fun.Call(funArgs)
})
}
func (suite *PerfSuite) getEnvironment() types.Value {

View File

@@ -59,22 +59,6 @@ func (ims indexedMetaSequence) getCompareFn(other sequence) compareFn {
}
}
// newCursorAtIndex returns a sequenceCursor positioned at leaf index idx of
// seq, building the cursor chain one tree level per loop iteration.
func newCursorAtIndex(seq indexedSequence, idx uint64) *sequenceCursor {
	var cur *sequenceCursor
	for {
		// Each iteration wraps the previous level's cursor as the parent of
		// the cursor for the current (deeper) level.
		cur = newSequenceCursor(cur, seq, 0)
		// advanceCursorToOffset positions cur within this level; the value it
		// returns is subtracted so the remaining idx is resolved within the
		// child sequence (presumably the leaf count preceding the chosen
		// entry — confirm against advanceCursorToOffset).
		idx = idx - advanceCursorToOffset(cur, idx)
		cs := cur.getChildSequence()
		if cs == nil {
			// Reached a leaf sequence; cur now points at the target element.
			break
		}
		seq = cs.(indexedSequence)
	}
	d.PanicIfFalse(cur != nil)
	return cur
}
func advanceCursorToOffset(cur *sequenceCursor, idx uint64) uint64 {
seq := cur.seq.(indexedSequence)
cur.idx = sort.Search(seq.seqLen(), func(i int) bool {

View File

@@ -91,11 +91,12 @@ func (l List) Hash() hash.Hash {
return *l.h
}
func (l List) ChildValues() (values []Value) {
// ChildValues returns every element of the list as a slice of Values, in
// list order.
func (l List) ChildValues() []Value {
	vals := make([]Value, l.Len())
	l.IterAll(func(v Value, i uint64) {
		vals[i] = v
	})
	return vals
}
func (l List) Chunks() []Ref {
@@ -161,7 +162,7 @@ func (l List) Splice(idx uint64, deleteCount uint64, vs ...Value) List {
d.PanicIfFalse(idx+deleteCount <= l.Len())
cur := newCursorAtIndex(l.seq, idx)
ch := newSequenceChunker(cur, l.seq.valueReader(), nil, makeListLeafChunkFn(l.seq.valueReader()), newIndexedMetaSequenceChunkFn(ListKind, l.seq.valueReader()), hashValueBytes)
ch := l.newChunker(cur, l.seq.valueReader())
for deleteCount > 0 {
ch.Skip()
deleteCount--
@@ -178,6 +179,16 @@ func (l List) Insert(idx uint64, vs ...Value) List {
return l.Splice(idx, 0, vs...)
}
// Concat returns a new list consisting of this list followed by other. It
// only needs to visit the rightmost prolly tree chunks of this list and the
// leftmost prolly tree chunks of other, so arbitrarily large lists can be
// concatenated.
func (l List) Concat(other List) List {
	mkChunker := func(cur *sequenceCursor, vr ValueReader) *sequenceChunker {
		return l.newChunker(cur, vr)
	}
	merged := concat(l.seq, other.seq, mkChunker)
	return newList(merged.(indexedSequence))
}
// Remove returns a new list where the items at index start (inclusive) through end (exclusive) have
// been removed. This panics if end is smaller than start.
func (l List) Remove(start uint64, end uint64) List {
@@ -262,6 +273,10 @@ func (l List) DiffWithLimit(last List, changes chan<- Splice, closeChan <-chan s
indexedSequenceDiff(last.seq, lastCur.depth(), 0, l.seq, lCur.depth(), 0, changes, closeChan, maxSpliceMatrixSize)
}
// newChunker constructs a sequenceChunker configured for List sequences:
// list leaf chunks at the bottom level and indexed meta sequence chunks of
// ListKind above, resuming chunking from cur and reading values through vr.
func (l List) newChunker(cur *sequenceCursor, vr ValueReader) *sequenceChunker {
	return newSequenceChunker(cur, vr, nil, makeListLeafChunkFn(vr), newIndexedMetaSequenceChunkFn(ListKind, vr), hashValueBytes)
}
// If |sink| is not nil, chunks will be eagerly written as they're created. Otherwise they are
// written when the root is written.
func makeListLeafChunkFn(vr ValueReader) makeChunkFn {

View File

@@ -1060,3 +1060,50 @@ func TestListRemoveLastWhenNotLoaded(t *testing.T) {
assert.True(tl.toList().Equals(nl))
}
}
// TestListConcat exercises List.Concat by generating a random list, splitting
// it at many points, and checking that concatenating the reloaded halves
// reproduces the whole list.
func TestListConcat(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}
	assert := assert.New(t)

	smallTestChunks()
	defer normalProductionChunks()

	vs := NewTestValueStore()

	// reload round-trips l through the value store so the result is read back
	// in chunked form rather than held entirely in memory.
	reload := func(vs *ValueStore, l List) List {
		return vs.ReadValue(vs.WriteValue(l).TargetHash()).(List)
	}

	run := func(seed int64, size, from, to, by int) {
		r := rand.New(rand.NewSource(seed))
		elems := make(testList, size)
		for i := range elems {
			elems[i] = Number(r.Intn(size))
		}
		whole := elems.toList()
		for i := from; i < to; i += by {
			left := reload(vs, elems[:i].toList())
			right := reload(vs, elems[i:].toList())
			got := left.Concat(right)
			assert.True(whole.Equals(got),
				"fail at %d/%d (with expected length %d, actual %d)", i, size, whole.Len(), got.Len())
		}
	}

	run(0, 10, 0, 10, 1)
	run(1, 100, 0, 100, 1)
	run(2, 1000, 0, 1000, 10)
	run(3, 1000, 0, 100, 1)
	run(4, 1000, 900, 1000, 1)
	run(5, 1e4, 0, 1e4, 100)
	run(6, 1e4, 0, 1000, 10)
	run(7, 1e4, 1e4-1000, 1e4, 10)
}

7
go/types/perf/dummy.go Normal file
View File

@@ -0,0 +1,7 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package perf
// go build fails if a directory contains _test.go files but no other Go files.

112
go/types/perf/perf_test.go Normal file
View File

@@ -0,0 +1,112 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package perf
import (
"math/rand"
"testing"
"github.com/attic-labs/noms/go/dataset"
"github.com/attic-labs/noms/go/perf/suite"
"github.com/attic-labs/noms/go/types"
)
// perfSuite benchmarks building, reading, and concatenating large Lists.
// It embeds suite.PerfSuite, which supplies the Database and assertion
// helpers used by the Test* methods.
type perfSuite struct {
	suite.PerfSuite
	// r is the deterministic source of test data, seeded in SetupSuite.
	r *rand.Rand
	// ds appears unused in this file — NOTE(review): confirm before removing.
	ds string
}
// SetupSuite seeds the suite's random source with a fixed seed so every run
// generates identical data, keeping perf measurements comparable across runs.
func (s *perfSuite) SetupSuite() {
	s.r = rand.New(rand.NewSource(0))
}
// Test01BuildList10mNumbers measures building and committing a list of 10
// million random Numbers via the streaming list API.
func (s *perfSuite) Test01BuildList10mNumbers() {
	assert := s.NewAssert()

	ch := make(chan types.Value, 16)
	listOut := types.NewStreamingList(s.Database, ch)
	for i := 0; i < 1e7; i++ {
		ch <- types.Number(s.r.Int63())
	}
	close(ch)

	ds := dataset.NewDataset(s.Database, "BuildList10mNumbers")
	ds, err := ds.CommitValue(<-listOut)
	assert.NoError(err)
	s.Database = ds.Database()
}
// Test02BuildList10mStructs measures building and committing a list of 10
// million single-field structs via the streaming list API.
func (s *perfSuite) Test02BuildList10mStructs() {
	assert := s.NewAssert()

	ch := make(chan types.Value, 16)
	listOut := types.NewStreamingList(s.Database, ch)
	for i := 0; i < 1e7; i++ {
		ch <- types.NewStruct("", types.StructData{
			"number": types.Number(s.r.Int63()),
		})
	}
	close(ch)

	ds := dataset.NewDataset(s.Database, "BuildList10mStructs")
	ds, err := ds.CommitValue(<-listOut)
	assert.NoError(err)
	s.Database = ds.Database()
}
// Test03Read10mNumbers measures iterating every element of the list committed
// by Test01BuildList10mNumbers.
func (s *perfSuite) Test03Read10mNumbers() {
	l := s.headList("BuildList10mNumbers")
	l.IterAll(func(v types.Value, index uint64) {})
}
// Test04Read10mStructs measures iterating every element of the list committed
// by Test02BuildList10mStructs.
func (s *perfSuite) Test04Read10mStructs() {
	l := s.headList("BuildList10mStructs")
	l.IterAll(func(v types.Value, index uint64) {})
}
// Test05Concat10mValues2kTimes measures repeated Concat calls: each of 1000
// iterations appends both 10m-element lists built by Test01 and Test02 (2000
// concat operations total), sanity-checking the result's length and last
// element after every concat.
func (s *perfSuite) Test05Concat10mValues2kTimes() {
	assert := s.NewAssert()

	// last returns the final element of v.
	last := func(v types.List) types.Value {
		return v.Get(v.Len() - 1)
	}

	l1 := s.headList("BuildList10mNumbers")
	l2 := s.headList("BuildList10mStructs")
	// Capture lengths and last elements up front so the loop's assertions
	// don't repeatedly traverse the big lists.
	l1Len, l2Len := l1.Len(), l2.Len()
	l1Last, l2Last := last(l1), last(l2)

	l3 := types.NewList()
	for i := uint64(0); i < 1e3; i++ { // 1k iterations * 2 concat ops = 2k times
		// Include some basic sanity checks.
		l3 = l3.Concat(l1)
		assert.True(l1Last.Equals(last(l3)))
		// After this iteration's first concat, l3 holds i full (l1+l2) rounds
		// plus one more l1.
		assert.Equal(i*(l1Len+l2Len)+l1Len, l3.Len())
		l3 = l3.Concat(l2)
		assert.True(l2Last.Equals(last(l3)))
		assert.Equal((i+1)*(l1Len+l2Len), l3.Len())
	}

	ds := dataset.NewDataset(s.Database, "Concat10mValues2kTimes")
	var err error
	ds, err = ds.CommitValue(l3)
	assert.NoError(err)
	s.Database = ds.Database()
}
// headList returns the List value at the head of the named dataset.
func (s *perfSuite) headList(dsName string) types.List {
	return dataset.NewDataset(s.Database, dsName).HeadValue().(types.List)
}
func TestPerf(t *testing.T) {
suite.Run("types", t, &perfSuite{})
}

View File

@@ -0,0 +1,40 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package types
import "github.com/attic-labs/noms/go/d"
// newSequenceChunkerFn constructs a sequenceChunker that resumes chunking at
// cur, reading values through vr. It keeps concat agnostic of the concrete
// sequence kind being concatenated.
type newSequenceChunkerFn func(cur *sequenceCursor, vr ValueReader) *sequenceChunker

// concat returns the sequence formed by fst followed by snd. It only needs to
// visit the rightmost chunks of fst and the leftmost chunks of snd, so cost
// scales with tree depth rather than total sequence length.
func concat(fst, snd sequence, newSequenceChunker newSequenceChunkerFn) sequence {
	// Concatenation with an empty sequence is the identity.
	if fst.numLeaves() == 0 {
		return snd
	}
	if snd.numLeaves() == 0 {
		return fst
	}

	// concat works by tricking the sequenceChunker into resuming chunking at a
	// cursor to the end of fst, then finalizing chunking to the start of snd - by
	// swapping fst cursors for snd cursors in the middle of chunking.
	vr := fst.valueReader()
	d.PanicIfTrue(vr != snd.valueReader(), "cannot concat sequences from different databases")
	chunker := newSequenceChunker(newCursorAtIndex(fst, fst.numLeaves()), vr)

	// Walk up the levels of snd's start cursor, installing each level's cursor
	// into the corresponding chunker level.
	for cur, ch := newCursorAtIndex(snd, 0), chunker; cur != nil; cur = cur.parent {
		// If fst is shallower than snd, its cur will have a parent whereas the
		// chunker to snd won't. In that case, create a parent for fst.
		// Note that if the inverse is true - snd is shallower than fst - this just
		// means higher chunker levels will still have cursors from fst... which
		// point to the end, so finalisation won't do anything. This is correct.
		if ch.parent == nil {
			ch.createParent()
		}
		ch.cur = cur.clone()
		ch = ch.parent
	}

	return chunker.Done()
}

View File

@@ -125,3 +125,19 @@ func (cur *sequenceCursor) iter(cb cursorIterCallback) {
cur.advance()
}
}
// newCursorAtIndex returns a sequenceCursor positioned at leaf index idx of
// seq, descending one tree level per iteration until it reaches a sequence
// with no children.
func newCursorAtIndex(seq sequence, idx uint64) *sequenceCursor {
	var cur *sequenceCursor
	for {
		cur = newSequenceCursor(cur, seq, 0)
		idx -= advanceCursorToOffset(cur, idx)
		child := cur.getChildSequence()
		if child == nil {
			break
		}
		seq = child
	}
	d.PanicIfTrue(cur == nil)
	return cur
}

View File

@@ -34,17 +34,15 @@ func intsToValueSlice(ints ...int) ValueSlice {
}
// generateNumbersAsValues returns n generated number values as a []Value —
// presumably Numbers for 0..n-1 via generateNumbersAsValuesFromToBy(0, n, 1);
// confirm against that helper's body. Panics if n is not positive.
func generateNumbersAsValues(n int) []Value {
	d.Chk.True(n > 0, "must be an integer greater than zero")
	return generateNumbersAsValuesFromToBy(0, n, 1)
}
// generateNumbersAsValueSlice returns n generated number values as a
// ValueSlice; identical to generateNumbersAsValues except for the declared
// return type. Panics if n is not positive.
func generateNumbersAsValueSlice(n int) ValueSlice {
	d.Chk.True(n > 0, "must be an integer greater than zero")
	return generateNumbersAsValuesFromToBy(0, n, 1)
}
func generateNumbersAsValuesFromToBy(from, to, by int) ValueSlice {
d.Chk.True(to > from, "to must be greater than from")
d.Chk.True(to >= from, "to must be greater than or equal to from")
d.Chk.True(by > 0, "must be an integer greater than zero")
nums := []Value{}
for i := from; i < to; i += by {
@@ -58,7 +56,7 @@ func generateNumbersAsStructs(n int) ValueSlice {
}
func generateNumbersAsStructsFromToBy(from, to, by int) ValueSlice {
d.Chk.True(to > from, "to must be greater than from")
d.Chk.True(to >= from, "to must be greater than or equal to from")
d.Chk.True(by > 0, "must be an integer greater than zero")
nums := []Value{}
for i := from; i < to; i += by {
@@ -68,7 +66,6 @@ func generateNumbersAsStructsFromToBy(from, to, by int) ValueSlice {
}
func generateNumbersAsRefOfStructs(n int) []Value {
d.Chk.True(n > 0, "must be an integer greater than zero")
vs := NewTestValueStore()
nums := []Value{}
for i := 0; i < n; i++ {