Introduce List.Concat to the Go API (#2550)

It exploits the chunked structure of Lists to allow concatenating
arbitrarily large Lists.

I've added both functional and perf tests for Concat, and the perf tests
only made sense with tests for building and reading Lists, so now we
have those too.
This commit is contained in:
Ben Kalman
2016-09-14 14:15:14 -07:00
committed by GitHub
parent 55caa0f519
commit da336c3aab
9 changed files with 248 additions and 31 deletions

View File

@@ -83,6 +83,7 @@ import (
"time"
"github.com/attic-labs/noms/go/chunks"
"github.com/attic-labs/noms/go/d"
"github.com/attic-labs/noms/go/datas"
"github.com/attic-labs/noms/go/dataset"
"github.com/attic-labs/noms/go/marshal"
@@ -339,16 +340,14 @@ func (suite *PerfSuite) Pause(fn func()) {
suite.paused += time.Since(start)
}
func callSafe(name string, fun reflect.Value, args ...interface{}) (err interface{}) {
defer func() {
err = recover()
}()
func callSafe(name string, fun reflect.Value, args ...interface{}) error {
funArgs := make([]reflect.Value, len(args))
for i, arg := range args {
funArgs[i] = reflect.ValueOf(arg)
}
fun.Call(funArgs)
return
return d.Try(func() {
fun.Call(funArgs)
})
}
func (suite *PerfSuite) getEnvironment() types.Value {

View File

@@ -59,22 +59,6 @@ func (ims indexedMetaSequence) getCompareFn(other sequence) compareFn {
}
}
// newCursorAtIndex returns a sequenceCursor positioned at leaf index idx of
// seq, building the cursor chain one tree level per loop iteration.
func newCursorAtIndex(seq indexedSequence, idx uint64) *sequenceCursor {
	var cur *sequenceCursor
	for {
		// Each iteration wraps the previous level's cursor as the parent of
		// the cursor for the current (deeper) level.
		cur = newSequenceCursor(cur, seq, 0)
		// advanceCursorToOffset positions cur within this level; the value it
		// returns is subtracted so the remaining idx is resolved within the
		// child sequence (presumably the leaf count preceding the chosen
		// entry — confirm against advanceCursorToOffset).
		idx = idx - advanceCursorToOffset(cur, idx)
		cs := cur.getChildSequence()
		if cs == nil {
			// Reached a leaf sequence; cur now points at the target element.
			break
		}
		seq = cs.(indexedSequence)
	}
	d.PanicIfFalse(cur != nil)
	return cur
}
func advanceCursorToOffset(cur *sequenceCursor, idx uint64) uint64 {
seq := cur.seq.(indexedSequence)
cur.idx = sort.Search(seq.seqLen(), func(i int) bool {

View File

@@ -91,11 +91,12 @@ func (l List) Hash() hash.Hash {
return *l.h
}
func (l List) ChildValues() (values []Value) {
// ChildValues returns every element of the list as a slice of Values, in
// list order.
func (l List) ChildValues() []Value {
	vals := make([]Value, l.Len())
	l.IterAll(func(v Value, i uint64) {
		vals[i] = v
	})
	return vals
}
func (l List) Chunks() []Ref {
@@ -161,7 +162,7 @@ func (l List) Splice(idx uint64, deleteCount uint64, vs ...Value) List {
d.PanicIfFalse(idx+deleteCount <= l.Len())
cur := newCursorAtIndex(l.seq, idx)
ch := newSequenceChunker(cur, l.seq.valueReader(), nil, makeListLeafChunkFn(l.seq.valueReader()), newIndexedMetaSequenceChunkFn(ListKind, l.seq.valueReader()), hashValueBytes)
ch := l.newChunker(cur, l.seq.valueReader())
for deleteCount > 0 {
ch.Skip()
deleteCount--
@@ -178,6 +179,16 @@ func (l List) Insert(idx uint64, vs ...Value) List {
return l.Splice(idx, 0, vs...)
}
// Concat returns a new list consisting of this list followed by other. It
// only needs to visit the rightmost prolly tree chunks of this list and the
// leftmost prolly tree chunks of other, so arbitrarily large lists can be
// concatenated.
func (l List) Concat(other List) List {
	mkChunker := func(cur *sequenceCursor, vr ValueReader) *sequenceChunker {
		return l.newChunker(cur, vr)
	}
	merged := concat(l.seq, other.seq, mkChunker)
	return newList(merged.(indexedSequence))
}
// Remove returns a new list where the items at index start (inclusive) through end (exclusive) have
// been removed. This panics if end is smaller than start.
func (l List) Remove(start uint64, end uint64) List {
@@ -262,6 +273,10 @@ func (l List) DiffWithLimit(last List, changes chan<- Splice, closeChan <-chan s
indexedSequenceDiff(last.seq, lastCur.depth(), 0, l.seq, lCur.depth(), 0, changes, closeChan, maxSpliceMatrixSize)
}
// newChunker constructs a sequenceChunker configured for List sequences:
// list leaf chunks at the bottom level and indexed meta sequence chunks of
// ListKind above, resuming chunking from cur and reading values through vr.
func (l List) newChunker(cur *sequenceCursor, vr ValueReader) *sequenceChunker {
	return newSequenceChunker(cur, vr, nil, makeListLeafChunkFn(vr), newIndexedMetaSequenceChunkFn(ListKind, vr), hashValueBytes)
}
// If |sink| is not nil, chunks will be eagerly written as they're created. Otherwise they are
// written when the root is written.
func makeListLeafChunkFn(vr ValueReader) makeChunkFn {

View File

@@ -1060,3 +1060,50 @@ func TestListRemoveLastWhenNotLoaded(t *testing.T) {
assert.True(tl.toList().Equals(nl))
}
}
// TestListConcat exercises List.Concat by generating a random list, splitting
// it at many points, and checking that concatenating the reloaded halves
// reproduces the whole list.
func TestListConcat(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}
	assert := assert.New(t)

	smallTestChunks()
	defer normalProductionChunks()

	vs := NewTestValueStore()

	// reload round-trips l through the value store so the result is read back
	// in chunked form rather than held entirely in memory.
	reload := func(vs *ValueStore, l List) List {
		return vs.ReadValue(vs.WriteValue(l).TargetHash()).(List)
	}

	run := func(seed int64, size, from, to, by int) {
		r := rand.New(rand.NewSource(seed))
		elems := make(testList, size)
		for i := range elems {
			elems[i] = Number(r.Intn(size))
		}
		whole := elems.toList()
		for i := from; i < to; i += by {
			left := reload(vs, elems[:i].toList())
			right := reload(vs, elems[i:].toList())
			got := left.Concat(right)
			assert.True(whole.Equals(got),
				"fail at %d/%d (with expected length %d, actual %d)", i, size, whole.Len(), got.Len())
		}
	}

	run(0, 10, 0, 10, 1)
	run(1, 100, 0, 100, 1)
	run(2, 1000, 0, 1000, 10)
	run(3, 1000, 0, 100, 1)
	run(4, 1000, 900, 1000, 1)
	run(5, 1e4, 0, 1e4, 100)
	run(6, 1e4, 0, 1000, 10)
	run(7, 1e4, 1e4-1000, 1e4, 10)
}

7
go/types/perf/dummy.go Normal file
View File

@@ -0,0 +1,7 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package perf
// go build fails if a directory contains _test.go files but no other Go files.

112
go/types/perf/perf_test.go Normal file
View File

@@ -0,0 +1,112 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package perf
import (
"math/rand"
"testing"
"github.com/attic-labs/noms/go/dataset"
"github.com/attic-labs/noms/go/perf/suite"
"github.com/attic-labs/noms/go/types"
)
// perfSuite benchmarks building, reading, and concatenating large Lists.
// It embeds suite.PerfSuite, which supplies the Database and assertion
// helpers used by the Test* methods.
type perfSuite struct {
	suite.PerfSuite
	// r is the deterministic source of test data, seeded in SetupSuite.
	r *rand.Rand
	// ds appears unused in this file — NOTE(review): confirm before removing.
	ds string
}
// SetupSuite seeds the suite's random source with a fixed seed so every run
// generates identical data, keeping perf measurements comparable across runs.
func (s *perfSuite) SetupSuite() {
	s.r = rand.New(rand.NewSource(0))
}
// Test01BuildList10mNumbers measures building and committing a list of 10
// million random Numbers via the streaming list API.
func (s *perfSuite) Test01BuildList10mNumbers() {
	assert := s.NewAssert()

	ch := make(chan types.Value, 16)
	listOut := types.NewStreamingList(s.Database, ch)
	for i := 0; i < 1e7; i++ {
		ch <- types.Number(s.r.Int63())
	}
	close(ch)

	ds := dataset.NewDataset(s.Database, "BuildList10mNumbers")
	ds, err := ds.CommitValue(<-listOut)
	assert.NoError(err)
	s.Database = ds.Database()
}
// Test02BuildList10mStructs measures building and committing a list of 10
// million single-field structs via the streaming list API.
func (s *perfSuite) Test02BuildList10mStructs() {
	assert := s.NewAssert()

	ch := make(chan types.Value, 16)
	listOut := types.NewStreamingList(s.Database, ch)
	for i := 0; i < 1e7; i++ {
		ch <- types.NewStruct("", types.StructData{
			"number": types.Number(s.r.Int63()),
		})
	}
	close(ch)

	ds := dataset.NewDataset(s.Database, "BuildList10mStructs")
	ds, err := ds.CommitValue(<-listOut)
	assert.NoError(err)
	s.Database = ds.Database()
}
// Test03Read10mNumbers measures iterating every element of the list committed
// by Test01BuildList10mNumbers.
func (s *perfSuite) Test03Read10mNumbers() {
	l := s.headList("BuildList10mNumbers")
	l.IterAll(func(v types.Value, index uint64) {})
}
// Test04Read10mStructs measures iterating every element of the list committed
// by Test02BuildList10mStructs.
func (s *perfSuite) Test04Read10mStructs() {
	l := s.headList("BuildList10mStructs")
	l.IterAll(func(v types.Value, index uint64) {})
}
// Test05Concat10mValues2kTimes measures repeated Concat calls: each of 1000
// iterations appends both 10m-element lists built by Test01 and Test02 (2000
// concat operations total), sanity-checking the result's length and last
// element after every concat.
func (s *perfSuite) Test05Concat10mValues2kTimes() {
	assert := s.NewAssert()

	// last returns the final element of v.
	last := func(v types.List) types.Value {
		return v.Get(v.Len() - 1)
	}

	l1 := s.headList("BuildList10mNumbers")
	l2 := s.headList("BuildList10mStructs")
	// Capture lengths and last elements up front so the loop's assertions
	// don't repeatedly traverse the big lists.
	l1Len, l2Len := l1.Len(), l2.Len()
	l1Last, l2Last := last(l1), last(l2)

	l3 := types.NewList()
	for i := uint64(0); i < 1e3; i++ { // 1k iterations * 2 concat ops = 2k times
		// Include some basic sanity checks.
		l3 = l3.Concat(l1)
		assert.True(l1Last.Equals(last(l3)))
		// After this iteration's first concat, l3 holds i full (l1+l2) rounds
		// plus one more l1.
		assert.Equal(i*(l1Len+l2Len)+l1Len, l3.Len())
		l3 = l3.Concat(l2)
		assert.True(l2Last.Equals(last(l3)))
		assert.Equal((i+1)*(l1Len+l2Len), l3.Len())
	}

	ds := dataset.NewDataset(s.Database, "Concat10mValues2kTimes")
	var err error
	ds, err = ds.CommitValue(l3)
	assert.NoError(err)
	s.Database = ds.Database()
}
// headList returns the List value at the head of the named dataset.
func (s *perfSuite) headList(dsName string) types.List {
	return dataset.NewDataset(s.Database, dsName).HeadValue().(types.List)
}
func TestPerf(t *testing.T) {
suite.Run("types", t, &perfSuite{})
}

View File

@@ -0,0 +1,40 @@
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
package types
import "github.com/attic-labs/noms/go/d"
// newSequenceChunkerFn constructs a sequenceChunker that resumes chunking at
// cur, reading values through vr. It keeps concat agnostic of the concrete
// sequence kind being concatenated.
type newSequenceChunkerFn func(cur *sequenceCursor, vr ValueReader) *sequenceChunker

// concat returns the sequence formed by fst followed by snd. It only needs to
// visit the rightmost chunks of fst and the leftmost chunks of snd, so cost
// scales with tree depth rather than total sequence length.
func concat(fst, snd sequence, newSequenceChunker newSequenceChunkerFn) sequence {
	// Concatenation with an empty sequence is the identity.
	if fst.numLeaves() == 0 {
		return snd
	}
	if snd.numLeaves() == 0 {
		return fst
	}

	// concat works by tricking the sequenceChunker into resuming chunking at a
	// cursor to the end of fst, then finalizing chunking to the start of snd - by
	// swapping fst cursors for snd cursors in the middle of chunking.
	vr := fst.valueReader()
	d.PanicIfTrue(vr != snd.valueReader(), "cannot concat sequences from different databases")
	chunker := newSequenceChunker(newCursorAtIndex(fst, fst.numLeaves()), vr)

	// Walk up the levels of snd's start cursor, installing each level's cursor
	// into the corresponding chunker level.
	for cur, ch := newCursorAtIndex(snd, 0), chunker; cur != nil; cur = cur.parent {
		// If fst is shallower than snd, its cur will have a parent whereas the
		// chunker to snd won't. In that case, create a parent for fst.
		// Note that if the inverse is true - snd is shallower than fst - this just
		// means higher chunker levels will still have cursors from fst... which
		// point to the end, so finalisation won't do anything. This is correct.
		if ch.parent == nil {
			ch.createParent()
		}
		ch.cur = cur.clone()
		ch = ch.parent
	}

	return chunker.Done()
}

View File

@@ -125,3 +125,19 @@ func (cur *sequenceCursor) iter(cb cursorIterCallback) {
cur.advance()
}
}
// newCursorAtIndex returns a sequenceCursor positioned at leaf index idx of
// seq, descending one tree level per iteration until it reaches a sequence
// with no children.
func newCursorAtIndex(seq sequence, idx uint64) *sequenceCursor {
	var cur *sequenceCursor
	for {
		cur = newSequenceCursor(cur, seq, 0)
		idx -= advanceCursorToOffset(cur, idx)
		child := cur.getChildSequence()
		if child == nil {
			break
		}
		seq = child
	}
	d.PanicIfTrue(cur == nil)
	return cur
}

View File

@@ -34,17 +34,15 @@ func intsToValueSlice(ints ...int) ValueSlice {
}
// generateNumbersAsValues returns n generated number values as a []Value —
// presumably Numbers for 0..n-1 via generateNumbersAsValuesFromToBy(0, n, 1);
// confirm against that helper's body. Panics if n is not positive.
func generateNumbersAsValues(n int) []Value {
	d.Chk.True(n > 0, "must be an integer greater than zero")
	return generateNumbersAsValuesFromToBy(0, n, 1)
}
// generateNumbersAsValueSlice returns n generated number values as a
// ValueSlice; identical to generateNumbersAsValues except for the declared
// return type. Panics if n is not positive.
func generateNumbersAsValueSlice(n int) ValueSlice {
	d.Chk.True(n > 0, "must be an integer greater than zero")
	return generateNumbersAsValuesFromToBy(0, n, 1)
}
func generateNumbersAsValuesFromToBy(from, to, by int) ValueSlice {
d.Chk.True(to > from, "to must be greater than from")
d.Chk.True(to >= from, "to must be greater than or equal to from")
d.Chk.True(by > 0, "must be an integer greater than zero")
nums := []Value{}
for i := from; i < to; i += by {
@@ -58,7 +56,7 @@ func generateNumbersAsStructs(n int) ValueSlice {
}
func generateNumbersAsStructsFromToBy(from, to, by int) ValueSlice {
d.Chk.True(to > from, "to must be greater than from")
d.Chk.True(to >= from, "to must be greater than or equal to from")
d.Chk.True(by > 0, "must be an integer greater than zero")
nums := []Value{}
for i := from; i < to; i += by {
@@ -68,7 +66,6 @@ func generateNumbersAsStructsFromToBy(from, to, by int) ValueSlice {
}
func generateNumbersAsRefOfStructs(n int) []Value {
d.Chk.True(n > 0, "must be an integer greater than zero")
vs := NewTestValueStore()
nums := []Value{}
for i := 0; i < n; i++ {