Implement compoundList Set/Insert/Remove/RemoveAt.

2026-05-12 19:39:32 -05:00 · 2015-12-09 10:51:49 -08:00
parent 798e7fac7e
commit c5a6382d25
3 changed files with 330 additions and 17 deletions
@@ -105,12 +105,21 @@ func (cl compoundList) MapP(concurrency int, mf MapFunc) []interface{} {
 }

 func (cl compoundList) Set(idx uint64, v Value) List {
-	panic("not implemented")
+	// Replace in place: position a chunker at idx, skip the item currently there, append v where it was, then finish the chunker.
+	chunker := cl.sequenceChunkerAtIndex(idx)
+	chunker.Skip()
+	chunker.Append(v)
+	result := chunker.Done()
+	return result.(List)
 }

 func (cl compoundList) Append(vs ...Value) List {
-	// TODO: add short circuitry to immediately create a cursor pointing to the end of the list.
-	seq := cl.sequenceChunkerAtIndex(cl.Len())
+	// Appending is insertion at index cl.Len(); Insert also handles the empty-vs case by returning cl unchanged.
+	return cl.Insert(cl.Len(), vs...)
+}
+
+func (cl compoundList) Insert(idx uint64, vs ...Value) List {
+	if len(vs) == 0 {
+		return cl
+	}
+	seq := cl.sequenceChunkerAtIndex(idx)
 	for _, v := range vs {
 		seq.Append(v)
 	}
@@ -118,6 +127,7 @@ func (cl compoundList) Append(vs ...Value) List {
 }

 func (cl compoundList) sequenceCursorAtIndex(idx uint64) *sequenceCursor {
+	// TODO: An optimisation would be to decide at each level whether to step forward or backward across the node to find the insertion point, depending on which is closer. This would make Append much faster.
 	metaCur, leaf, start := cl.cursorAt(idx)
 	return &sequenceCursor{metaCur, leaf, int(idx - start), len(leaf.values), func(list sequenceItem, idx int) sequenceItem {
 		return list.(listLeaf).values[idx]
@@ -136,16 +146,20 @@ func (cl compoundList) Filter(cb listFilterCallback) List {
 	panic("not implemented")
 }

-func (cl compoundList) Insert(idx uint64, v ...Value) List {
-	panic("not implemented")
-}
-
 func (cl compoundList) Remove(start uint64, end uint64) List {
-	panic("not implemented")
+	// Removes the half-open index range [start, end). An empty range returns the receiver unchanged; end < start fails the d.Chk check below.
+	if start == end {
+		return cl
+	}
+	d.Chk.True(end > start)
+	seq := cl.sequenceChunkerAtIndex(start)
+	// Each Skip advances the chunker's cursor past one item without appending it, so that item is left out of the rebuilt sequence.
+	for i := start; i < end; i++ {
+		seq.Skip()
+	}
+	// Assert to the List interface, as Set does, rather than to the concrete compoundList: Done returns a Value, and the chunker may produce a shallower tree after a removal, so a concrete-type assertion could panic.
+	return seq.Done().(List)
 }

 func (cl compoundList) RemoveAt(idx uint64) List {
-	panic("not implemented")
+	// Removing a single element is removing the range [idx, idx+1).
+	return cl.Remove(idx, idx+1)
 }

 func (cl compoundList) Iter(f listIterFunc) {
@@ -10,8 +10,36 @@ import (

 type testSimpleList []Value

+// Set returns a new testSimpleList equal to tsl except that the element at idx is replaced by v. tsl itself is not modified.
+func (tsl testSimpleList) Set(idx int, v Value) testSimpleList {
+	head, tail := tsl[:idx], tsl[idx+1:]
+	// Start from a nil slice so the result never shares a backing array with tsl.
+	var out testSimpleList
+	out = append(out, head...)
+	out = append(out, v)
+	return append(out, tail...)
+}
+
+// Insert returns a new testSimpleList with vs spliced in before position idx. tsl itself is not modified.
+func (tsl testSimpleList) Insert(idx int, vs ...Value) testSimpleList {
+	// Start from a nil slice and concatenate the three pieces in order: prefix, inserted values, suffix.
+	var out testSimpleList
+	for _, part := range [][]Value{tsl[:idx], vs, tsl[idx:]} {
+		out = append(out, part...)
+	}
+	return out
+}
+
+// Remove returns a new testSimpleList without the elements in the half-open range [start, end). tsl itself is not modified.
+func (tsl testSimpleList) Remove(start, end int) testSimpleList {
+	// Copy the prefix onto a nil slice, then add everything from end onwards.
+	kept := append(testSimpleList(nil), tsl[:start]...)
+	return append(kept, tsl[end:]...)
+}
+
+// RemoveAt returns a new testSimpleList without the element at idx, by removing the range [idx, idx+1).
+func (tsl testSimpleList) RemoveAt(idx int) testSimpleList {
+	next := idx + 1
+	return tsl.Remove(idx, next)
+}
+
+// ToNomsList passes the elements of tsl to NewList, together with cs, and returns the resulting List.
+func (tsl testSimpleList) ToNomsList(cs chunks.ChunkStore) List {
+	values := []Value(tsl)
+	return NewList(cs, values...)
+}
+
 func getTestSimpleListLen() uint64 {
-	return uint64(listPattern) * 200
+	// The fixture length is a fixed multiple of listPattern. NOTE(review): listPattern is defined elsewhere; presumably the multiple is chosen so the list spans several chunks — confirm.
+	return uint64(listPattern) * 50
 }

 func getTestSimpleList() testSimpleList {
@@ -25,6 +53,14 @@ func getTestSimpleList() testSimpleList {
 	return values
 }

+// testSimpleListFromNomsList copies the contents of list into a new testSimpleList of length list.Len(), storing each value at the offset IterAll reports for it.
+func testSimpleListFromNomsList(list List) testSimpleList {
+	n := list.Len()
+	out := make(testSimpleList, n)
+	list.IterAll(func(item Value, i uint64) {
+		out[i] = item
+	})
+	return out
+}
+
 func TestCompoundListGet(t *testing.T) {
 	assert := assert.New(t)

@@ -171,6 +207,223 @@ func TestCompoundListAppend(t *testing.T) {
 	assert.True(newCompoundList(expected).Equals(cl6))
 }

+// TestCompoundListInsertNothing checks that inserting zero values leaves the list equal to the original at the start, at power-of-two indices, at the last index and at the end.
+func TestCompoundListInsertNothing(t *testing.T) {
+	assert := assert.New(t)
+
+	store := chunks.NewMemoryStore()
+	list := getTestSimpleList().ToNomsList(store)
+
+	// An insertion of no values at idx must produce a list equal to the original.
+	unchanged := func(idx uint64) {
+		assert.True(list.Equals(list.Insert(idx)))
+	}
+
+	unchanged(0)
+	for i := uint64(1); i < getTestSimpleListLen(); i *= 2 {
+		unchanged(i)
+	}
+	unchanged(list.Len() - 1)
+	unchanged(list.Len())
+}
+
+// TestCompoundListInsertStart inserts single values, pairs and whole test lists at index 0, each on top of the previous result, and checks contents, length and equality against the same edits applied to a testSimpleList.
+func TestCompoundListInsertStart(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	cl := getTestSimpleList().ToNomsList(cs)
+	cl2 := cl.Insert(0, Int64(42))
+	cl3 := cl2.Insert(0, Int64(43))
+	cl4 := cl3.Insert(0, getTestSimpleList()...)
+	cl5 := cl4.Insert(0, Int64(44), Int64(45))
+	cl6 := cl5.Insert(0, getTestSimpleList()...)
+
+	// The original list is checked after all the inserts, which also verifies that Insert did not modify it.
+	expected := getTestSimpleList()
+	assert.Equal(expected, testSimpleListFromNomsList(cl))
+	assert.Equal(getTestSimpleListLen(), cl.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl))
+
+	// From here on, expected is rebuilt step by step with the same edits that produced cl2..cl6.
+	expected = expected.Insert(0, Int64(42))
+	assert.Equal(expected, testSimpleListFromNomsList(cl2))
+	assert.Equal(getTestSimpleListLen()+1, cl2.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl2))
+
+	expected = expected.Insert(0, Int64(43))
+	assert.Equal(expected, testSimpleListFromNomsList(cl3))
+	assert.Equal(getTestSimpleListLen()+2, cl3.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl3))
+
+	expected = expected.Insert(0, getTestSimpleList()...)
+	assert.Equal(expected, testSimpleListFromNomsList(cl4))
+	assert.Equal(2*getTestSimpleListLen()+2, cl4.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl4))
+
+	expected = expected.Insert(0, Int64(44), Int64(45))
+	assert.Equal(expected, testSimpleListFromNomsList(cl5))
+	assert.Equal(2*getTestSimpleListLen()+4, cl5.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl5))
+
+	expected = expected.Insert(0, getTestSimpleList()...)
+	assert.Equal(expected, testSimpleListFromNomsList(cl6))
+	assert.Equal(3*getTestSimpleListLen()+4, cl6.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl6))
+}
+
+// TestCompoundListInsertMiddle inserts single values, pairs and whole test lists at indices 100 through 600, each on top of the previous result, and checks contents, length and equality against the same edits applied to a testSimpleList.
+func TestCompoundListInsertMiddle(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	cl := getTestSimpleList().ToNomsList(cs)
+	cl2 := cl.Insert(100, Int64(42))
+	cl3 := cl2.Insert(200, Int64(43))
+	cl4 := cl3.Insert(300, getTestSimpleList()...)
+	cl5 := cl4.Insert(400, Int64(44), Int64(45))
+	cl6 := cl5.Insert(500, getTestSimpleList()...)
+	cl7 := cl6.Insert(600, Int64(100))
+
+	// The original list is checked after all the inserts, which also verifies that Insert did not modify it.
+	expected := getTestSimpleList()
+	assert.Equal(expected, testSimpleListFromNomsList(cl))
+	assert.Equal(getTestSimpleListLen(), cl.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl))
+
+	// From here on, expected is rebuilt step by step with the same edits that produced cl2..cl7.
+	expected = expected.Insert(100, Int64(42))
+	assert.Equal(expected, testSimpleListFromNomsList(cl2))
+	assert.Equal(getTestSimpleListLen()+1, cl2.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl2))
+
+	expected = expected.Insert(200, Int64(43))
+	assert.Equal(expected, testSimpleListFromNomsList(cl3))
+	assert.Equal(getTestSimpleListLen()+2, cl3.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl3))
+
+	expected = expected.Insert(300, getTestSimpleList()...)
+	assert.Equal(expected, testSimpleListFromNomsList(cl4))
+	assert.Equal(2*getTestSimpleListLen()+2, cl4.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl4))
+
+	expected = expected.Insert(400, Int64(44), Int64(45))
+	assert.Equal(expected, testSimpleListFromNomsList(cl5))
+	assert.Equal(2*getTestSimpleListLen()+4, cl5.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl5))
+
+	expected = expected.Insert(500, getTestSimpleList()...)
+	assert.Equal(expected, testSimpleListFromNomsList(cl6))
+	assert.Equal(3*getTestSimpleListLen()+4, cl6.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl6))
+
+	expected = expected.Insert(600, Int64(100))
+	assert.Equal(expected, testSimpleListFromNomsList(cl7))
+	assert.Equal(3*getTestSimpleListLen()+5, cl7.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl7))
+}
+
+// TestCompoundListInsertRanges cuts windows of several sizes out of the test list at several positions, inserts them back with Insert and expects the result to equal the whole list. A second, cheaper pass checks only the resulting length.
+func TestCompoundListInsertRanges(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	testList := getTestSimpleList()
+	whole := testList.ToNomsList(cs)
+
+	// Compare list equality. Increment by 256 (16^2) because each iteration requires building a new list, which is slow.
+	for incr, i := 256, 0; i < len(testList)-incr; i += incr {
+		// Window sizes are 1, 16 and 256.
+		for window := 1; window <= incr; window *= 16 {
+			testListPart := testList.Remove(i, i+window)
+			actual := testListPart.ToNomsList(cs).Insert(uint64(i), testList[i:i+window]...)
+			assert.Equal(whole.Len(), actual.Len())
+			assert.True(whole.Equals(actual))
+		}
+	}
+
+	// Compare list length, which doesn't require building a new list every iteration, so the increment can be smaller.
+	for incr, i := 10, 0; i < len(testList); i += incr {
+		assert.Equal(len(testList)+incr, int(whole.Insert(uint64(i), testList[0:incr]...).Len()))
+	}
+}
+
+// TestCompoundListRemoveNothing checks that removing an empty range leaves the list equal to the original at the start, at power-of-two indices, at the last index and at the end.
+func TestCompoundListRemoveNothing(t *testing.T) {
+	assert := assert.New(t)
+
+	store := chunks.NewMemoryStore()
+	list := getTestSimpleList().ToNomsList(store)
+
+	// Removing the empty range [idx, idx) must produce a list equal to the original.
+	unchanged := func(idx uint64) {
+		assert.True(list.Equals(list.Remove(idx, idx)))
+	}
+
+	unchanged(0)
+	for i := uint64(1); i < getTestSimpleListLen(); i *= 2 {
+		unchanged(i)
+	}
+	unchanged(list.Len() - 1)
+	unchanged(list.Len())
+}
+
+// TestCompoundListRemoveAtMiddle removes the element at index 100 and then, from that result, the element at index 200, checking contents, length and equality against the same edits applied to a testSimpleList.
+func TestCompoundListRemoveAtMiddle(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	cl := getTestSimpleList().ToNomsList(cs)
+	cl2 := cl.RemoveAt(100)
+	cl3 := cl2.RemoveAt(200)
+
+	// The original list is checked after the removals, which also verifies that RemoveAt did not modify it.
+	expected := getTestSimpleList()
+	assert.Equal(expected, testSimpleListFromNomsList(cl))
+	assert.Equal(getTestSimpleListLen(), cl.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl))
+
+	expected = expected.RemoveAt(100)
+	assert.Equal(expected, testSimpleListFromNomsList(cl2))
+	assert.Equal(getTestSimpleListLen()-1, cl2.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl2))
+
+	expected = expected.RemoveAt(200)
+	assert.Equal(expected, testSimpleListFromNomsList(cl3))
+	assert.Equal(getTestSimpleListLen()-2, cl3.Len())
+	assert.True(expected.ToNomsList(cs).Equals(cl3))
+}
+
+// TestCompoundListRemoveRanges removes windows of several sizes at several positions and compares each result with a list built from the correspondingly shortened testSimpleList. A second, cheaper pass checks only the resulting length.
+func TestCompoundListRemoveRanges(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	testList := getTestSimpleList()
+	whole := testList.ToNomsList(cs)
+
+	// Compare list equality. Increment by 256 (16^2) because each iteration requires building a new list, which is slow.
+	for incr, i := 256, 0; i < len(testList)-incr; i += incr {
+		// Window sizes are 1, 16 and 256.
+		for window := 1; window <= incr; window *= 16 {
+			testListPart := testList.Remove(i, i+window)
+			expected := testListPart.ToNomsList(cs)
+			actual := whole.Remove(uint64(i), uint64(i+window))
+			assert.Equal(expected.Len(), actual.Len())
+			assert.True(expected.Equals(actual))
+		}
+	}
+
+	// Compare list length, which doesn't require building a new list every iteration, so the increment can be smaller.
+	for incr, i := 10, 0; i < len(testList)-incr; i += incr {
+		assert.Equal(len(testList)-incr, int(whole.Remove(uint64(i), uint64(i+incr)).Len()))
+	}
+}
+
+// TestCompoundListSet replaces single elements with a value that does not occur in the test data and checks that the result differs from the original list and, for a coarser set of indices, equals a list built from the same edit on a testSimpleList.
+func TestCompoundListSet(t *testing.T) {
+	assert := assert.New(t)
+
+	cs := chunks.NewMemoryStore()
+	testList := getTestSimpleList()
+	cl := testList.ToNomsList(cs)
+
+	// testIdx sets index idx to a new value, always asserts the list changed, and optionally compares with a freshly built expected list.
+	testIdx := func(idx int, testEquality bool) {
+		newVal := Int64(-1) // Test values are never < 0
+		cl2 := cl.Set(uint64(idx), newVal)
+		assert.False(cl.Equals(cl2))
+		if testEquality {
+			assert.True(testList.Set(idx, newVal).ToNomsList(cs).Equals(cl2))
+		}
+	}
+
+	// Compare list equality. Increment by 100 because each iteration requires building a new list, which is slow, but always test the last index.
+	for incr, i := 100, 0; i < len(testList); i += incr {
+		testIdx(i, true)
+	}
+	testIdx(len(testList)-1, true)
+
+	// Compare list inequality, which doesn't require building a new list every iteration, so the increment can be smaller.
+	for incr, i := 10, 0; i < len(testList); i += incr {
+		testIdx(i, false)
+	}
+}
+
 func TestCompoundListSlice(t *testing.T) {
 	assert := assert.New(t)

@@ -20,6 +20,7 @@ type sequenceChunker struct {
 	makeChunk, parentMakeChunk makeChunkFn
 	boundaryChk                boundaryChecker
 	newBoundaryChecker         newBoundaryCheckerFn
+	used                       bool
 }

 // makeChunkFn takes a sequence of items to chunk, and returns the result of chunking those items, a tuple of a reference to that chunk which can itself be chunked + its underlying value.
@@ -30,6 +31,7 @@ func newEmptySequenceChunker(makeChunk, parentMakeChunk makeChunkFn, boundaryChk
 }

 func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChunkFn, boundaryChk boundaryChecker, newBoundaryChecker newBoundaryCheckerFn) *sequenceChunker {
+	// |cur| will be nil if this is a new sequence, implying this is a new tree, or the tree has grown in height relative to its original chunked form.
 	d.Chk.NotNil(makeChunk)
 	d.Chk.NotNil(parentMakeChunk)
 	d.Chk.NotNil(boundaryChk)
@@ -42,19 +44,22 @@ func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChun
 		makeChunk, parentMakeChunk,
 		boundaryChk,
 		newBoundaryChecker,
+		false,
 	}

 	if cur != nil {
-		// Eagerly create a chunker for each level of the existing tree. This is correct while sequences can only ever append, and therefore the tree can only ever grow in height, but generally speaking the tree can also shrink - due to both removals and changes - and in that situation we can't simply create every meta-node that was in the cursor. If we did that, we'd end up with meta-nodes with only a single entry, which is illegal.
+		// Eagerly create a chunker for each level of the existing tree, but note that we may not necessarily need them all, since chunk boundaries may change such that the tree ends up shallower. The |seq.used| flag accounts for that case.
 		if cur.parent != nil {
 			seq.createParent()
 		}
 		// Prime the chunker into the state it would be if all items in the sequence had been appended one at a time.
-		for _, item := range cur.maxNPrevItems(boundaryChk.WindowSize()) {
+		// This can be WindowSize-1, not WindowSize, because the first appended item will fill the remaining spot in the hash window.
+		for _, item := range cur.maxNPrevItems(boundaryChk.WindowSize() - 1) {
 			boundaryChk.Write(item)
 		}
 		// Reconstruct this entire chunk.
 		seq.current = cur.maxNPrevItems(cur.indexInChunk())
+		seq.used = len(seq.current) > 0
 	}

 	return seq
@@ -68,19 +73,33 @@ func (seq *sequenceChunker) Append(item sequenceItem) {
 		seq.commitPendingFirst()
 	}
 	seq.current = append(seq.current, item)
+	seq.used = true
 	if seq.boundaryChk.Write(item) {
 		seq.handleChunkBoundary()
 	}
 }

+// Skip advances the chunker's cursor by one item without appending that item, and keeps the parent chunker's cursor in step when a chunk boundary is crossed.
+func (seq *sequenceChunker) Skip() {
+	if !seq.cur.advance() {
+		// The cursor could not advance; there is nothing further to do here.
+		return
+	}
+	if seq.cur.indexInChunk() != 0 {
+		// Still inside the same chunk, so the parent is unaffected.
+		return
+	}
+	// Advancing moved the cursor into the next chunk. The parent's cursor must advance too, so that when the parent writes out the remaining chunks it doesn't include the chunk that was skipped.
+	seq.skipParentIfExists()
+}
+
+// skipParentIfExists forwards a Skip to the parent chunker, provided a parent exists and that parent has a cursor; otherwise it does nothing.
+func (seq *sequenceChunker) skipParentIfExists() {
+	parent := seq.parent
+	if parent == nil || parent.cur == nil {
+		return
+	}
+	parent.Skip()
+}
+
 func (seq *sequenceChunker) createParent() {
 	d.Chk.True(seq.parent == nil)
-	var curParent *sequenceCursor
-	// seq.cur will be nil if it points to the root of the chunked tree.
+	// parent stays nil when this chunker has no cursor, or its cursor has no parent; the parent chunker is then built without a cursor.
+	var parent *sequenceCursor
 	if seq.cur != nil && seq.cur.parent != nil {
-		curParent = seq.cur.parent.clone()
+		// Clone the parent cursor because otherwise calling cur.advance() will affect our parent - and vice versa - in surprising ways. Instead, Skip moves forward our parent's cursor if we advance across a boundary.
+		parent = seq.cur.parent.clone()
 	}
-	seq.parent = newSequenceChunker(curParent, seq.parentMakeChunk, seq.parentMakeChunk, seq.newBoundaryChecker(), seq.newBoundaryChecker)
+	// The parent uses parentMakeChunk for both its own chunks and those of its parent, and gets a fresh boundary checker.
+	seq.parent = newSequenceChunker(parent, seq.parentMakeChunk, seq.parentMakeChunk, seq.newBoundaryChecker(), seq.newBoundaryChecker)
 }

 func (seq *sequenceChunker) commitPendingFirst() {
@@ -102,18 +121,45 @@ func (seq *sequenceChunker) handleChunkBoundary() {
 }

 func (seq *sequenceChunker) Done() Value {
+	// A chunker built from a cursor must first deal with whatever follows the cursor in the original sequence.
+	if seq.cur != nil {
+		seq.finalizeCursor()
+	}
+
 	if seq.pendingFirst != nil {
 		d.Chk.True(seq.parent == nil)
 		d.Chk.Equal(0, len(seq.current))
 		_, done := seq.makeChunk(seq.pendingFirst)
 		return done
 	}
-	if seq.parent != nil {
+
+	// Defer to the parent only if it has actually received items. A parent that was created eagerly but never used is ignored, and the chunk made from seq.current below becomes the result.
+	if seq.parent != nil && seq.parent.used {
 		if len(seq.current) > 0 {
 			seq.handleChunkBoundary()
 		}
 		return seq.parent.Done()
 	}
+
 	_, done := seq.makeChunk(seq.current)
 	return done
 }
+
+// finalizeCursor is called by Done when the chunker was built from a cursor. If the cursor is already past the end it only skips the parent; otherwise it re-appends the items that follow the cursor in the original sequence, at least WindowSize of them and then on to the end of the chunk it stops in, or until the sequence runs out.
+func (seq *sequenceChunker) finalizeCursor() {
+	if _, ok := seq.cur.maybeCurrent(); !ok {
+		// The cursor is past the end, and due to the way cursors work, the parent cursor will actually point to its last chunk. We need to force it to point past the end so that our parent's Done() method doesn't add the last chunk twice.
+		seq.skipParentIfExists()
+		return
+	}
+
+	// Append the rest of the values in the sequence, up to the window size, plus the rest of that chunk. It needs to be the full window size because anything that was appended/skipped between chunker construction and finalization will have changed the hash state.
+	// A clone is walked so that seq.cur itself is left where it is.
+	fzr := seq.cur.clone()
+	for i := 0; i < seq.boundaryChk.WindowSize() || fzr.indexInChunk() > 0; i++ {
+		if i == 0 || fzr.indexInChunk() == 0 {
+			// Every time we step into a chunk from the original sequence, that chunk will no longer exist in the new sequence. The parent must be instructed to skip it.
+			seq.skipParentIfExists()
+		}
+		seq.Append(fzr.current())
+		// Stop as soon as the original sequence is exhausted.
+		if !fzr.advance() {
+			break
+		}
+	}
+}