Fix sequence chunker bug triggered by repeatedly removing last element (#1844)

Fix sequence chunker bug triggered by repeatedly removing last element. The bug is that we sometimes create a prollytree whose root meta sequence node has only a single item, which is never the canonical representation of a prollytree. I reworked the sequence chunker to take a different approach to corner cases: instead of being smart and trying to avoid this case (which clearly didn't work properly), it is more liberal about creating unnecessary nodes, and then fixes them up in the finalisation step.
2026-05-02 03:10:42 -05:00 · 2016-06-20 18:35:46 -07:00
parent afaec8a6ad
commit 1b9ea570ae
16 changed files with 350 additions and 221 deletions
@@ -149,14 +149,16 @@ func newBlobLeafBoundaryChecker() boundaryChecker {
 }

 func newBlobLeafChunkFn(vr ValueReader, sink ValueWriter) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		buff := make([]byte, len(items))

 		for i, v := range items {
 			buff[i] = v.(byte)
 		}

-		blob := newBlob(newBlobLeafSequence(vr, buff))
+		seq := newBlobLeafSequence(vr, buff)
+		blob := newBlob(seq)
+
 		var ref Ref
 		var child Collection
 		if sink != nil {
@@ -168,7 +170,7 @@ func newBlobLeafChunkFn(vr ValueReader, sink ValueWriter) makeChunkFn {
 			child = blob
 		}

-		return newMetaTuple(ref, Number(len(buff)), uint64(len(buff)), child), blob
+		return newMetaTuple(ref, Number(len(buff)), uint64(len(buff)), child), seq
 	}
 }

@@ -189,6 +191,6 @@ func NewStreamingBlob(r io.Reader, vrw ValueReadWriter) Blob {
 			break
 		}
 	}
-	return seq.Done().(Blob)
+	return newBlob(seq.Done().(indexedSequence))

 }
@@ -110,7 +110,7 @@ func newIndexedMetaSequenceBoundaryChecker() boundaryChecker {
 // If |sink| is not nil, chunks will be eagerly written as they're created. Otherwise they are
 // written when the root is written.
 func newIndexedMetaSequenceChunkFn(kind NomsKind, source ValueReader, sink ValueWriter) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		tuples := make(metaSequenceData, len(items))
 		numLeaves := uint64(0)

@@ -121,17 +121,18 @@ func newIndexedMetaSequenceChunkFn(kind NomsKind, source ValueReader, sink Value
 		}

 		var col Collection
+		var metaSeq indexedMetaSequence
 		if kind == ListKind {
-			metaSeq := newListMetaSequence(tuples, source)
+			metaSeq = newListMetaSequence(tuples, source)
 			col = newList(metaSeq)
 		} else {
 			d.Chk.True(BlobKind == kind)
-			metaSeq := newBlobMetaSequence(tuples, source)
+			metaSeq = newBlobMetaSequence(tuples, source)
 			col = newBlob(metaSeq)
 		}
 		if sink != nil {
-			return newMetaTuple(sink.WriteValue(col), Number(tuples.uint64ValuesSum()), numLeaves, nil), col
+			return newMetaTuple(sink.WriteValue(col), Number(tuples.uint64ValuesSum()), numLeaves, nil), metaSeq
 		}
-		return newMetaTuple(NewRef(col), Number(tuples.uint64ValuesSum()), numLeaves, col), col
+		return newMetaTuple(NewRef(col), Number(tuples.uint64ValuesSum()), numLeaves, col), metaSeq
 	}
 }
@@ -32,7 +32,7 @@ func NewList(values ...Value) List {
 	for _, v := range values {
 		seq.Append(v)
 	}
-	return seq.Done().(List)
+	return newList(seq.Done().(indexedSequence))
 }

 // NewStreamingList creates a new List with type t, populated with values, chunking if and when needed. As chunks are created, they're written to vrw -- including the root chunk of the list. Once the caller has closed values, she can read the completed List from the returned channel.
@@ -43,7 +43,7 @@ func NewStreamingList(vrw ValueReadWriter, values <-chan Value) <-chan List {
 		for v := range values {
 			seq.Append(v)
 		}
-		out <- seq.Done().(List)
+		out <- newList(seq.Done().(indexedSequence))
 		close(out)
 	}()
 	return out
@@ -151,7 +151,7 @@ func (l List) Splice(idx uint64, deleteCount uint64, vs ...Value) List {
 	for _, v := range vs {
 		ch.Append(v)
 	}
-	return ch.Done().(List)
+	return newList(ch.Done().(indexedSequence))
 }

 func (l List) Insert(idx uint64, vs ...Value) List {
@@ -159,6 +159,7 @@ func (l List) Insert(idx uint64, vs ...Value) List {
 }

 func (l List) Remove(start uint64, end uint64) List {
+	d.Chk.True(start <= end)
 	return l.Splice(start, end-start)
 }

@@ -222,14 +223,15 @@ func newListLeafBoundaryChecker() boundaryChecker {
 // If |sink| is not nil, chunks will be eagerly written as they're created. Otherwise they are
 // written when the root is written.
 func makeListLeafChunkFn(vr ValueReader, sink ValueWriter) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		values := make([]Value, len(items))

 		for i, v := range items {
 			values[i] = v.(Value)
 		}

-		list := newList(newListLeafSequence(vr, values...))
+		seq := newListLeafSequence(vr, values...)
+		list := newList(seq)

 		var ref Ref
 		var child Collection
@@ -242,6 +244,6 @@ func makeListLeafChunkFn(vr ValueReader, sink ValueWriter) makeChunkFn {
 			child = list
 		}

-		return newMetaTuple(ref, Number(len(values)), uint64(len(values)), child), list
+		return newMetaTuple(ref, Number(len(values)), uint64(len(values)), child), seq
 	}
 }
@@ -208,7 +208,10 @@ func getTestListLen() uint64 {
 }

 func getTestList() testList {
-	length := int(getTestListLen())
+	return getTestListWithLen(int(getTestListLen()))
+}
+
+func getTestListWithLen(length int) testList {
 	s := rand.NewSource(42)
 	values := make([]Value, length)
 	for i := 0; i < length; i++ {
@@ -541,6 +544,22 @@ func TestListRemoveRanges(t *testing.T) {
 	}
 }

+func TestListRemoveAtEnd(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping test in short mode.")
+	}
+	assert := assert.New(t)
+
+	tl := getTestListWithLen(testListSize / 10)
+	cl := tl.toList()
+
+	for i := len(tl) - 1; i >= 0; i-- {
+		cl = cl.Remove(uint64(i), uint64(i+1))
+		expect := tl[0:i].toList()
+		assert.True(expect.Equals(cl))
+	}
+}
+
 func TestListSet(t *testing.T) {
 	if testing.Short() {
 		t.Skip("Skipping test in short mode.")
@@ -34,7 +34,7 @@ func NewMap(kv ...Value) Map {
 		seq.Append(entry)
 	}

-	return seq.Done().(Map)
+	return newMap(seq.Done().(orderedSequence))
 }

 func (m Map) Diff(last Map) (added []Value, removed []Value, modified []Value) {
@@ -149,7 +149,7 @@ func (m Map) splice(cur *sequenceCursor, deleteCount uint64, vs ...mapEntry) Map
 	for _, v := range vs {
 		ch.Append(v)
 	}
-	return ch.Done().(Map)
+	return newMap(ch.Done().(orderedSequence))
 }

 func (m Map) getCursorAtValue(v Value) (cur *sequenceCursor, found bool) {
@@ -234,14 +234,15 @@ func newMapLeafBoundaryChecker() boundaryChecker {
 }

 func makeMapLeafChunkFn(vr ValueReader) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		mapData := make([]mapEntry, len(items), len(items))

 		for i, v := range items {
 			mapData[i] = v.(mapEntry)
 		}

-		m := newMap(newMapLeafSequence(vr, mapData...))
+		seq := newMapLeafSequence(vr, mapData...)
+		m := newMap(seq)

 		var indexValue Value
 		if len(mapData) > 0 {
@@ -251,6 +252,6 @@ func makeMapLeafChunkFn(vr ValueReader) makeChunkFn {
 			}
 		}

-		return newMetaTuple(NewRef(m), indexValue, uint64(len(items)), m), m
+		return newMetaTuple(NewRef(m), indexValue, uint64(len(items)), m), seq
 	}
 }
@@ -146,7 +146,7 @@ func newOrderedMetaSequenceBoundaryChecker() boundaryChecker {
 }

 func newOrderedMetaSequenceChunkFn(kind NomsKind, vr ValueReader) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		tuples := make(metaSequenceData, len(items))
 		numLeaves := uint64(0)

@@ -156,16 +156,17 @@ func newOrderedMetaSequenceChunkFn(kind NomsKind, vr ValueReader) makeChunkFn {
 			numLeaves += mt.numLeaves
 		}

+		var metaSeq orderedMetaSequence
 		var col Collection
 		if kind == SetKind {
-			metaSeq := newSetMetaSequence(tuples, vr)
+			metaSeq = newSetMetaSequence(tuples, vr)
 			col = newSet(metaSeq)
 		} else {
 			d.Chk.True(MapKind == kind)
-			metaSeq := newMapMetaSequence(tuples, vr)
+			metaSeq = newMapMetaSequence(tuples, vr)
 			col = newMap(metaSeq)
 		}

-		return newMetaTuple(NewRef(col), tuples.last().value, numLeaves, col), col
+		return newMetaTuple(NewRef(col), tuples.last().value, numLeaves, col), metaSeq
 	}
 }
@@ -17,17 +17,17 @@ type newBoundaryCheckerFn func() boundaryChecker

 type sequenceChunker struct {
 	cur                        *sequenceCursor
-	isOnChunkBoundary          bool
 	parent                     *sequenceChunker
 	current                    []sequenceItem
+	lastSeq                    sequence
 	makeChunk, parentMakeChunk makeChunkFn
 	boundaryChk                boundaryChecker
 	newBoundaryChecker         newBoundaryCheckerFn
-	used                       bool
+	done                       bool
 }

 // makeChunkFn takes a sequence of items to chunk and returns the result of chunking them: a metaTuple referencing the new chunk (which can itself be chunked by a parent) plus the chunk's underlying sequence.
-type makeChunkFn func(values []sequenceItem) (metaTuple, Collection)
+type makeChunkFn func(values []sequenceItem) (metaTuple, sequence)

 func newEmptySequenceChunker(makeChunk, parentMakeChunk makeChunkFn, boundaryChk boundaryChecker, newBoundaryChecker newBoundaryCheckerFn) *sequenceChunker {
 	return newSequenceChunker(nil, makeChunk, parentMakeChunk, boundaryChk, newBoundaryChecker)
@@ -40,11 +40,11 @@ func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChun
 	d.Chk.True(boundaryChk != nil)
 	d.Chk.True(newBoundaryChecker != nil)

-	seq := &sequenceChunker{
+	sc := &sequenceChunker{
 		cur,
-		false,
 		nil,
 		[]sequenceItem{},
+		nil,
 		makeChunk, parentMakeChunk,
 		boundaryChk,
 		newBoundaryChecker,
@@ -52,9 +52,8 @@ func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChun
 	}

 	if cur != nil {
-		// Eagerly create a chunker for each level of the existing tree, but note that we may not necessarily need them all, since chunk boundaries may change such that the tree ends up shallower. The |seq.used| flag accounts for that case.
 		if cur.parent != nil {
-			seq.createParent()
+			sc.createParent()
 		}

 		// Number of previous items which must be hashed into the boundary checker.
@@ -81,7 +80,7 @@ func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChun
 			backIdx := len(prev) - i
 			if appendPenultimate && backIdx == 1 {
 				// Test the penultimate item for a boundary.
-				seq.Append(item)
+				sc.Append(item)
 				continue
 			}

@@ -90,108 +89,126 @@ func newSequenceChunker(cur *sequenceCursor, makeChunk, parentMakeChunk makeChun
 			}

 			if backIdx <= primeCurrentCount {
-				seq.current = append(seq.current, item)
+				sc.current = append(sc.current, item)
 			}
 		}
+	}

-		seq.used = len(seq.current) > 0
+	return sc
+}
+
+func (sc *sequenceChunker) Append(item sequenceItem) {
+	d.Chk.True(item != nil)
+	sc.current = append(sc.current, item)
+	if sc.boundaryChk.Write(item) {
+		sc.handleChunkBoundary(true)
+	}
+}
+
+func (sc *sequenceChunker) Skip() {
+	if sc.cur.advance() && sc.cur.indexInChunk() == 0 {
+		// Advancing moved our cursor into the next chunk. We need to advance our parent's cursor, so that when our parent writes out the remaining chunks it doesn't include the chunk that we skipped.
+		sc.skipParentIfExists()
+	}
+}
+
+func (sc *sequenceChunker) skipParentIfExists() {
+	if sc.parent != nil && sc.parent.cur != nil {
+		sc.parent.Skip()
+	}
+}
+
+func (sc *sequenceChunker) createParent() {
+	d.Chk.True(sc.parent == nil)
+	var parent *sequenceCursor
+	if sc.cur != nil && sc.cur.parent != nil {
+		// Clone the parent cursor because otherwise calling cur.advance() will affect our parent - and vice versa - in surprising ways. Instead, Skip moves forward our parent's cursor if we advance across a boundary.
+		parent = sc.cur.parent.clone()
+	}
+	sc.parent = newSequenceChunker(parent, sc.parentMakeChunk, sc.parentMakeChunk, sc.newBoundaryChecker(), sc.newBoundaryChecker)
+}
+
+func (sc *sequenceChunker) handleChunkBoundary(createParentIfNil bool) {
+	d.Chk.NotEmpty(sc.current)
+	chunk, seq := sc.makeChunk(sc.current)
+	sc.current = []sequenceItem{}
+	sc.lastSeq = seq
+	if sc.parent == nil && createParentIfNil {
+		sc.createParent()
+	}
+	if sc.parent != nil {
+		sc.parent.Append(chunk)
+	}
+}
+
+func (sc *sequenceChunker) Done() sequence {
+	d.Chk.False(sc.done)
+	sc.done = true
+
+	for s := sc; s != nil; s = s.parent {
+		if s.cur != nil {
+			s.finalizeCursor()
+		}
+	}
+
+	// Chunkers will probably have current items which didn't hit a chunk boundary. Pretend they end on chunk boundaries for now.
+	for s := sc; s != nil; s = s.parent {
+		if len(s.current) > 0 {
+			// Don't create a new parent if we haven't chunked.
+			s.handleChunkBoundary(s.lastSeq != nil)
+		}
+	}
+
+	// The rest of this code figures out which sequence in the parent chain is canonical. That is:
+	// * It's empty, or
+	// * It never chunked, so it's not a prollytree, or
+	// * It chunked, so it's a prollytree, but it must have at least 2 children (or it could have been represented as that 1 child).
+	//
+	// Examples of when we may have constructed non-canonical sequences:
+	// * If the previous tree (i.e. its cursor) was deeper, we will have created empty parents.
+	// * If the last appended item was on a chunk boundary, there may be a sequence with a single chunk.
+
+	// Firstly, follow up the parent chain to find the highest chunker which did chunk.
+	var seq sequence
+	for s := sc; s != nil; s = s.parent {
+		if s.lastSeq != nil {
+			seq = s.lastSeq
+		}
+	}
+
+	if seq == nil {
+		_, seq = sc.makeChunk([]sequenceItem{})
+		return seq
+	}
+
+	// Lastly, step back down to find a meta sequence with more than 1 child.
+	for seq.seqLen() <= 1 {
+		d.Chk.NotEqual(0, seq.seqLen())
+		ms, ok := seq.(metaSequence)
+		if !ok {
+			break
+		}
+		seq = ms.getChildSequence(0)
 	}

 	return seq
 }

-func (seq *sequenceChunker) Append(item sequenceItem) {
-	d.Chk.True(item != nil)
-	// Check |isOnChunkBoundary| immediately, because it's effectively a continuation from the last call to Append. Specifically, this happens when the last call to Append created the first chunk boundary, which delayed creating the parent until absolutely necessary. Otherwise, we will be in a state where a parent has only a single item, which is invalid.
-	if seq.isOnChunkBoundary {
-		seq.createParent()
-		seq.handleChunkBoundary()
-		seq.isOnChunkBoundary = false
-	}
-	seq.current = append(seq.current, item)
-	seq.used = true
-	if seq.boundaryChk.Write(item) {
-		seq.handleChunkBoundary()
-	}
-}
-
-func (seq *sequenceChunker) Skip() {
-	if seq.cur.advance() && seq.cur.indexInChunk() == 0 {
-		// Advancing moved our cursor into the next chunk. We need to advance our parent's cursor, so that when our parent writes out the remaining chunks it doesn't include the chunk that we skipped.
-		seq.skipParentIfExists()
-	}
-}
-
-func (seq *sequenceChunker) skipParentIfExists() {
-	if seq.parent != nil && seq.parent.cur != nil {
-		seq.parent.Skip()
-	}
-}
-
-func (seq *sequenceChunker) createParent() {
-	d.Chk.True(seq.parent == nil)
-	var parent *sequenceCursor
-	if seq.cur != nil && seq.cur.parent != nil {
-		// Clone the parent cursor because otherwise calling cur.advance() will affect our parent - and vice versa - in surprising ways. Instead, Skip moves forward our parent's cursor if we advance across a boundary.
-		parent = seq.cur.parent.clone()
-	}
-	seq.parent = newSequenceChunker(parent, seq.parentMakeChunk, seq.parentMakeChunk, seq.newBoundaryChecker(), seq.newBoundaryChecker)
-}
-
-func (seq *sequenceChunker) handleChunkBoundary() {
-	d.Chk.NotEmpty(seq.current)
-	if seq.parent == nil {
-		// Wait until there is a parent.
-		d.Chk.False(seq.isOnChunkBoundary)
-		seq.isOnChunkBoundary = true
-	} else {
-		chunk, _ := seq.makeChunk(seq.current)
-		seq.parent.Append(chunk)
-		seq.current = []sequenceItem{}
-	}
-}
-
-func (seq *sequenceChunker) Done() Value {
-	if seq.cur != nil {
-		seq.finalizeCursor()
-	}
-
-	if seq.isRoot() {
-		_, done := seq.makeChunk(seq.current)
-		d.Chk.True(done != nil)
-		return done
-	}
-
-	if len(seq.current) > 0 {
-		seq.handleChunkBoundary()
-	}
-	return seq.parent.Done()
-}
-
-func (seq *sequenceChunker) isRoot() bool {
-	for ancstr := seq.parent; ancstr != nil; ancstr = ancstr.parent {
-		if ancstr.used {
-			return false
-		}
-	}
-	return true
-}
-
-func (seq *sequenceChunker) finalizeCursor() {
-	if !seq.cur.valid() {
+func (sc *sequenceChunker) finalizeCursor() {
+	if !sc.cur.valid() {
 		// The cursor is past the end, and due to the way cursors work, the parent cursor will actually point to its last chunk. We need to force it to point past the end so that our parent's Done() method doesn't add the last chunk twice.
-		seq.skipParentIfExists()
+		sc.skipParentIfExists()
 		return
 	}

 	// Append the rest of the values in the sequence, up to the window size, plus the rest of that chunk. It needs to be the full window size because anything that was appended/skipped between chunker construction and finalization will have changed the hash state.
-	fzr := seq.cur.clone()
-	for i := 0; i < seq.boundaryChk.WindowSize() || fzr.indexInChunk() > 0; i++ {
+	fzr := sc.cur.clone()
+	for i := 0; i < sc.boundaryChk.WindowSize() || fzr.indexInChunk() > 0; i++ {
 		if i == 0 || fzr.indexInChunk() == 0 {
 			// Every time we step into a chunk from the original sequence, that chunk will no longer exist in the new sequence. The parent must be instructed to skip it.
-			seq.skipParentIfExists()
+			sc.skipParentIfExists()
 		}
-		seq.Append(fzr.current())
+		sc.Append(fzr.current())
 		if !fzr.advance() {
 			break
 		}
@@ -34,7 +34,7 @@ func NewSet(v ...Value) Set {
 		seq.Append(v)
 	}

-	return seq.Done().(Set)
+	return newSet(seq.Done().(orderedSequence))
 }

 func (s Set) Diff(last Set) (added []Value, removed []Value) {
@@ -146,7 +146,7 @@ func (s Set) splice(cur *sequenceCursor, deleteCount uint64, vs ...Value) Set {
 		ch.Append(v)
 	}

-	ns := ch.Done().(Set)
+	ns := newSet(ch.Done().(orderedSequence))
 	return ns
 }

@@ -211,14 +211,15 @@ func newSetLeafBoundaryChecker() boundaryChecker {
 }

 func makeSetLeafChunkFn(vr ValueReader) makeChunkFn {
-	return func(items []sequenceItem) (metaTuple, Collection) {
+	return func(items []sequenceItem) (metaTuple, sequence) {
 		setData := make([]Value, len(items), len(items))

 		for i, v := range items {
 			setData[i] = v.(Value)
 		}

-		set := newSet(newSetLeafSequence(vr, setData...))
+		seq := newSetLeafSequence(vr, setData...)
+		set := newSet(seq)

 		var indexValue Value
 		if len(setData) > 0 {
@@ -228,6 +229,6 @@ func makeSetLeafChunkFn(vr ValueReader) makeChunkFn {
 			}
 		}

-		return newMetaTuple(NewRef(set), indexValue, uint64(len(items)), set), set
+		return newMetaTuple(NewRef(set), indexValue, uint64(len(items)), set), seq
 	}
 }
@@ -156,7 +156,7 @@ function newBlobLeafChunkFn(vr: ?ValueReader, vw: ?ValueWriter): makeChunkFn {
    } else {
      mt = new MetaTuple(new Ref(blob), items.length, items.length, blob);
    }
-    return [mt, blob];
+    return [mt, blobLeaf];
  };
 }

@@ -187,7 +187,7 @@ export class BlobWriter {

  close() {
    assert(this._state === 'writable');
-    this._blob = this._chunker.doneSync();
+    this._blob = Blob.fromSequence(this._chunker.doneSync());
    this._state = 'closed';
  }

@@ -203,6 +203,17 @@ suite('List', () => {
    assert.strictEqual(listOfNRef, s.hash.toString());
  });

+  test('LONG: remove at end', async() => {
+    const nums = intSequence(testListSize / 20);
+    let s = new List(nums);
+
+    for (let i = nums.length - 1; i >= 0; i--) {
+      s = await s.remove(i, i + 1);
+      const expect = new List(nums.slice(0, i));
+      assert.isTrue(equals(expect, s));
+    }
+  });
+
  test('LONG: splice', async () => {
    const nums = intSequence(testListSize);
    let s = new List(nums);
@@ -35,14 +35,15 @@ const listPattern = ((1 << 6) | 0) - 1;

 function newListLeafChunkFn<T: Value>(vr: ?ValueReader, vw: ?ValueWriter): makeChunkFn {
  return (items: Array<T>) => {
-    const list = List.fromSequence(newListLeafSequence(vr, items));
+    const seq = newListLeafSequence(vr, items);
+    const list = List.fromSequence(seq);
    let mt;
    if (vw) {
      mt = new MetaTuple(vw.writeValue(list), items.length, items.length, null);
    } else {
      mt = new MetaTuple(new Ref(list), items.length, items.length, list);
    }
-    return [mt, list];
+    return [mt, seq];
  };
 }

@@ -54,13 +55,14 @@ function newListLeafBoundaryChecker<T: Value>(): BoundaryChecker<T> {

 export default class List<T: Value> extends Collection<IndexedSequence> {
  constructor(values: Array<T> = []) {
-    const self = chunkSequenceSync(
+    const seq = chunkSequenceSync(
        values,
        newListLeafChunkFn(null, null),
        newIndexedMetaSequenceChunkFn(Kind.List, null, null),
        newListLeafBoundaryChecker(),
        newIndexedMetaSequenceBoundaryChecker);
-    super(self.sequence);
+    invariant(seq instanceof IndexedSequence);
+    super(seq);
  }

  async get(idx: number): Promise<T> {
@@ -74,7 +76,7 @@ export default class List<T: Value> extends Collection<IndexedSequence> {
      chunkSequence(cursor, insert, deleteCount, newListLeafChunkFn(vr, null),
                    newIndexedMetaSequenceChunkFn(Kind.List, vr, null),
                    newListLeafBoundaryChecker(),
-                    newIndexedMetaSequenceBoundaryChecker));
+                    newIndexedMetaSequenceBoundaryChecker)).then(s => List.fromSequence(s));
  }

  insert(idx: number, ...values: Array<T>): Promise<List<T>> {
@@ -175,7 +177,7 @@ type ListWriterState = 'writable' | 'closed';
 export class ListWriter<T: Value> {
  _state: ListWriterState;
  _list: ?List<T>;
-  _chunker: SequenceChunker<List<T>, T, ListLeafSequence<T>>;
+  _chunker: SequenceChunker<T, ListLeafSequence<T>>;

  constructor(vrw: ?ValueReadWriter) {
    this._state = 'writable';
@@ -191,7 +193,7 @@ export class ListWriter<T: Value> {

  close() {
    assert(this._state === 'writable');
-    this._list = this._chunker.doneSync();
+    this._list = List.fromSequence(this._chunker.doneSync());
    this._state = 'closed';
  }

@@ -4,6 +4,7 @@
 // Licensed under the Apache License, version 2.0:
 // http://www.apache.org/licenses/LICENSE-2.0

+import {invariant} from './assert.js';
 import BuzHashBoundaryChecker from './buzhash-boundary-checker.js';
 import Ref from './ref.js';
 import type {ValueReader} from './value-store.js';
@@ -44,9 +45,10 @@ function newMapLeafChunkFn<K: Value, V: Value>(vr: ?ValueReader):
      }
    }

-    const nm = Map.fromSequence(newMapLeafSequence(vr, items));
+    const seq = newMapLeafSequence(vr, items);
+    const nm = Map.fromSequence(seq);
    const mt = new MetaTuple(new Ref(nm), indexValue, items.length, nm);
-    return [mt, nm];
+    return [mt, seq];
  };
 }

@@ -88,13 +90,14 @@ function buildMapData<K: Value, V: Value>(
 export default class Map<K: Value, V: Value> extends
    Collection<OrderedSequence> {
  constructor(kvs: Array<MapEntry<K, V>> = []) {
-    const self = chunkSequenceSync(
+    const seq = chunkSequenceSync(
        buildMapData(kvs),
        newMapLeafChunkFn(null),
        newOrderedMetaSequenceChunkFn(Kind.Map, null),
        newMapLeafBoundaryChecker(),
        newOrderedMetaSequenceBoundaryChecker);
-    super(self.sequence);
+    invariant(seq instanceof OrderedSequence);
+    super(seq);
  }

  async has(key: K): Promise<boolean> {
@@ -152,7 +155,7 @@ export default class Map<K: Value, V: Value> extends
    return chunkSequence(cursor, insert, remove, newMapLeafChunkFn(vr),
                         newOrderedMetaSequenceChunkFn(Kind.Map, vr),
                         newMapLeafBoundaryChecker(),
-                         newOrderedMetaSequenceBoundaryChecker);
+                         newOrderedMetaSequenceBoundaryChecker).then(s => Map.fromSequence(s));
  }

  async set(key: K, value: V): Promise<Map<K, V>> {
@@ -48,6 +48,10 @@ export class MetaTuple<K> {
          return c.sequence;
        });
  }
+
+  getSequenceSync(): Sequence {
+    return notNull(this.child).sequence;
+  }
 }

 // The elemTypes of the collection inside the Ref<Collection<?, ?>>
@@ -125,6 +129,15 @@ export class IndexedMetaSequence extends IndexedSequence<MetaTuple<number>> {
    return mt.getSequence(this.vr);
  }

+  getChildSequenceSync(idx: number): ?Sequence {
+    if (!this.isMeta) {
+      return null;
+    }
+
+    const mt = this.items[idx];
+    return mt.getSequenceSync();
+  }
+
  // Returns the sequences pointed to by all items[i], s.t. start <= i < end, and returns the
  // concatenation as one long composite sequence
  getCompositeChildSequence(start: number, length: number):
@@ -195,6 +208,15 @@ export class OrderedMetaSequence<K: Value> extends OrderedSequence<K, MetaTuple<
    return mt.getSequence(this.vr);
  }

+  getChildSequenceSync(idx: number): ?Sequence {
+    if (!this.isMeta) {
+      return null;
+    }
+
+    const mt = this.items[idx];
+    return mt.getSequenceSync();
+  }
+
  getKey(idx: number): K {
    return this.items[idx].value;
  }
@@ -209,14 +231,17 @@ export function newOrderedMetaSequenceChunkFn(kind: NomsKind, vr: ?ValueReader):
  return (tuples: Array<MetaTuple>) => {
    const numLeaves = tuples.reduce((l, mt) => l + mt.numLeaves, 0);
    const last = tuples[tuples.length - 1];
+    let seq: OrderedMetaSequence;
    let col: Collection;
    if (kind === Kind.Map) {
-      col = Map.fromSequence(newMapMetaSequence(vr, tuples));
+      seq = newMapMetaSequence(vr, tuples);
+      col = Map.fromSequence(seq);
    } else {
      invariant(kind === Kind.Set);
-      col = Set.fromSequence(newSetMetaSequence(vr, tuples));
+      seq = newSetMetaSequence(vr, tuples);
+      col = Set.fromSequence(seq);
    }
-    return [new MetaTuple(new Ref(col), last.value, numLeaves, col), col];
+    return [new MetaTuple(new Ref(col), last.value, numLeaves, col), seq];
  };
 }

@@ -237,12 +262,15 @@ export function newIndexedMetaSequenceChunkFn(kind: NomsKind, vr: ?ValueReader,
      invariant(mt.value === mt.numLeaves);
      return l + mt.value;
    }, 0);
+    let seq: IndexedMetaSequence;
    let col: Collection;
    if (kind === Kind.List) {
-      col = List.fromSequence(newListMetaSequence(vr, tuples));
+      seq = newListMetaSequence(vr, tuples);
+      col = List.fromSequence(seq);
    } else {
      invariant(kind === Kind.Blob);
-      col = Blob.fromSequence(newBlobMetaSequence(vr, tuples));
+      seq = newBlobMetaSequence(vr, tuples);
+      col = Blob.fromSequence(seq);
    }
    let mt;
    if (vw) {
@@ -250,7 +278,7 @@ export function newIndexedMetaSequenceChunkFn(kind: NomsKind, vr: ?ValueReader,
    } else {
      mt = new MetaTuple(new Ref(col), sum, sum, col);
    }
-    return [mt, col];
+    return [mt, seq];
  };
 }

@@ -6,27 +6,26 @@

 import type Sequence from './sequence.js'; // eslint-disable-line no-unused-vars
 import {invariant, notNull} from './assert.js';
-import type Collection from './collection.js';
 import type {MetaSequence, MetaTuple} from './meta-sequence.js';
 import type {SequenceCursor} from './sequence.js';

 export type BoundaryChecker<T> = {
  write: (item: T) => boolean;
  windowSize: number;
-}
+};

 export type NewBoundaryCheckerFn = () => BoundaryChecker<MetaTuple>;

-export type makeChunkFn<T: Collection> = (items: Array<any>) => [MetaTuple, T];
+export type makeChunkFn<T, S: Sequence> = (items: Array<T>) => [MetaTuple, S];

-export async function chunkSequence<C: Collection, S>(
+export async function chunkSequence<T, S: Sequence<T>>(
    cursor: SequenceCursor,
-    insert: Array<S>,
+    insert: Array<T>,
    remove: number,
-    makeChunk: makeChunkFn<C>,
-    parentMakeChunk: makeChunkFn<C>,
-    boundaryChecker: BoundaryChecker<S>,
-    newBoundaryChecker: NewBoundaryCheckerFn): Promise<C> {
+    makeChunk: makeChunkFn<T, S>,
+    parentMakeChunk: makeChunkFn<MetaTuple, MetaSequence>,
+    boundaryChecker: BoundaryChecker<T>,
+    newBoundaryChecker: NewBoundaryCheckerFn): Promise<Sequence> {

  const chunker = new SequenceChunker(cursor, makeChunk, parentMakeChunk, boundaryChecker,
                                      newBoundaryChecker);
@@ -49,12 +48,12 @@ export async function chunkSequence<C: Collection, S>(
 // Like |chunkSequence|, but without an existing cursor (implying this is a new collection), so it
 // can be synchronous. Necessary for constructing collections without a Promises or async/await.
 // There is no equivalent in the Go code because Go is already synchronous.
-export function chunkSequenceSync<C: Collection, S>(
-    insert: Array<S>,
-    makeChunk: makeChunkFn<C>,
-    parentMakeChunk: makeChunkFn<C>,
-    boundaryChecker: BoundaryChecker<S>,
-    newBoundaryChecker: NewBoundaryCheckerFn): C {
+export function chunkSequenceSync<T, S: Sequence<T>>(
+    insert: Array<T>,
+    makeChunk: makeChunkFn<T, S>,
+    parentMakeChunk: makeChunkFn<MetaTuple, MetaSequence>,
+    boundaryChecker: BoundaryChecker<T>,
+    newBoundaryChecker: NewBoundaryCheckerFn): Sequence {

  const chunker = new SequenceChunker(null, makeChunk, parentMakeChunk, boundaryChecker,
                                      newBoundaryChecker);
@@ -64,30 +63,30 @@ export function chunkSequenceSync<C: Collection, S>(
  return chunker.doneSync();
 }

-export default class SequenceChunker<C: Collection, S, U: Sequence> {
-  _cursor: ?SequenceCursor<S, U>;
-  _isOnChunkBoundary: boolean;
-  _parent: ?SequenceChunker<C, MetaTuple, MetaSequence>;
-  _current: Array<S>;
-  _makeChunk: makeChunkFn<C>;
-  _parentMakeChunk: makeChunkFn<C>;
-  _boundaryChecker: BoundaryChecker<S>;
+export default class SequenceChunker<T, S: Sequence<T>> {
+  _cursor: ?SequenceCursor<T, S>;
+  _parent: ?SequenceChunker<MetaTuple, MetaSequence>;
+  _current: Array<T>;
+  _lastSeq: ?S;
+  _makeChunk: makeChunkFn<T, S>;
+  _parentMakeChunk: makeChunkFn<MetaTuple, MetaSequence>;
+  _boundaryChecker: BoundaryChecker<T>;
  _newBoundaryChecker: NewBoundaryCheckerFn;
-  _used: boolean;
+  _done: boolean;

  constructor(cursor: ?SequenceCursor, makeChunk: makeChunkFn,
              parentMakeChunk: makeChunkFn,
-              boundaryChecker: BoundaryChecker<S>,
+              boundaryChecker: BoundaryChecker<T>,
              newBoundaryChecker: NewBoundaryCheckerFn) {
    this._cursor = cursor;
-    this._isOnChunkBoundary = false;
    this._parent = null;
    this._current = [];
+    this._lastSeq = null;
    this._makeChunk = makeChunk;
    this._parentMakeChunk = parentMakeChunk;
    this._boundaryChecker = boundaryChecker;
    this._newBoundaryChecker = newBoundaryChecker;
-    this._used = false;
+    this._done = false;
  }

  async resume(): Promise<void> {
@@ -131,20 +130,12 @@ export default class SequenceChunker<C: Collection, S, U: Sequence> {
        this._current.push(item);
      }
    }
-
-    this._used = this._current.length > 0;
  }

-  append(item: S) {
-    if (this._isOnChunkBoundary) {
-      this.createParent();
-      this.handleChunkBoundary();
-      this._isOnChunkBoundary = false;
-    }
+  append(item: T) {
    this._current.push(item);
-    this._used = true;
    if (this._boundaryChecker.write(item)) {
-      this.handleChunkBoundary();
+      this.handleChunkBoundary(true);
    }
  }

@@ -172,63 +163,88 @@ export default class SequenceChunker<C: Collection, S, U: Sequence> {
        this._newBoundaryChecker);
  }

-  handleChunkBoundary() {
+  handleChunkBoundary(createParentIfNil: boolean) {
    invariant(this._current.length > 0);
-    const parent = this._parent;
-    if (!parent) {
-      invariant(!this._isOnChunkBoundary);
-      this._isOnChunkBoundary = true;
-    } else {
-      invariant(this._current.length > 0);
-      const chunk = this._makeChunk(this._current)[0];
-      parent.append(chunk);
-      this._current = [];
+    const [chunk, seq] = this._makeChunk(this._current);
+    this._current = [];
+    this._lastSeq = seq;
+    if (!this._parent && createParentIfNil) {
+      this.createParent();
+    }
+    if (this._parent) {
+      this._parent.append(chunk);
    }
  }

-  async done(): Promise<C> {
-    if (this._cursor) {
-      await this.finalizeCursor();
+  async done(): Promise<Sequence> {
+    invariant(!this._done);
+    this._done = true;
+
+    for (let s = this; s; s = s._parent) {
+      if (s._cursor) {
+        await s.finalizeCursor();
+      }
    }

-    if (this.isRoot()) {
-      return this._makeChunk(this._current)[1];
+    // Chunkers will probably have current items which didn't hit a chunk boundary. Pretend they end
+    // on chunk boundaries for now.
+    this.finalizeChunkBoundaries();
+
+    // The rest of this code figures out which sequence in the parent chain is canonical. That is:
+    // * It's empty, or
+    // * It never chunked, so it's not a prollytree, or
+    // * It chunked, so it's a prollytree, and it has at least 2 children (otherwise it could
+    //   have been represented as that 1 child).
+    //
+    // Examples of when we may have constructed non-canonical sequences:
+    // * If the previous tree (i.e. its cursor) was deeper, we will have created empty parents.
+    // * If the last appended item was on a chunk boundary, there may be a sequence with a single
+    //   chunk.
+
+    // Firstly, follow up the parent chain to find the highest chunker which did chunk.
+    let seq = this.findRoot();
+    if (!seq) {
+      seq = this._makeChunk([])[1];
+      return seq;
    }

-    if (this._current.length > 0) {
-      this.handleChunkBoundary();
+    // Lastly, step back down until reaching a leaf or a meta sequence with more than 1 child.
+    while (seq.length <= 1) {
+      invariant(seq.length !== 0);
+      if (!seq.isMeta) {
+        break;
+      }
+      seq = notNull(await seq.getChildSequence(0));
    }

-    invariant(this._parent);
-    return this._parent.done();
+    return notNull(seq); // Flow cannot infer that seq is non-null here, hence the notNull
  }

  // Like |done|, but assumes there is no cursor, so it can be synchronous. Necessary for
  // constructing collections without Promises or async/await. There is no equivalent in the Go
  // code because Go is already synchronous.
-  doneSync(): C {
+  doneSync(): Sequence {
    invariant(!this._cursor);
+    invariant(!this._done);
+    this._done = true;

-    if (this.isRoot()) {
-      return this._makeChunk(this._current)[1];
+    this.finalizeChunkBoundaries();
+
+    let seq = this.findRoot();
+    if (!seq) {
+      seq = this._makeChunk([])[1];
+      return seq;
    }

-    if (this._current.length > 0) {
-      this.handleChunkBoundary();
-    }
-
-    invariant(this._parent);
-    return this._parent.doneSync();
-  }
-
-  isRoot(): boolean {
-    for (let ancestor = this._parent; ancestor; ancestor = ancestor._parent) {
-      if (ancestor._used) {
-        return false;
+    while (seq.length <= 1) {
+      invariant(seq.length !== 0);
+      if (!seq.isMeta) {
+        break;
      }
+      seq = notNull(seq.getChildSequenceSync(0));
    }

-    return true;
+    return notNull(seq); // Flow cannot infer that seq is non-null here, hence the notNull
  }

  async finalizeCursor(): Promise<void> {
@@ -250,4 +266,23 @@ export default class SequenceChunker<C: Collection, S, U: Sequence> {
      }
    }
  }
+
+  finalizeChunkBoundaries() {
+    for (let s = this; s; s = s._parent) {
+      if (s._current.length > 0) {
+        // Don't create a new parent if we haven't chunked.
+        s.handleChunkBoundary(Boolean(s._lastSeq));
+      }
+    }
+  }
+
+  findRoot(): ?Sequence {
+    let root = null;
+    for (let s = this; s; s = s._parent) {
+      if (s._lastSeq) {
+        root = s._lastSeq;
+      }
+    }
+    return root;
+  }
 }
@@ -43,6 +43,10 @@ export default class Sequence<T> {
    return Promise.resolve(null);
  }

+  getChildSequenceSync(idx: number): ?Sequence { // eslint-disable-line no-unused-vars
+    return null;
+  }
+
  get chunks(): Array<Ref> {
    return [];
  }
@@ -41,9 +41,10 @@ function newSetLeafChunkFn<T:Value>(vr: ?ValueReader): makeChunkFn {
      }
    }

-    const ns = Set.fromSequence(newSetLeafSequence(vr, items));
+    const seq = newSetLeafSequence(vr, items);
+    const ns = Set.fromSequence(seq);
    const mt = new MetaTuple(new Ref(ns), indexValue, items.length, ns);
-    return [mt, ns];
+    return [mt, seq];
  };
 }

@@ -68,13 +69,14 @@ export function newSetLeafSequence<K: Value>(

 export default class Set<T: Value> extends Collection<OrderedSequence> {
  constructor(values: Array<T> = []) {
-    const self = chunkSequenceSync(
+    const seq = chunkSequenceSync(
        buildSetData(values),
        newSetLeafChunkFn(null),
        newOrderedMetaSequenceChunkFn(Kind.Set, null),
        newSetLeafBoundaryChecker(),
        newOrderedMetaSequenceBoundaryChecker);
-    super(self.sequence);
+    invariant(seq instanceof OrderedSequence);
+    super(seq);
  }

  async has(key: T): Promise<boolean> {
@@ -118,7 +120,7 @@ export default class Set<T: Value> extends Collection<OrderedSequence> {
    return chunkSequence(cursor, insert, remove, newSetLeafChunkFn(vr),
                         newOrderedMetaSequenceChunkFn(Kind.Set, vr),
                         newSetLeafBoundaryChecker(),
-                         newOrderedMetaSequenceBoundaryChecker);
+                         newOrderedMetaSequenceBoundaryChecker).then(s => Set.fromSequence(s));
  }

  async add(value: T): Promise<Set<T>> {