Implement Set.Union for compound sets through a shared merge helper.

The Set interface gains the unexported methods elemType() and
sequenceCursorAtFirst(), and assertSetsSameType now accepts any Set instead
of only setLeaf. compoundSet.Union (previously panic("not implemented")) and
setLeaf.Union (previously one Insert per element of every other set) both
delegate to the new setUnion in set.go. setUnion opens one sequence cursor
per non-empty input, repeatedly appends the smallest current item to a fresh
sequence chunker, and skips an item that Equals the last one appended.

diff --git a/types/assert.go b/types/assert.go
index c4fceda096..13e5e5f470 100644
--- a/types/assert.go
+++ b/types/assert.go
@@ -10,7 +10,7 @@ func assertType(t Type, v ...Value) {
 	}
 }
 
-func assertSetsSameType(s setLeaf, v ...Set) {
+func assertSetsSameType(s Set, v ...Set) {
 	if s.elemType().Kind() != ValueKind {
 		t := s.Type()
 		for _, v := range v {
diff --git a/types/compound_set.go b/types/compound_set.go
index 1659da6915..346468e046 100644
--- a/types/compound_set.go
+++ b/types/compound_set.go
@@ -89,23 +89,38 @@ func (cs compoundSet) Remove(values ...Value) Set {
 	return res.Remove(tail...)
 }
 
-func (cs compoundSet) sequenceChunkerAtValue(v Value) (*sequenceChunker, bool) {
+func (cs compoundSet) sequenceCursorAtValue(v Value) (*sequenceCursor, bool) {
 	metaCur, leaf, idx := cs.findLeaf(v)
-
 	cur := &sequenceCursor{metaCur, leaf, idx, len(leaf.data), func(otherLeaf sequenceItem, idx int) sequenceItem {
 		return otherLeaf.(setLeaf).data[idx]
 	}, func(mt sequenceItem) (sequenceItem, int) {
 		otherLeaf := readMetaTupleValue(mt, cs.cs).(setLeaf)
 		return otherLeaf, len(otherLeaf.data)
 	}}
-
-	seq := newSequenceChunker(cur, makeSetLeafChunkFn(cs.t, cs.cs), newSetMetaSequenceChunkFn(cs.t, cs.cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker)
 	found := idx < len(leaf.data) && leaf.data[idx].Equals(v)
+	return cur, found
+}
+
+func (cs compoundSet) sequenceChunkerAtValue(v Value) (*sequenceChunker, bool) {
+	cur, found := cs.sequenceCursorAtValue(v)
+	seq := newSequenceChunker(cur, makeSetLeafChunkFn(cs.t, cs.cs), newSetMetaSequenceChunkFn(cs.t, cs.cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker)
 	return seq, found
 }
 
+func (cs compoundSet) elemType() Type {
+	return cs.t.Desc.(CompoundDesc).ElemTypes[0]
+}
+
+func (cs compoundSet) sequenceCursorAtFirst() *sequenceCursor {
+	// TODO: This can be done more efficiently - Bug 795
+	v := cs.First()
+	cur, found := cs.sequenceCursorAtValue(v)
+	d.Chk.True(found)
+	return cur
+}
+
 func (cs compoundSet) Union(others ...Set) Set {
-	panic("not implemented")
+	return setUnion(cs, cs.cs, others)
 }
 
 func (cs compoundSet) Subtract(others ...Set) Set {
diff --git a/types/compound_set_test.go b/types/compound_set_test.go
index b8a44e87ee..aafeec12a8 100644
--- a/types/compound_set_test.go
+++ b/types/compound_set_test.go
@@ -282,3 +282,53 @@ func TestCompoundSetFilter(t *testing.T) {
 	doTest(getTestRefToNativeOrderSet(2))
 	doTest(getTestRefToValueOrderSet(2))
 }
+
+func TestCompoundSetUnion(t *testing.T) {
+	assert := assert.New(t)
+	ms := chunks.NewMemoryStore()
+
+	doTest := func(ts testSet) {
+		cs := ts.toCompoundSet(ms)
+		cs2 := cs.Union()
+		assert.True(cs.Equals(cs2))
+		cs3 := cs.Union(cs2)
+		assert.True(cs.Equals(cs3))
+		cs4 := cs.Union(cs2, cs3)
+		assert.True(cs.Equals(cs4))
+		emptySet := NewTypedSet(ms, ts.tr)
+		cs5 := cs.Union(emptySet)
+		assert.True(cs.Equals(cs5))
+		cs6 := emptySet.Union(cs)
+		assert.True(cs.Equals(cs6))
+
+		r := rand.New(rand.NewSource(123))
+		subsetValues1 := make([]Value, 0, len(ts.values))
+		subsetValues2 := make([]Value, 0, len(ts.values))
+		subsetValues3 := make([]Value, 0, len(ts.values))
+		subsetValuesAll := make([]Value, 0, len(ts.values))
+		for _, v := range ts.values {
+			if r.Intn(3) == 0 {
+				subsetValues1 = append(subsetValues1, v)
+				subsetValuesAll = append(subsetValuesAll, v)
+			} else if r.Intn(3) == 0 {
+				subsetValues2 = append(subsetValues2, v)
+				subsetValuesAll = append(subsetValuesAll, v)
+			} else if r.Intn(3) == 0 {
+				subsetValues3 = append(subsetValues3, v)
+				subsetValuesAll = append(subsetValuesAll, v)
+			}
+		}
+
+		s1 := NewTypedSet(ms, ts.tr, subsetValues1...)
+		s2 := NewTypedSet(ms, ts.tr, subsetValues2...)
+		s3 := NewTypedSet(ms, ts.tr, subsetValues3...)
+		sAll := NewTypedSet(ms, ts.tr, subsetValuesAll...)
+
+		assert.True(s1.Union(s2, s3).Equals(sAll))
+	}
+
+	doTest(getTestNativeOrderSet(16))
+	doTest(getTestRefValueOrderSet(2))
+	doTest(getTestRefToNativeOrderSet(2))
+	doTest(getTestRefToValueOrderSet(2))
+}
diff --git a/types/set.go b/types/set.go
index 8de815b60d..5e8e97f492 100644
--- a/types/set.go
+++ b/types/set.go
@@ -1,6 +1,9 @@
 package types
 
-import "github.com/attic-labs/noms/chunks"
+import (
+	"github.com/attic-labs/noms/chunks"
+	"github.com/attic-labs/noms/d"
+)
 
 type Set interface {
 	Value
@@ -16,6 +19,8 @@ type Set interface {
 	IterAll(cb setIterAllCallback)
 	IterAllP(concurrency int, f setIterAllCallback)
 	Filter(cb setFilterCallback) Set
+	elemType() Type
+	sequenceCursorAtFirst() *sequenceCursor
 }
 
 type indexOfSetFn func(m setData, v Value) int
@@ -42,3 +47,66 @@ func newTypedSet(cs chunks.ChunkStore, t Type, data ...Value) Set {
 
 	return seq.Done().(Set)
 }
+
+func setUnion(set Set, cs chunks.ChunkStore, others []Set) Set {
+	// TODO: This can be done more efficiently by realizing that if two sets have the same meta tuple we only have to traverse one of the subtrees. Bug 794
+	if len(others) == 0 {
+		return set
+	}
+	assertSetsSameType(set, others...)
+
+	tr := set.Type()
+	seq := newEmptySequenceChunker(makeSetLeafChunkFn(tr, cs), newSetMetaSequenceChunkFn(tr, cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker)
+
+	var lessFunction func(a, b sequenceItem) bool
+	if isSequenceOrderedByIndexedType(tr) {
+		lessFunction = func(a, b sequenceItem) bool {
+			return a.(OrderedValue).Less(b.(OrderedValue))
+		}
+	} else {
+		lessFunction = func(a, b sequenceItem) bool {
+			return a.(Value).Ref().Less(b.(Value).Ref())
+		}
+	}
+
+	smallest := func(cursors map[*sequenceCursor]bool) (smallestCursor *sequenceCursor, smallestItem sequenceItem) {
+		for cursor, _ := range cursors {
+			currentItem := cursor.current()
+			if smallestCursor == nil || lessFunction(currentItem, smallestItem) {
+				smallestCursor = cursor
+				smallestItem = currentItem
+			}
+		}
+		return
+	}
+
+	cursors := make(map[*sequenceCursor]bool, len(others)+1)
+	if !set.Empty() {
+		cursor := set.sequenceCursorAtFirst()
+		cursors[cursor] = true
+	}
+	for _, s := range others {
+		if !s.Empty() {
+			cursor := s.sequenceCursorAtFirst()
+			cursors[cursor] = true
+		}
+	}
+
+	var last Value
+	for len(cursors) > 0 {
+		smallestCursor, smallestItem := smallest(cursors)
+		d.Chk.NotNil(smallestCursor)
+
+		// Don't add same value twice
+		if last == nil || !last.Equals(smallestItem.(Value)) {
+			seq.Append(smallestItem)
+			last = smallestItem.(Value)
+		}
+
+		if !smallestCursor.advance() {
+			delete(cursors, smallestCursor)
+		}
+	}
+
+	return seq.Done().(Set)
+}
diff --git a/types/set_leaf.go b/types/set_leaf.go
index 1fa1c71838..dca49851ef 100644
--- a/types/set_leaf.go
+++ b/types/set_leaf.go
@@ -57,15 +57,7 @@ func (s setLeaf) Remove(values ...Value) Set {
 }
 
 func (s setLeaf) Union(others ...Set) Set {
-	assertSetsSameType(s, others...)
-	var result Set = s
-	for _, other := range others {
-		other.Iter(func(v Value) (stop bool) {
-			result = result.Insert(v)
-			return
-		})
-	}
-	return result
+	return setUnion(s, s.cs, others)
 }
 
 func (s setLeaf) Subtract(others ...Set) Set {
@@ -241,3 +233,18 @@ func makeSetLeafChunkFn(t Type, cs chunks.ChunkStore) makeChunkFn {
 		return metaTuple{ref, indexValue}, setLeaf
 	}
 }
+
+func (s setLeaf) sequenceCursorAtFirst() *sequenceCursor {
+	return &sequenceCursor{
+		nil,
+		s.data,
+		0,
+		len(s.data),
+		func(parent sequenceItem, idx int) sequenceItem {
+			return s.data[idx]
+		},
+		func(reference sequenceItem) (sequence sequenceItem, length int) {
+			panic("unreachable")
+		},
+	}
+}