From b8be6908f88a4b553a125706bd995d6c229ce3da Mon Sep 17 00:00:00 2001 From: Erik Arvidsson Date: Wed, 16 Dec 2015 16:43:30 -0500 Subject: [PATCH] Implement Set Union This is done by creating a cursor for each set. This is a cursor for the actual values in the sets. We then pick the "smallest" value from the cursors and advance that cursor. This continues until we have exhausted all the cursors. setA.Union(set0, ... setN) The time complexity is O(len(setA) + len(set0) + ... + len(setN)) --- types/assert.go | 2 +- types/compound_set.go | 25 +++++++++++--- types/compound_set_test.go | 50 +++++++++++++++++++++++++++ types/set.go | 70 +++++++++++++++++++++++++++++++++++++- types/set_leaf.go | 25 +++++++++----- 5 files changed, 156 insertions(+), 16 deletions(-) diff --git a/types/assert.go b/types/assert.go index c4fceda096..13e5e5f470 100644 --- a/types/assert.go +++ b/types/assert.go @@ -10,7 +10,7 @@ func assertType(t Type, v ...Value) { } } -func assertSetsSameType(s setLeaf, v ...Set) { +func assertSetsSameType(s Set, v ...Set) { if s.elemType().Kind() != ValueKind { t := s.Type() for _, v := range v { diff --git a/types/compound_set.go b/types/compound_set.go index 1659da6915..346468e046 100644 --- a/types/compound_set.go +++ b/types/compound_set.go @@ -89,23 +89,38 @@ func (cs compoundSet) Remove(values ...Value) Set { return res.Remove(tail...) 
} -func (cs compoundSet) sequenceChunkerAtValue(v Value) (*sequenceChunker, bool) { +func (cs compoundSet) sequenceCursorAtValue(v Value) (*sequenceCursor, bool) { metaCur, leaf, idx := cs.findLeaf(v) - cur := &sequenceCursor{metaCur, leaf, idx, len(leaf.data), func(otherLeaf sequenceItem, idx int) sequenceItem { return otherLeaf.(setLeaf).data[idx] }, func(mt sequenceItem) (sequenceItem, int) { otherLeaf := readMetaTupleValue(mt, cs.cs).(setLeaf) return otherLeaf, len(otherLeaf.data) }} - - seq := newSequenceChunker(cur, makeSetLeafChunkFn(cs.t, cs.cs), newSetMetaSequenceChunkFn(cs.t, cs.cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker) found := idx < len(leaf.data) && leaf.data[idx].Equals(v) + return cur, found +} + +func (cs compoundSet) sequenceChunkerAtValue(v Value) (*sequenceChunker, bool) { + cur, found := cs.sequenceCursorAtValue(v) + seq := newSequenceChunker(cur, makeSetLeafChunkFn(cs.t, cs.cs), newSetMetaSequenceChunkFn(cs.t, cs.cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker) return seq, found } +func (cs compoundSet) elemType() Type { + return cs.t.Desc.(CompoundDesc).ElemTypes[0] +} + +func (cs compoundSet) sequenceCursorAtFirst() *sequenceCursor { + // TODO: This can be done more efficiently - Bug 795 + v := cs.First() + cur, found := cs.sequenceCursorAtValue(v) + d.Chk.True(found) + return cur +} + func (cs compoundSet) Union(others ...Set) Set { - panic("not implemented") + return setUnion(cs, cs.cs, others) } func (cs compoundSet) Subtract(others ...Set) Set { diff --git a/types/compound_set_test.go b/types/compound_set_test.go index b8a44e87ee..aafeec12a8 100644 --- a/types/compound_set_test.go +++ b/types/compound_set_test.go @@ -282,3 +282,53 @@ func TestCompoundSetFilter(t *testing.T) { doTest(getTestRefToNativeOrderSet(2)) doTest(getTestRefToValueOrderSet(2)) } + +func TestCompoundSetUnion(t *testing.T) { + assert := assert.New(t) + ms := chunks.NewMemoryStore() + + doTest := func(ts testSet) { 
+ cs := ts.toCompoundSet(ms) + cs2 := cs.Union() + assert.True(cs.Equals(cs2)) + cs3 := cs.Union(cs2) + assert.True(cs.Equals(cs3)) + cs4 := cs.Union(cs2, cs3) + assert.True(cs.Equals(cs4)) + emptySet := NewTypedSet(ms, ts.tr) + cs5 := cs.Union(emptySet) + assert.True(cs.Equals(cs5)) + cs6 := emptySet.Union(cs) + assert.True(cs.Equals(cs6)) + + r := rand.New(rand.NewSource(123)) + subsetValues1 := make([]Value, 0, len(ts.values)) + subsetValues2 := make([]Value, 0, len(ts.values)) + subsetValues3 := make([]Value, 0, len(ts.values)) + subsetValuesAll := make([]Value, 0, len(ts.values)) + for _, v := range ts.values { + if r.Intn(3) == 0 { + subsetValues1 = append(subsetValues1, v) + subsetValuesAll = append(subsetValuesAll, v) + } else if r.Intn(3) == 0 { + subsetValues2 = append(subsetValues2, v) + subsetValuesAll = append(subsetValuesAll, v) + } else if r.Intn(3) == 0 { + subsetValues3 = append(subsetValues3, v) + subsetValuesAll = append(subsetValuesAll, v) + } + } + + s1 := NewTypedSet(ms, ts.tr, subsetValues1...) + s2 := NewTypedSet(ms, ts.tr, subsetValues2...) + s3 := NewTypedSet(ms, ts.tr, subsetValues3...) + sAll := NewTypedSet(ms, ts.tr, subsetValuesAll...) 
+ + assert.True(s1.Union(s2, s3).Equals(sAll)) + } + + doTest(getTestNativeOrderSet(16)) + doTest(getTestRefValueOrderSet(2)) + doTest(getTestRefToNativeOrderSet(2)) + doTest(getTestRefToValueOrderSet(2)) +} diff --git a/types/set.go b/types/set.go index 8de815b60d..5e8e97f492 100644 --- a/types/set.go +++ b/types/set.go @@ -1,6 +1,9 @@ package types -import "github.com/attic-labs/noms/chunks" +import ( + "github.com/attic-labs/noms/chunks" + "github.com/attic-labs/noms/d" +) type Set interface { Value @@ -16,6 +19,8 @@ type Set interface { IterAll(cb setIterAllCallback) IterAllP(concurrency int, f setIterAllCallback) Filter(cb setFilterCallback) Set + elemType() Type + sequenceCursorAtFirst() *sequenceCursor } type indexOfSetFn func(m setData, v Value) int @@ -42,3 +47,66 @@ func newTypedSet(cs chunks.ChunkStore, t Type, data ...Value) Set { return seq.Done().(Set) } + +func setUnion(set Set, cs chunks.ChunkStore, others []Set) Set { + // TODO: This can be done more efficiently by realizing that if two sets have the same meta tuple we only have to traverse one of the subtrees. Bug 794 + if len(others) == 0 { + return set + } + assertSetsSameType(set, others...) 
+ + tr := set.Type() + seq := newEmptySequenceChunker(makeSetLeafChunkFn(tr, cs), newSetMetaSequenceChunkFn(tr, cs), newSetLeafBoundaryChecker(), newOrderedMetaSequenceBoundaryChecker) + + var lessFunction func(a, b sequenceItem) bool + if isSequenceOrderedByIndexedType(tr) { + lessFunction = func(a, b sequenceItem) bool { + return a.(OrderedValue).Less(b.(OrderedValue)) + } + } else { + lessFunction = func(a, b sequenceItem) bool { + return a.(Value).Ref().Less(b.(Value).Ref()) + } + } + + smallest := func(cursors map[*sequenceCursor]bool) (smallestCursor *sequenceCursor, smallestItem sequenceItem) { + for cursor, _ := range cursors { + currentItem := cursor.current() + if smallestCursor == nil || lessFunction(currentItem, smallestItem) { + smallestCursor = cursor + smallestItem = currentItem + } + } + return + } + + cursors := make(map[*sequenceCursor]bool, len(others)+1) + if !set.Empty() { + cursor := set.sequenceCursorAtFirst() + cursors[cursor] = true + } + for _, s := range others { + if !s.Empty() { + cursor := s.sequenceCursorAtFirst() + cursors[cursor] = true + } + } + + var last Value + for len(cursors) > 0 { + smallestCursor, smallestItem := smallest(cursors) + d.Chk.NotNil(smallestCursor) + + // Don't add same value twice + if last == nil || !last.Equals(smallestItem.(Value)) { + seq.Append(smallestItem) + last = smallestItem.(Value) + } + + if !smallestCursor.advance() { + delete(cursors, smallestCursor) + } + } + + return seq.Done().(Set) +} diff --git a/types/set_leaf.go b/types/set_leaf.go index 1fa1c71838..dca49851ef 100644 --- a/types/set_leaf.go +++ b/types/set_leaf.go @@ -57,15 +57,7 @@ func (s setLeaf) Remove(values ...Value) Set { } func (s setLeaf) Union(others ...Set) Set { - assertSetsSameType(s, others...) 
- var result Set = s - for _, other := range others { - other.Iter(func(v Value) (stop bool) { - result = result.Insert(v) - return - }) - } - return result + return setUnion(s, s.cs, others) } func (s setLeaf) Subtract(others ...Set) Set { @@ -241,3 +233,18 @@ func makeSetLeafChunkFn(t Type, cs chunks.ChunkStore) makeChunkFn { return metaTuple{ref, indexValue}, setLeaf } } + +func (s setLeaf) sequenceCursorAtFirst() *sequenceCursor { + return &sequenceCursor{ + nil, + s.data, + 0, + len(s.data), + func(parent sequenceItem, idx int) sequenceItem { + return s.data[idx] + }, + func(reference sequenceItem) (sequence sequenceItem, length int) { + panic("unreachable") + }, + } +}