diff --git a/go.mod b/go.mod index c12f3587cb..1ad9febfae 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/MicahParks/keyfunc v1.9.0 github.com/Nerzal/gocloak/v13 v13.9.0 github.com/bbalet/stopwords v1.0.0 - github.com/blevesearch/bleve/v2 v2.3.10 + github.com/blevesearch/bleve/v2 v2.4.0 github.com/cenkalti/backoff v2.2.1+incompatible github.com/coreos/go-oidc/v3 v3.10.0 github.com/cs3org/go-cs3apis v0.0.0-20231023073225-7748710e0781 @@ -137,12 +137,13 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/bitly/go-simplejson v0.5.0 // indirect github.com/bits-and-blooms/bitset v1.2.1 // indirect - github.com/blevesearch/bleve_index_api v1.0.6 // indirect - github.com/blevesearch/geo v0.1.18 // indirect + github.com/blevesearch/bleve_index_api v1.1.6 // indirect + github.com/blevesearch/geo v0.1.20 // indirect + github.com/blevesearch/go-faiss v1.0.13 // indirect github.com/blevesearch/go-porterstemmer v1.0.3 // indirect github.com/blevesearch/gtreap v0.1.1 // indirect github.com/blevesearch/mmap-go v1.0.4 // indirect - github.com/blevesearch/scorch_segment_api/v2 v2.1.6 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.2.9 // indirect github.com/blevesearch/segment v0.9.1 // indirect github.com/blevesearch/snowballstem v0.9.0 // indirect github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect @@ -152,6 +153,7 @@ require ( github.com/blevesearch/zapx/v13 v13.3.10 // indirect github.com/blevesearch/zapx/v14 v14.3.10 // indirect github.com/blevesearch/zapx/v15 v15.3.13 // indirect + github.com/blevesearch/zapx/v16 v16.0.12 // indirect github.com/bluele/gcache v0.0.2 // indirect github.com/bmizerany/pat v0.0.0-20210406213842-e4b6760bdd6f // indirect github.com/bombsimon/logrusr/v3 v3.1.0 // indirect diff --git a/go.sum b/go.sum index 161c1e5469..e38514fbb2 100644 --- a/go.sum +++ b/go.sum @@ -895,20 +895,22 @@ github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edY github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY= github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blevesearch/bleve/v2 v2.3.10 h1:z8V0wwGoL4rp7nG/O3qVVLYxUqCbEwskMt4iRJsPLgg= -github.com/blevesearch/bleve/v2 v2.3.10/go.mod h1:RJzeoeHC+vNHsoLR54+crS1HmOWpnH87fL70HAUCzIA= -github.com/blevesearch/bleve_index_api v1.0.6 h1:gyUUxdsrvmW3jVhhYdCVL6h9dCjNT/geNU7PxGn37p8= -github.com/blevesearch/bleve_index_api v1.0.6/go.mod h1:YXMDwaXFFXwncRS8UobWs7nvo0DmusriM1nztTlj1ms= -github.com/blevesearch/geo v0.1.18 h1:Np8jycHTZ5scFe7VEPLrDoHnnb9C4j636ue/CGrhtDw= -github.com/blevesearch/geo v0.1.18/go.mod h1:uRMGWG0HJYfWfFJpK3zTdnnr1K+ksZTuWKhXeSokfnM= +github.com/blevesearch/bleve/v2 v2.4.0 h1:2xyg+Wv60CFHYccXc+moGxbL+8QKT/dZK09AewHgKsg= +github.com/blevesearch/bleve/v2 v2.4.0/go.mod h1:IhQHoFAbHgWKYavb9rQgQEJJVMuY99cKdQ0wPpst2aY= +github.com/blevesearch/bleve_index_api v1.1.6 h1:orkqDFCBuNU2oHW9hN2YEJmet+TE9orml3FCGbl1cKk= +github.com/blevesearch/bleve_index_api v1.1.6/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM= +github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w= +github.com/blevesearch/go-faiss v1.0.13 h1:zfFs7ZYD0NqXVSY37j0JZjZT1BhE9AE4peJfcx/NB4A= +github.com/blevesearch/go-faiss v1.0.13/go.mod 
h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.1.6 h1:CdekX/Ob6YCYmeHzD72cKpwzBjvkOGegHOqhAkXp6yA= -github.com/blevesearch/scorch_segment_api/v2 v2.1.6/go.mod h1:nQQYlp51XvoSVxcciBjtvuHPIVjlWrN1hX4qwK2cqdc= +github.com/blevesearch/scorch_segment_api/v2 v2.2.9 h1:3nBaSBRFokjE4FtPW3eUDgcAu3KphBg1GP07zy/6Uyk= +github.com/blevesearch/scorch_segment_api/v2 v2.2.9/go.mod h1:ckbeb7knyOOvAdZinn/ASbB7EA3HoagnJkmEV3J7+sg= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= @@ -927,6 +929,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ= github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= +github.com/blevesearch/zapx/v16 v16.0.12 h1:Uccxvjmn+hQ6ywQP+wIiTpdq9LnAviGoryJOmGwAo/I= +github.com/blevesearch/zapx/v16 v16.0.12/go.mod h1:MYnOshRfSm4C4drxx1LGRI+MVFByykJ2anDY1fxdk9Q= github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw= github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0= github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY= diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile b/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile new file mode 100644 index 0000000000..7e8953c78f --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile @@ -0,0 +1,107 @@ +.PHONY: help all test format fmtcheck vet lint qa deps clean nuke ser fetch-real-roaring-datasets + + + + + + + + +# Display general help about this command +help: + @echo "" + @echo "The following commands are available:" + @echo "" + @echo " make qa : Run all the tests" + @echo " make test : Run the unit tests" + @echo "" + @echo " make format : Format the source code" + @echo " make fmtcheck : Check if the source code has been formatted" + @echo " make vet : Check for suspicious constructs" + @echo " make lint : Check for style errors" + @echo "" + @echo " make deps : Get the dependencies" + @echo " make clean : Remove any build artifact" + @echo " make nuke : Deletes any intermediate file" + @echo "" + @echo " make fuzz-smat : Fuzzy testing with smat" + @echo " make fuzz-stream : Fuzzy testing with stream deserialization" + @echo " make fuzz-buffer : Fuzzy testing with buffer deserialization" + @echo "" + +# Alias for help target +all: help +test: + go test + go test -race -run TestConcurrent* +# Format the source code +format: + @find ./ -type f -name "*.go" -exec gofmt -w {} \; + +# Check if the source code has 
been formatted
+fmtcheck:
+	@mkdir -p target
+	@find ./ -type f -name "*.go" -exec gofmt -d {} \; | tee target/format.diff
+	@test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }
+
+# Check for syntax errors
+vet:
+	GOPATH=$(GOPATH) go vet ./...
+
+# Check for style errors
+lint:
+	GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint ./...
+
+
+
+
+
+# Alias to run all quality-assurance checks
+qa: fmtcheck test vet lint
+
+# --- INSTALL ---
+
+# Get the dependencies
+deps:
+	GOPATH=$(GOPATH) go get github.com/stretchr/testify
+	GOPATH=$(GOPATH) go get github.com/bits-and-blooms/bitset
+	GOPATH=$(GOPATH) go get github.com/golang/lint/golint
+	GOPATH=$(GOPATH) go get github.com/mschoch/smat
+	GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz
+	GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz-build
+	GOPATH=$(GOPATH) go get github.com/glycerine/go-unsnap-stream
+	GOPATH=$(GOPATH) go get github.com/philhofer/fwd
+	GOPATH=$(GOPATH) go get github.com/jtolds/gls
+
+fuzz-smat:
+	go test -tags=gofuzz -run=TestGenerateSmatCorpus
+	go-fuzz-build -func FuzzSmat github.com/RoaringBitmap/roaring
+	go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
+
+
+fuzz-stream:
+	go-fuzz-build -func FuzzSerializationStream github.com/RoaringBitmap/roaring
+	go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
+
+fuzz-buffer:
+	go-fuzz-build -func FuzzSerializationBuffer github.com/RoaringBitmap/roaring
+	go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
+
+# Remove any build artifact
+clean:
+	GOPATH=$(GOPATH) go clean ./...
+
+# Deletes any intermediate file
+nuke:
+	rm -rf ./target
+	GOPATH=$(GOPATH) go clean -i ./...
+
+
+cover:
+	go test -coverprofile=coverage.out
+	go tool cover -html=coverage.out
+
+fetch-real-roaring-datasets:
+	# pull github.com/RoaringBitmap/real-roaring-datasets -> testdata/real-roaring-datasets
+	git submodule init
+	git submodule update
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go
new file mode 100644
index 0000000000..0e93c03352
--- /dev/null
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go
@@ -0,0 +1,852 @@
+package roaring64
+
+import (
+	"fmt"
+	"math/bits"
+	"runtime"
+	"sync"
+	"sync/atomic"
+)
+
+const (
+	// Min64BitSigned - Minimum 64 bit value
+	Min64BitSigned = -9223372036854775808
+	// Max64BitSigned - Maximum 64 bit value
+	Max64BitSigned = 9223372036854775807
+)
+
+// BSI is, at its simplest, an array of bitmaps that represent an encoded
+// binary value. The advantage of a BSI is that comparisons can be made
+// across ranges of values, whereas a bitmap can only represent the existence
+// of a single value for a given column ID. Another usage scenario involves
+// storage of high cardinality values.
+//
+// It depends upon the bitmap libraries. It is not thread safe, so
+// upstream concurrency guards must be provided.
+type BSI struct {
+	bA           []*Bitmap
+	eBM          *Bitmap // Existence BitMap
+	MaxValue     int64
+	MinValue     int64
+	runOptimized bool
+}
+
+// NewBSI constructs a new BSI. Min/Max values are optional. If set to 0
+// then the underlying BSI will be automatically sized.
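+//
+// A minimal usage sketch (illustrative values, not from the upstream docs):
+//
+//	bsi := NewDefaultBSI()   // auto-sized: bit slices grow as values are set
+//	bsi.SetValue(1, 100)     // column ID 1 -> value 100
+//	bsi.SetValue(2, 250)
+//	v, ok := bsi.GetValue(2) // v == 250, ok == true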
+func NewBSI(maxValue int64, minValue int64) *BSI {
+
+	bitsz := bits.Len64(uint64(minValue))
+	if bits.Len64(uint64(maxValue)) > bitsz {
+		bitsz = bits.Len64(uint64(maxValue))
+	}
+	ba := make([]*Bitmap, bitsz)
+	for i := 0; i < len(ba); i++ {
+		ba[i] = NewBitmap()
+	}
+	return &BSI{bA: ba, eBM: NewBitmap(), MaxValue: maxValue, MinValue: minValue}
+}
+
+// NewDefaultBSI constructs an auto-sized BSI
+func NewDefaultBSI() *BSI {
+	return NewBSI(int64(0), int64(0))
+}
+
+// RunOptimize attempts to further compress the runs of consecutive values found in the bitmap
+func (b *BSI) RunOptimize() {
+	b.eBM.RunOptimize()
+	for i := 0; i < len(b.bA); i++ {
+		b.bA[i].RunOptimize()
+	}
+	b.runOptimized = true
+}
+
+// HasRunCompression returns true if the bitmap benefits from run compression
+func (b *BSI) HasRunCompression() bool {
+	return b.runOptimized
+}
+
+// GetExistenceBitmap returns a pointer to the underlying existence bitmap of the BSI
+func (b *BSI) GetExistenceBitmap() *Bitmap {
+	return b.eBM
+}
+
+// ValueExists tests whether the value exists.
+func (b *BSI) ValueExists(columnID uint64) bool {
+
+	return b.eBM.Contains(uint64(columnID))
+}
+
+// GetCardinality returns a count of unique column IDs for which a value has been set.
+func (b *BSI) GetCardinality() uint64 {
+	return b.eBM.GetCardinality()
+}
+
+// BitCount returns the number of bits needed to represent values.
+func (b *BSI) BitCount() int {
+
+	return len(b.bA)
+}
+
+// SetValue sets a value for a given columnID.
+func (b *BSI) SetValue(columnID uint64, value int64) {
+
+	// If max/min values are set to zero then automatically determine bit array size
+	if b.MaxValue == 0 && b.MinValue == 0 {
+		ba := make([]*Bitmap, bits.Len64(uint64(value)))
+		for i := len(ba) - b.BitCount(); i > 0; i-- {
+			b.bA = append(b.bA, NewBitmap())
+			if b.runOptimized {
+				b.bA[i].RunOptimize()
+			}
+		}
+	}
+
+	var wg sync.WaitGroup
+
+	for i := 0; i < b.BitCount(); i++ {
+		wg.Add(1)
+		go func(j int) {
+			defer wg.Done()
+			if uint64(value)&(1<<uint64(j)) > 0 {
+				b.bA[j].Add(uint64(columnID))
+			} else {
+				b.bA[j].Remove(uint64(columnID))
+			}
+		}(i)
+	}
+	wg.Wait()
+	b.eBM.Add(uint64(columnID))
+}
+
+// GetValue gets the value at the column ID. Second param will be false for non-existent values.
+func (b *BSI) GetValue(columnID uint64) (int64, bool) {
+	value := int64(0)
+	exists := b.eBM.Contains(uint64(columnID))
+	if !exists {
+		return value, exists
+	}
+	for i := 0; i < b.BitCount(); i++ {
+		if b.bA[i].Contains(uint64(columnID)) {
+			value |= (1 << uint64(i))
+		}
+	}
+	return int64(value), exists
+}
+
+type action func(t *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup)
+
+func parallelExecutor(parallelism int, t *task, e action, foundSet *Bitmap) *Bitmap {
+
+	var n int = parallelism
+	if n == 0 {
+		n = runtime.NumCPU()
+	}
+
+	resultsChan := make(chan *Bitmap, n)
+
+	card := foundSet.GetCardinality()
+	x := card / uint64(n)
+
+	remainder := card - (x * uint64(n))
+	var batch []uint64
+	var wg sync.WaitGroup
+	iter := foundSet.ManyIterator()
+	for i := 0; i < n; i++ {
+		if i == n-1 {
+			batch = make([]uint64, x+remainder)
+		} else {
+			batch = make([]uint64, x)
+		}
+		iter.NextMany(batch)
+		wg.Add(1)
+		go e(t, batch, resultsChan, &wg)
+	}
+
+	wg.Wait()
+
+	close(resultsChan)
+
+	ba := make([]*Bitmap, 0)
+	for bm := range resultsChan {
+		ba = append(ba, bm)
+	}
+
+	return ParOr(0, ba...)
+ +} + +type bsiAction func(input *BSI, filterSet *Bitmap, batch []uint64, resultsChan chan *BSI, wg *sync.WaitGroup) + +func parallelExecutorBSIResults(parallelism int, input *BSI, e bsiAction, foundSet, filterSet *Bitmap, sumResults bool) *BSI { + + var n int = parallelism + if n == 0 { + n = runtime.NumCPU() + } + + resultsChan := make(chan *BSI, n) + + card := foundSet.GetCardinality() + x := card / uint64(n) + + remainder := card - (x * uint64(n)) + var batch []uint64 + var wg sync.WaitGroup + iter := foundSet.ManyIterator() + for i := 0; i < n; i++ { + if i == n-1 { + batch = make([]uint64, x+remainder) + } else { + batch = make([]uint64, x) + } + iter.NextMany(batch) + wg.Add(1) + go e(input, filterSet, batch, resultsChan, &wg) + } + + wg.Wait() + + close(resultsChan) + + ba := make([]*BSI, 0) + for bm := range resultsChan { + ba = append(ba, bm) + } + + results := NewDefaultBSI() + if sumResults { + for _, v := range ba { + results.Add(v) + } + } else { + results.ParOr(0, ba...) + } + return results + +} + +// Operation identifier +type Operation int + +const ( + // LT less than + LT Operation = 1 + iota + // LE less than or equal + LE + // EQ equal + EQ + // GE greater than or equal + GE + // GT greater than + GT + // RANGE range + RANGE + // MIN find minimum + MIN + // MAX find maximum + MAX +) + +type task struct { + bsi *BSI + op Operation + valueOrStart int64 + end int64 + values map[int64]struct{} + bits *Bitmap +} + +// CompareValue compares value. +// For all operations with the exception of RANGE, the value to be compared is specified by valueOrStart. +// For the RANGE parameter the comparison criteria is >= valueOrStart and <= end. +// The parallelism parameter indicates the number of CPU threads to be applied for processing. A value +// of zero indicates that all available CPU resources will be potentially utilized. 
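+// For example (illustrative), to select the columns whose value lies in the
+// range [10, 100] using all available CPUs, and then narrow that result to
+// values strictly greater than 42:
+//
+//	found := bsi.CompareValue(0, RANGE, 10, 100, nil) // nil foundSet scans the whole BSI
+//	gt := bsi.CompareValue(0, GT, 42, 0, found)       // end is ignored for non-RANGE ops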
+//
+func (b *BSI) CompareValue(parallelism int, op Operation, valueOrStart, end int64,
+	foundSet *Bitmap) *Bitmap {
+
+	comp := &task{bsi: b, op: op, valueOrStart: valueOrStart, end: end}
+	if foundSet == nil {
+		return parallelExecutor(parallelism, comp, compareValue, b.eBM)
+	}
+	return parallelExecutor(parallelism, comp, compareValue, foundSet)
+}
+
+func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup) {
+
+	defer wg.Done()
+
+	results := NewBitmap()
+	if e.bsi.runOptimized {
+		results.RunOptimize()
+	}
+
+	x := e.bsi.BitCount()
+	startIsNegative := x == 64 && uint64(e.valueOrStart)&(1<<uint64(x-1)) > 0
+	endIsNegative := x == 64 && uint64(e.end)&(1<<uint64(x-1)) > 0
+
+	for i := 0; i < len(batch); i++ {
+		cID := batch[i]
+		eq1, eq2 := true, true
+		lt1, lt2, gt1 := false, false, false
+		j := e.bsi.BitCount() - 1
+		isNegative := false
+		if x == 64 {
+			isNegative = e.bsi.bA[j].Contains(cID)
+			j--
+		}
+		compStartValue := e.valueOrStart
+		compEndValue := e.end
+		if isNegative != startIsNegative {
+			compStartValue = ^e.valueOrStart + 1
+		}
+		if isNegative != endIsNegative {
+			compEndValue = ^e.end + 1
+		}
+		for ; j >= 0; j-- {
+			sliceContainsBit := e.bsi.bA[j].Contains(cID)
+
+			if uint64(compStartValue)&(1<<uint64(j)) > 0 {
+				// BIT in value is SET
+				if !sliceContainsBit {
+					if eq1 {
+						if (e.op == GT || e.op == GE || e.op == RANGE) && startIsNegative && !isNegative {
+							gt1 = true
+						}
+						if e.op == LT || e.op == LE {
+							if !startIsNegative || (startIsNegative == isNegative) {
+								lt1 = true
+							}
+						}
+						eq1 = false
+						break
+					}
+				}
+			} else {
+				// BIT in value is CLEAR
+				if sliceContainsBit {
+					if eq1 {
+						if (e.op == LT || e.op == LE) && isNegative && !startIsNegative {
+							lt1 = true
+						}
+						if e.op == GT || e.op == GE || e.op == RANGE {
+							if startIsNegative || (startIsNegative == isNegative) {
+								gt1 = true
+							}
+						}
+						eq1 = false
+						if e.op != RANGE {
+							break
+						}
+					}
+				}
+			}
+
+			if e.op == RANGE && uint64(compEndValue)&(1<<uint64(j)) > 0 {
+				// BIT in value is SET
+				if !sliceContainsBit {
+					if eq2 {
+						if !endIsNegative || (endIsNegative == isNegative) {
+							lt2 = true
+						}
+						eq2 = false
+						if startIsNegative && !endIsNegative {
+							break
+						}
+					}
+				}
+			} else if e.op == RANGE {
+				// BIT in value is CLEAR
+				if sliceContainsBit {
+					if eq2 {
+						if isNegative && !endIsNegative {
+							lt2 = true
+						}
+						eq2 = false
+						break
+					}
+				}
+			}
+
+		}
+
+		switch e.op {
+		case LT:
+			if lt1 {
+				results.Add(cID)
+			}
+		case LE:
+			if lt1 || (eq1 && (!startIsNegative || (startIsNegative && isNegative))) {
+				results.Add(cID)
+			}
+		case EQ:
+			if eq1 {
+				results.Add(cID)
+			}
+		case GE:
+			if gt1 || (eq1 && (startIsNegative || (!startIsNegative && !isNegative))) {
+				results.Add(cID)
+			}
+		case GT:
+			if gt1 {
+				results.Add(cID)
+			}
+		case RANGE:
+			if (eq1 || gt1) && (eq2 || lt2) {
+				results.Add(cID)
+			}
+		default:
+			panic(fmt.Sprintf("Operation [%v] not supported here", e.op))
+		}
+	}
+
+	resultsChan <- results
+}
+
+// MinMax - Find minimum or maximum value.
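+// For example (illustrative), the maximum over all columns can be found by
+// passing the existence bitmap as the found set:
+//
+//	max := bsi.MinMax(0, MAX, bsi.GetExistenceBitmap())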
+func (b *BSI) MinMax(parallelism int, op Operation, foundSet *Bitmap) int64 {
+
+	var n int = parallelism
+	if n == 0 {
+		n = runtime.NumCPU()
+	}
+
+	resultsChan := make(chan int64, n)
+
+	card := foundSet.GetCardinality()
+	x := card / uint64(n)
+
+	remainder := card - (x * uint64(n))
+	var batch []uint64
+	var wg sync.WaitGroup
+	iter := foundSet.ManyIterator()
+	for i := 0; i < n; i++ {
+		if i == n-1 {
+			batch = make([]uint64, x+remainder)
+		} else {
+			batch = make([]uint64, x)
+		}
+		iter.NextMany(batch)
+		wg.Add(1)
+		go b.minOrMax(op, batch, resultsChan, &wg)
+	}
+
+	wg.Wait()
+
+	close(resultsChan)
+	var minMax int64
+	if op == MAX {
+		minMax = Min64BitSigned
+	} else {
+		minMax = Max64BitSigned
+	}
+
+	for val := range resultsChan {
+		if (op == MAX && val > minMax) || (op == MIN && val <= minMax) {
+			minMax = val
+		}
+	}
+	return minMax
+}
+
+func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg *sync.WaitGroup) {
+
+	defer wg.Done()
+
+	x := b.BitCount()
+	var value int64 = Max64BitSigned
+	if op == MAX {
+		value = Min64BitSigned
+	}
+
+	for i := 0; i < len(batch); i++ {
+		cID := batch[i]
+		eq := true
+		lt, gt := false, false
+		j := b.BitCount() - 1
+		var cVal int64
+		valueIsNegative := uint64(value)&(1<<uint64(j)) > 0 && bits.Len64(uint64(value)) == 64
+		isNegative := false
+		if x == 64 {
+			isNegative = b.bA[j].Contains(cID)
+			if isNegative {
+				cVal |= 1 << uint64(j)
+			}
+			j--
+		}
+		compValue := value
+		if isNegative != valueIsNegative {
+			compValue = ^value + 1
+		}
+		for ; j >= 0; j-- {
+			sliceContainsBit := b.bA[j].Contains(cID)
+			if sliceContainsBit {
+				cVal |= 1 << uint64(j)
+			}
+			if uint64(compValue)&(1<<uint64(j)) > 0 {
+				// BIT in value is SET
+				if !sliceContainsBit {
+					if eq {
+						eq = false
+						if op == MAX && valueIsNegative && !isNegative {
+							gt = true
+							break
+						}
+						if op == MIN && (!valueIsNegative || (valueIsNegative == isNegative)) {
+							lt = true
+						}
+					}
+				}
+			} else {
+				// BIT in value is CLEAR
+				if sliceContainsBit {
+					if eq {
+						eq = false
+						if op == MIN && isNegative && !valueIsNegative {
+							lt = true
+						}
+						if op == MAX && (valueIsNegative || (valueIsNegative == isNegative)) {
+							gt = true
+						}
+					}
+				}
+			}
+		}
+		if lt || gt {
+			value = cVal
+		}
+	}
+
+	resultsChan <- value
+}
+
+// Sum all values contained within the foundSet. As a convenience, the cardinality of the foundSet
+// is also returned (for calculating the average).
+//
+func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
+
+	count = foundSet.GetCardinality()
+	var wg sync.WaitGroup
+	for i := 0; i < b.BitCount(); i++ {
+		wg.Add(1)
+		go func(j int) {
+			defer wg.Done()
+			atomic.AddInt64(&sum, int64(foundSet.AndCardinality(b.bA[j])<<uint(j)))
+		}(i)
+	}
+	wg.Wait()
+	return
+}
+
+// Transpose calls b.IntersectAndTranspose(0, b.eBM)
+func (b *BSI) Transpose() *Bitmap {
+	return b.IntersectAndTranspose(0, b.eBM)
+}
+
+// IntersectAndTranspose is a matrix transpose function. Return a bitmap such that the values are represented as column IDs
+// in the returned bitmap. This is accomplished by iterating over the foundSet and only including the column IDs in the
+// source (foundSet) as compared with this BSI. This can be useful for vectoring one set of integers to another.
+func (b *BSI) IntersectAndTranspose(parallelism int, foundSet *Bitmap) *Bitmap {
+
+	trans := &task{bsi: b}
+	return parallelExecutor(parallelism, trans, transpose, foundSet)
+}
+
+func transpose(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup) {
+
+	defer wg.Done()
+
+	results := NewBitmap()
+	if e.bsi.runOptimized {
+		results.RunOptimize()
+	}
+	for _, cID := range batch {
+		if value, ok := e.bsi.GetValue(uint64(cID)); ok {
+			results.Add(uint64(value))
+		}
+	}
+	resultsChan <- results
+}
+
+// ParOr is intended to be a concurrent implementation of the union of multiple BSIs.
+func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
+
+	// Consolidate sets
+	bits := b.BitCount()
+	for i := 0; i < len(bsis); i++ {
+		if len(bsis[i].bA) > bits {
+			bits = bsis[i].BitCount()
+		}
+	}
+
+	// Make sure we have enough bit slices
+	for bits > b.BitCount() {
+		newBm := NewBitmap()
+		if b.runOptimized {
+			newBm.RunOptimize()
+		}
+		b.bA = append(b.bA, newBm)
+	}
+
+	a := make([][]*Bitmap, bits)
+	for i := range a {
+		a[i] = make([]*Bitmap, 0)
+		for _, x := range bsis {
+			if len(x.bA) > i {
+				a[i] = append(a[i], x.bA[i])
+			} else {
+				a[i] = []*Bitmap{NewBitmap()}
+				if b.runOptimized {
+					a[i][0].RunOptimize()
+				}
+			}
+		}
+	}
+
+	// Consolidate existence bit maps
+	ebms := make([]*Bitmap, len(bsis))
+	for i := range ebms {
+		ebms[i] = bsis[i].eBM
+	}
+
+	// First merge all the bit slices from all bsi maps that exist in target
+	var wg sync.WaitGroup
+	for i := 0; i < bits; i++ {
+		wg.Add(1)
+		go func(j int) {
+			defer wg.Done()
+			x := []*Bitmap{b.bA[j]}
+			x = append(x, a[j]...)
+			b.bA[j] = ParOr(parallelism, x...)
+ }(i) + } + wg.Wait() + + // merge all the EBM maps + x := []*Bitmap{b.eBM} + x = append(x, ebms...) + b.eBM = ParOr(parallelism, x...) +} + +// UnmarshalBinary de-serialize a BSI. The value at bitData[0] is the EBM. Other indices are in least to most +// significance order starting at bitData[1] (bit position 0). +func (b *BSI) UnmarshalBinary(bitData [][]byte) error { + + for i := 1; i < len(bitData); i++ { + if bitData == nil || len(bitData[i]) == 0 { + continue + } + if b.BitCount() < i { + newBm := NewBitmap() + if b.runOptimized { + newBm.RunOptimize() + } + b.bA = append(b.bA, newBm) + } + if err := b.bA[i-1].UnmarshalBinary(bitData[i]); err != nil { + return err + } + if b.runOptimized { + b.bA[i-1].RunOptimize() + } + + } + // First element of bitData is the EBM + if bitData[0] == nil { + b.eBM = NewBitmap() + if b.runOptimized { + b.eBM.RunOptimize() + } + return nil + } + if err := b.eBM.UnmarshalBinary(bitData[0]); err != nil { + return err + } + if b.runOptimized { + b.eBM.RunOptimize() + } + return nil +} + +// MarshalBinary serializes a BSI +func (b *BSI) MarshalBinary() ([][]byte, error) { + + var err error + data := make([][]byte, b.BitCount()+1) + // Add extra element for EBM (BitCount() + 1) + for i := 1; i < b.BitCount()+1; i++ { + data[i], err = b.bA[i-1].MarshalBinary() + if err != nil { + return nil, err + } + } + // Marshal EBM + data[0], err = b.eBM.MarshalBinary() + if err != nil { + return nil, err + } + return data, nil +} + +// BatchEqual returns a bitmap containing the column IDs where the values are contained within the list of values provided. +func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap { + + valMap := make(map[int64]struct{}, len(values)) + for i := 0; i < len(values); i++ { + valMap[values[i]] = struct{}{} + } + comp := &task{bsi: b, values: valMap} + return parallelExecutor(parallelism, comp, batchEqual, b.eBM) +} + +func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap, + wg *sync.WaitGroup) { + + defer wg.Done() + + results := NewBitmap() + if e.bsi.runOptimized { + results.RunOptimize() + } + + for i := 0; i < len(batch); i++ { + cID := batch[i] + if value, ok := e.bsi.GetValue(uint64(cID)); ok { + if _, yes := e.values[int64(value)]; yes { + results.Add(cID) + } + } + } + resultsChan <- results +} + +// ClearBits cleared the bits that exist in the target if they are also in the found set. +func ClearBits(foundSet, target *Bitmap) { + iter := foundSet.Iterator() + for iter.HasNext() { + cID := iter.Next() + target.Remove(cID) + } +} + +// ClearValues removes the values found in foundSet +func (b *BSI) ClearValues(foundSet *Bitmap) { + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + ClearBits(foundSet, b.eBM) + }() + for i := 0; i < b.BitCount(); i++ { + wg.Add(1) + go func(j int) { + defer wg.Done() + ClearBits(foundSet, b.bA[j]) + }(i) + } + wg.Wait() +} + +// NewBSIRetainSet - Construct a new BSI from a clone of existing BSI, retain only values contained in foundSet +func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI { + + newBSI := NewBSI(b.MaxValue, b.MinValue) + newBSI.bA = make([]*Bitmap, b.BitCount()) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + newBSI.eBM = b.eBM.Clone() + newBSI.eBM.And(foundSet) + }() + for i := 0; i < b.BitCount(); i++ { + wg.Add(1) + go func(j int) { + defer wg.Done() + newBSI.bA[j] = b.bA[j].Clone() + newBSI.bA[j].And(foundSet) + }(i) + } + wg.Wait() + return newBSI +} + +// Clone performs a deep copy of BSI contents. 
+func (b *BSI) Clone() *BSI { + return b.NewBSIRetainSet(b.eBM) +} + +// Add - In-place sum the contents of another BSI with this BSI, column wise. +func (b *BSI) Add(other *BSI) { + + b.eBM.Or(other.eBM) + for i := 0; i < len(other.bA); i++ { + b.addDigit(other.bA[i], i) + } +} + +func (b *BSI) addDigit(foundSet *Bitmap, i int) { + + if i >= len(b.bA) { + b.bA = append(b.bA, NewBitmap()) + } + carry := And(b.bA[i], foundSet) + b.bA[i].Xor(foundSet) + if !carry.IsEmpty() { + if i+1 >= len(b.bA) { + b.bA = append(b.bA, NewBitmap()) + } + b.addDigit(carry, i+1) + } +} + +// TransposeWithCounts is a matrix transpose function that returns a BSI that has a columnID system defined by the values +// contained within the input BSI. Given that for BSIs, different columnIDs can have the same value. TransposeWithCounts +// is useful for situations where there is a one-to-many relationship between the vectored integer sets. The resulting BSI +// contains the number of times a particular value appeared in the input BSI. +// +func (b *BSI) TransposeWithCounts(parallelism int, foundSet, filterSet *Bitmap) *BSI { + + return parallelExecutorBSIResults(parallelism, b, transposeWithCounts, foundSet, filterSet, true) +} + +func transposeWithCounts(input *BSI, filterSet *Bitmap, batch []uint64, resultsChan chan *BSI, wg *sync.WaitGroup) { + + defer wg.Done() + + results := NewDefaultBSI() + if input.runOptimized { + results.RunOptimize() + } + for _, cID := range batch { + if value, ok := input.GetValue(uint64(cID)); ok { + if !filterSet.Contains(uint64(value)) { + continue + } + if val, ok2 := results.GetValue(uint64(value)); !ok2 { + results.SetValue(uint64(value), 1) + } else { + val++ + results.SetValue(uint64(value), val) + } + } + } + resultsChan <- results +} + +// Increment - In-place increment of values in a BSI. Found set select columns for incrementing. +func (b *BSI) Increment(foundSet *Bitmap) { + b.addDigit(foundSet, 0) +} + +// IncrementAll - In-place increment of all values in a BSI. +func (b *BSI) IncrementAll() { + b.Increment(b.GetExistenceBitmap()) +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/fastaggregation64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/fastaggregation64.go new file mode 100644 index 0000000000..f23d25a217 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/fastaggregation64.go @@ -0,0 +1,31 @@ +package roaring64 + +// FastAnd computes the intersection between many bitmaps quickly +// Compared to the And function, it can take many bitmaps as input, thus saving the trouble +// of manually calling "And" many times. +func FastAnd(bitmaps ...*Bitmap) *Bitmap { + if len(bitmaps) == 0 { + return NewBitmap() + } else if len(bitmaps) == 1 { + return bitmaps[0].Clone() + } + answer := And(bitmaps[0], bitmaps[1]) + for _, bm := range bitmaps[2:] { + answer.And(bm) + } + return answer +} + +// FastOr computes the union between many bitmaps quickly, as opposed to having to call Or repeatedly. 
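+//
+// For example (illustrative), the result is equivalent to folding Or over the
+// inputs without mutating any of them:
+//
+//	a, b, c := New(), New(), New()
+//	a.Add(1)
+//	b.Add(2)
+//	c.Add(3)
+//	u := FastOr(a, b, c) // u contains {1, 2, 3}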
+func FastOr(bitmaps ...*Bitmap) *Bitmap { + if len(bitmaps) == 0 { + return NewBitmap() + } else if len(bitmaps) == 1 { + return bitmaps[0].Clone() + } + answer := Or(bitmaps[0], bitmaps[1]) + for _, bm := range bitmaps[2:] { + answer.Or(bm) + } + return answer +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go new file mode 100644 index 0000000000..73e4f1856a --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go @@ -0,0 +1,169 @@ +package roaring64 + +import ( + "github.com/RoaringBitmap/roaring" +) + +// IntIterable64 allows you to iterate over the values in a Bitmap +type IntIterable64 interface { + HasNext() bool + Next() uint64 +} + +// IntPeekable64 allows you to look at the next value without advancing and +// advance as long as the next value is smaller than minval +type IntPeekable64 interface { + IntIterable64 + // PeekNext peeks the next value without advancing the iterator + PeekNext() uint64 + // AdvanceIfNeeded advances as long as the next value is smaller than minval + AdvanceIfNeeded(minval uint64) +} + +type intIterator struct { + pos int + hs uint64 + iter roaring.IntPeekable + highlowcontainer *roaringArray64 +} + +// HasNext returns true if there are more integers to iterate over +func (ii *intIterator) HasNext() bool { + return ii.pos < ii.highlowcontainer.size() +} + +func (ii *intIterator) init() { + if ii.highlowcontainer.size() > ii.pos { + ii.iter = ii.highlowcontainer.getContainerAtIndex(ii.pos).Iterator() + ii.hs = uint64(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 32 + } +} + +// Next returns the next integer +func (ii *intIterator) Next() uint64 { + lowbits := ii.iter.Next() + x := uint64(lowbits) | ii.hs + if !ii.iter.HasNext() { + ii.pos = ii.pos + 1 + ii.init() + } + return x +} + +// PeekNext peeks the next value without advancing the iterator +func (ii *intIterator) PeekNext() uint64 { + return uint64(ii.iter.PeekNext()&maxLowBit) | ii.hs +} + +// AdvanceIfNeeded advances as long as the next value is smaller than minval +func (ii *intIterator) AdvanceIfNeeded(minval uint64) { + to := minval >> 32 + + for ii.HasNext() && (ii.hs>>32) < to { + ii.pos++ + ii.init() + } + + if ii.HasNext() && (ii.hs>>32) == to { + ii.iter.AdvanceIfNeeded(lowbits(minval)) + + if !ii.iter.HasNext() { + ii.pos++ + ii.init() + } + } +} + +func newIntIterator(a *Bitmap) *intIterator { + p := new(intIterator) + p.pos = 0 + p.highlowcontainer = &a.highlowcontainer + p.init() + return p +} + +type intReverseIterator struct { + pos int + hs uint64 + iter roaring.IntIterable + highlowcontainer *roaringArray64 +} + +// HasNext returns true if there are more integers to iterate over +func (ii *intReverseIterator) HasNext() bool { + return ii.pos >= 0 +} + +func (ii *intReverseIterator) init() { + if ii.pos >= 0 { + ii.iter = ii.highlowcontainer.getContainerAtIndex(ii.pos).ReverseIterator() + ii.hs = uint64(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 32 + } else { + ii.iter = nil + } +} + +// Next returns the next integer +func (ii *intReverseIterator) Next() uint64 { + x := uint64(ii.iter.Next()) | ii.hs + if !ii.iter.HasNext() { + ii.pos = ii.pos - 1 + ii.init() + } + return x +} + +func newIntReverseIterator(a *Bitmap) *intReverseIterator { + p := new(intReverseIterator) + p.highlowcontainer = &a.highlowcontainer + p.pos = a.highlowcontainer.size() - 1 + p.init() + return p +} + +// ManyIntIterable64 allows you to iterate over the values in a Bitmap +type 
ManyIntIterable64 interface { + // pass in a buffer to fill up with values, returns how many values were returned + NextMany([]uint64) int +} + +type manyIntIterator struct { + pos int + hs uint64 + iter roaring.ManyIntIterable + highlowcontainer *roaringArray64 +} + +func (ii *manyIntIterator) init() { + if ii.highlowcontainer.size() > ii.pos { + ii.iter = ii.highlowcontainer.getContainerAtIndex(ii.pos).ManyIterator() + ii.hs = uint64(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 32 + } else { + ii.iter = nil + } +} + +func (ii *manyIntIterator) NextMany(buf []uint64) int { + n := 0 + for n < len(buf) { + if ii.iter == nil { + break + } + moreN := ii.iter.NextMany64(ii.hs, buf[n:]) + n += moreN + if moreN == 0 { + ii.pos = ii.pos + 1 + ii.init() + } + } + + return n +} + +func newManyIntIterator(a *Bitmap) *manyIntIterator { + p := new(manyIntIterator) + p.pos = 0 + p.highlowcontainer = &a.highlowcontainer + p.init() + return p +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go new file mode 100644 index 0000000000..6fe1803b20 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go @@ -0,0 +1,292 @@ +package roaring64 + +import ( + "fmt" + "runtime" + + "github.com/RoaringBitmap/roaring" +) + +var defaultWorkerCount = runtime.NumCPU() + +// ParOr computes the union (OR) of all provided bitmaps in parallel, +// where the parameter "parallelism" determines how many workers are to be used +// (if it is set to 0, a default number of workers is chosen) +func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { + var lKey uint32 = maxUint32 + var hKey uint32 + + bitmapsFiltered := bitmaps[:0] + for _, b := range bitmaps { + if !b.IsEmpty() { + bitmapsFiltered = append(bitmapsFiltered, b) + } + } + bitmaps = bitmapsFiltered + + for _, b := range bitmaps { + lKey = minOfUint32(lKey, b.highlowcontainer.keys[0]) + hKey = maxOfUint32(hKey, b.highlowcontainer.keys[b.highlowcontainer.size()-1]) + } + + if lKey == maxUint32 && hKey == 0 { + return New() + } else if len(bitmaps) == 1 { + return bitmaps[0] + } + // The following might overflow and we do not want that! + // as it might lead to a channel of size 0 later which, + // on some systems, would block indefinitely. + keyRange := uint64(hKey) - uint64(lKey) + 1 + if keyRange == 1 { + // revert to FastOr. Since the key range is 0 + // no container-level aggregation parallelism is achievable + return FastOr(bitmaps...) + } + + if parallelism == 0 { + parallelism = defaultWorkerCount + } + // We cannot use int since int is 32-bit on 32-bit systems. 
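+	// For example (illustrative): with keyRange = 1000 and parallelism = 8,
+	// parallelism*4 = 32 does not exceed keyRange, so chunkCount = 32 and
+	// chunkSize = (1000 + 32 - 1) / 32 = 32, which covers all 1000 keys.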
+ var chunkSize int64 + var chunkCount int64 + if int64(parallelism)*4 > int64(keyRange) { + chunkSize = 1 + chunkCount = int64(keyRange) + } else { + chunkCount = int64(parallelism) * 4 + chunkSize = (int64(keyRange) + chunkCount - 1) / chunkCount + } + + if chunkCount*chunkSize < int64(keyRange) { + // it's fine to panic to indicate an implementation error + panic(fmt.Sprintf("invariant check failed: chunkCount * chunkSize < keyRange, %d * %d < %d", chunkCount, chunkSize, keyRange)) + } + + chunks := make([]*roaringArray64, chunkCount) + + chunkSpecChan := make(chan parChunkSpec, minOfInt(maxOfInt(64, 2*parallelism), int(chunkCount))) + chunkChan := make(chan parChunk, minOfInt(32, int(chunkCount))) + + orFunc := func() { + for spec := range chunkSpecChan { + ra := orOnRange(&bitmaps[0].highlowcontainer, &bitmaps[1].highlowcontainer, spec.start, spec.end) + for _, b := range bitmaps[2:] { + ra = iorOnRange(ra, &b.highlowcontainer, spec.start, spec.end) + } + + chunkChan <- parChunk{ra, spec.idx} + } + } + + for i := 0; i < parallelism; i++ { + go orFunc() + } + + go func() { + for i := int64(0); i < chunkCount; i++ { + spec := parChunkSpec{ + start: uint32(int64(lKey) + i*chunkSize), + end: uint32(minOfInt64(int64(lKey)+(i+1)*chunkSize-1, int64(hKey))), + idx: int(i), + } + chunkSpecChan <- spec + } + }() + + chunksRemaining := chunkCount + for chunk := range chunkChan { + chunks[chunk.idx] = chunk.ra + chunksRemaining-- + if chunksRemaining == 0 { + break + } + } + close(chunkChan) + close(chunkSpecChan) + + containerCount := 0 + for _, chunk := range chunks { + containerCount += chunk.size() + } + + result := Bitmap{ + roaringArray64{ + containers: make([]*roaring.Bitmap, containerCount), + keys: make([]uint32, containerCount), + needCopyOnWrite: make([]bool, containerCount), + }, + } + + resultOffset := 0 + for _, chunk := range chunks { + copy(result.highlowcontainer.containers[resultOffset:], chunk.containers) + copy(result.highlowcontainer.keys[resultOffset:], chunk.keys) + copy(result.highlowcontainer.needCopyOnWrite[resultOffset:], chunk.needCopyOnWrite) + resultOffset += chunk.size() + } + + return &result +} + +type parChunkSpec struct { + start uint32 + end uint32 + idx int +} + +type parChunk struct { + ra *roaringArray64 + idx int +} + +func (c parChunk) size() int { + return c.ra.size() +} + +func parNaiveStartAt(ra *roaringArray64, start uint32, last uint32) int { + for idx, key := range ra.keys { + if key >= start && key <= last { + return idx + } else if key > last { + break + } + } + return ra.size() +} + +func orOnRange(ra1, ra2 *roaringArray64, start, last uint32) *roaringArray64 { + answer := &roaringArray64{} + length1 := ra1.size() + length2 := ra2.size() + + idx1 := parNaiveStartAt(ra1, start, last) + idx2 := parNaiveStartAt(ra2, start, last) + + var key1 uint32 + var key2 uint32 + if idx1 < length1 && idx2 < length2 { + key1 = ra1.getKeyAtIndex(idx1) + key2 = ra2.getKeyAtIndex(idx2) + + for key1 <= last && key2 <= last { + + if key1 < key2 { + answer.appendCopy(*ra1, idx1) + idx1++ + if idx1 == length1 { + break + } + key1 = ra1.getKeyAtIndex(idx1) + } else if key1 > key2 { + answer.appendCopy(*ra2, idx2) + idx2++ + if idx2 == length2 { + break + } + key2 = ra2.getKeyAtIndex(idx2) + } else { + c1 := ra1.getContainerAtIndex(idx1) + + //answer.appendContainer(key1, c1.lazyOR(ra2.getContainerAtIndex(idx2)), false) + answer.appendContainer(key1, roaring.Or(c1, ra2.getContainerAtIndex(idx2)), false) + idx1++ + idx2++ + if idx1 == length1 || idx2 == length2 { + break + 
} + + key1 = ra1.getKeyAtIndex(idx1) + key2 = ra2.getKeyAtIndex(idx2) + } + } + } + + if idx2 < length2 { + key2 = ra2.getKeyAtIndex(idx2) + for key2 <= last { + answer.appendCopy(*ra2, idx2) + idx2++ + if idx2 == length2 { + break + } + key2 = ra2.getKeyAtIndex(idx2) + } + } + + if idx1 < length1 { + key1 = ra1.getKeyAtIndex(idx1) + for key1 <= last { + answer.appendCopy(*ra1, idx1) + idx1++ + if idx1 == length1 { + break + } + key1 = ra1.getKeyAtIndex(idx1) + } + } + return answer +} + +func iorOnRange(ra1, ra2 *roaringArray64, start, last uint32) *roaringArray64 { + length1 := ra1.size() + length2 := ra2.size() + + idx1 := 0 + idx2 := parNaiveStartAt(ra2, start, last) + + var key1 uint32 + var key2 uint32 + if idx1 < length1 && idx2 < length2 { + key1 = ra1.getKeyAtIndex(idx1) + key2 = ra2.getKeyAtIndex(idx2) + + for key1 <= last && key2 <= last { + if key1 < key2 { + idx1++ + if idx1 >= length1 { + break + } + key1 = ra1.getKeyAtIndex(idx1) + } else if key1 > key2 { + ra1.insertNewKeyValueAt(idx1, key2, ra2.getContainerAtIndex(idx2)) + ra1.needCopyOnWrite[idx1] = true + idx2++ + idx1++ + length1++ + if idx2 >= length2 { + break + } + key2 = ra2.getKeyAtIndex(idx2) + } else { + c1 := ra1.getWritableContainerAtIndex(idx1) + + //ra1.containers[idx1] = c1.lazyIOR(ra2.getContainerAtIndex(idx2)) + c1.Or(ra2.getContainerAtIndex(idx2)) + ra1.setContainerAtIndex(idx1, c1) + + ra1.needCopyOnWrite[idx1] = false + idx1++ + idx2++ + if idx1 >= length1 || idx2 >= length2 { + break + } + + key1 = ra1.getKeyAtIndex(idx1) + key2 = ra2.getKeyAtIndex(idx2) + } + } + } + if idx2 < length2 { + key2 = ra2.getKeyAtIndex(idx2) + for key2 <= last { + ra1.appendCopy(*ra2, idx2) + idx2++ + if idx2 >= length2 { + break + } + key2 = ra2.getKeyAtIndex(idx2) + } + } + return ra1 +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go new file mode 100644 index 0000000000..688f84d820 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go @@ -0,0 +1,1230 @@ +package roaring64 + +import ( + "bytes" + "encoding/base64" + "encoding/binary" + "fmt" + "io" + "strconv" + + "github.com/RoaringBitmap/roaring" +) + +const serialCookieNoRunContainer = 12346 // only arrays and bitmaps +const serialCookie = 12347 // runs, arrays, and bitmaps + +// Bitmap represents a compressed bitmap where you can add integers. +type Bitmap struct { + highlowcontainer roaringArray64 +} + +// ToBase64 serializes a bitmap as Base64 +func (rb *Bitmap) ToBase64() (string, error) { + buf := new(bytes.Buffer) + _, err := rb.WriteTo(buf) + return base64.StdEncoding.EncodeToString(buf.Bytes()), err + +} + +// FromBase64 deserializes a bitmap from Base64 +func (rb *Bitmap) FromBase64(str string) (int64, error) { + data, err := base64.StdEncoding.DecodeString(str) + if err != nil { + return 0, err + } + buf := bytes.NewBuffer(data) + + return rb.ReadFrom(buf) +} + +// ToBytes returns an array of bytes corresponding to what is written +// when calling WriteTo +func (rb *Bitmap) ToBytes() ([]byte, error) { + var buf bytes.Buffer + _, err := rb.WriteTo(&buf) + return buf.Bytes(), err +} + +// WriteTo writes a serialized version of this bitmap to stream. 
+// The format is compatible with other 64-bit RoaringBitmap +// implementations (Java, Go, C++) and it has a specification : +// https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations +func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { + + var n int64 + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, uint64(rb.highlowcontainer.size())) + written, err := stream.Write(buf) + if err != nil { + return n, err + } + n += int64(written) + pos := 0 + keyBuf := make([]byte, 4) + for pos < rb.highlowcontainer.size() { + c := rb.highlowcontainer.getContainerAtIndex(pos) + binary.LittleEndian.PutUint32(keyBuf, rb.highlowcontainer.getKeyAtIndex(pos)) + pos++ + written, err = stream.Write(keyBuf) + n += int64(written) + if err != nil { + return n, err + } + written, err := c.WriteTo(stream) + n += int64(written) + if err != nil { + return n, err + } + } + return n, nil +} + +// ReadFrom reads a serialized version of this bitmap from stream. +// The format is compatible with other 64-bit RoaringBitmap +// implementations (Java, Go, C++) and it has a specification : +// https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations +func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) { + cookie, r32, p, err := tryReadFromRoaring32(rb, stream) + if err != nil { + return p, err + } else if r32 { + return p, nil + } + // TODO: Add buffer interning as in base roaring package. + + sizeBuf := make([]byte, 4) + var n int + n, err = stream.Read(sizeBuf) + if n == 0 || err != nil { + return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read number of containers: %s", err) + } + p += int64(n) + sizeBuf = append(cookie, sizeBuf...) + + size := binary.LittleEndian.Uint64(sizeBuf) + rb.highlowcontainer = roaringArray64{} + rb.highlowcontainer.keys = make([]uint32, size) + rb.highlowcontainer.containers = make([]*roaring.Bitmap, size) + rb.highlowcontainer.needCopyOnWrite = make([]bool, size) + keyBuf := make([]byte, 4) + for i := uint64(0); i < size; i++ { + n, err = stream.Read(keyBuf) + if n == 0 || err != nil { + return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read key #%d: %s", i, err) + } + p += int64(n) + rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf) + rb.highlowcontainer.containers[i] = roaring.NewBitmap() + n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) + if n == 0 || err != nil { + return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) + } + p += int64(n) + } + + return p, nil +} + +func tryReadFromRoaring32(rb *Bitmap, stream io.Reader) (cookie []byte, r32 bool, p int64, err error) { + // Verify the first two bytes are a valid MagicNumber. + cookie = make([]byte, 4) + size, err := stream.Read(cookie) + if err != nil { + return cookie, false, int64(size), err + } + fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2])) + if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie { + bm32 := roaring.NewBitmap() + p, err = bm32.ReadFrom(stream, cookie...) + if err != nil { + return + } + rb.highlowcontainer = roaringArray64{ + keys: []uint32{0}, + containers: []*roaring.Bitmap{bm32}, + needCopyOnWrite: []bool{false}, + } + return cookie, true, p, nil + } + return +} + +// FromBuffer creates a bitmap from its serialized version stored in buffer +// func (rb *Bitmap) FromBuffer(data []byte) (p int64, err error) { +// +// // TODO: Add buffer interning as in base roaring package. 
+// buf := bytes.NewBuffer(data) +// return rb.ReadFrom(buf) +// } + +// MarshalBinary implements the encoding.BinaryMarshaler interface for the bitmap +// (same as ToBytes) +func (rb *Bitmap) MarshalBinary() ([]byte, error) { + return rb.ToBytes() +} + +// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface for the bitmap +func (rb *Bitmap) UnmarshalBinary(data []byte) error { + r := bytes.NewReader(data) + _, err := rb.ReadFrom(r) + return err +} + +// RunOptimize attempts to further compress the runs of consecutive values found in the bitmap +func (rb *Bitmap) RunOptimize() { + rb.highlowcontainer.runOptimize() +} + +// HasRunCompression returns true if the bitmap benefits from run compression +func (rb *Bitmap) HasRunCompression() bool { + return rb.highlowcontainer.hasRunCompression() +} + +// NewBitmap creates a new empty Bitmap (see also New) +func NewBitmap() *Bitmap { + return &Bitmap{} +} + +// New creates a new empty Bitmap (same as NewBitmap) +func New() *Bitmap { + return &Bitmap{} +} + +// Clear resets the Bitmap to be logically empty, but may retain +// some memory allocations that may speed up future operations +func (rb *Bitmap) Clear() { + rb.highlowcontainer.clear() +} + +// ToArray creates a new slice containing all of the integers stored in the Bitmap in sorted order +func (rb *Bitmap) ToArray() []uint64 { + array := make([]uint64, rb.GetCardinality()) + pos := 0 + pos2 := uint64(0) + + for pos < rb.highlowcontainer.size() { + hs := uint64(rb.highlowcontainer.getKeyAtIndex(pos)) << 32 + c := rb.highlowcontainer.getContainerAtIndex(pos) + pos++ + c.ManyIterator().NextMany64(hs, array[pos2:]) + pos2 += c.GetCardinality() + } + return array +} + +// GetSizeInBytes estimates the memory usage of the Bitmap. Note that this +// might differ slightly from the amount of bytes required for persistent storage +func (rb *Bitmap) GetSizeInBytes() uint64 { + size := uint64(8) + for _, c := range rb.highlowcontainer.containers { + size += uint64(4) + c.GetSizeInBytes() + } + return size +} + +// String creates a string representation of the Bitmap +func (rb *Bitmap) String() string { + // inspired by https://github.com/fzandona/goroar/ + var buffer bytes.Buffer + start := []byte("{") + buffer.Write(start) + i := rb.Iterator() + counter := 0 + if i.HasNext() { + counter = counter + 1 + buffer.WriteString(strconv.FormatUint(uint64(i.Next()), 10)) + } + for i.HasNext() { + buffer.WriteString(",") + counter = counter + 1 + // to avoid exhausting the memory + if counter > 0x40000 { + buffer.WriteString("...") + break + } + buffer.WriteString(strconv.FormatUint(uint64(i.Next()), 10)) + } + buffer.WriteString("}") + return buffer.String() +} + +// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order; +// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). +func (rb *Bitmap) Iterator() IntPeekable64 { + return newIntIterator(rb) +} + +// ReverseIterator creates a new IntIterable to iterate over the integers contained in the bitmap, in sorted order; +// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). +func (rb *Bitmap) ReverseIterator() IntIterable64 { + return newIntReverseIterator(rb) +} + +// ManyIterator creates a new ManyIntIterable to iterate over the integers contained in the bitmap, in sorted order; +// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). 
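+//
+// For example (illustrative), draining a bitmap through a fixed-size buffer:
+//
+//	it := rb.ManyIterator()
+//	buf := make([]uint64, 1024)
+//	for n := it.NextMany(buf); n > 0; n = it.NextMany(buf) {
+//		// consume buf[:n]
+//	}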
+func (rb *Bitmap) ManyIterator() ManyIntIterable64 { + return newManyIntIterator(rb) +} + +// Clone creates a copy of the Bitmap +func (rb *Bitmap) Clone() *Bitmap { + ptr := new(Bitmap) + ptr.highlowcontainer = *rb.highlowcontainer.clone() + return ptr +} + +// Minimum get the smallest value stored in this roaring bitmap, assumes that it is not empty +func (rb *Bitmap) Minimum() uint64 { + return uint64(rb.highlowcontainer.containers[0].Minimum()) | (uint64(rb.highlowcontainer.keys[0]) << 32) +} + +// Maximum get the largest value stored in this roaring bitmap, assumes that it is not empty +func (rb *Bitmap) Maximum() uint64 { + lastindex := len(rb.highlowcontainer.containers) - 1 + return uint64(rb.highlowcontainer.containers[lastindex].Maximum()) | (uint64(rb.highlowcontainer.keys[lastindex]) << 32) +} + +// Contains returns true if the integer is contained in the bitmap +func (rb *Bitmap) Contains(x uint64) bool { + hb := highbits(x) + c := rb.highlowcontainer.getContainer(hb) + return c != nil && c.Contains(lowbits(x)) +} + +// ContainsInt returns true if the integer is contained in the bitmap (this is a convenience method, the parameter is casted to uint64 and Contains is called) +func (rb *Bitmap) ContainsInt(x int) bool { + return rb.Contains(uint64(x)) +} + +// Equals returns true if the two bitmaps contain the same integers +func (rb *Bitmap) Equals(o interface{}) bool { + srb, ok := o.(*Bitmap) + if ok { + return srb.highlowcontainer.equals(rb.highlowcontainer) + } + return false +} + +// Add the integer x to the bitmap +func (rb *Bitmap) Add(x uint64) { + hb := highbits(x) + ra := &rb.highlowcontainer + i := ra.getIndex(hb) + if i >= 0 { + ra.getWritableContainerAtIndex(i).Add(lowbits(x)) + } else { + newBitmap := roaring.NewBitmap() + newBitmap.Add(lowbits(x)) + rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newBitmap) + } +} + +// CheckedAdd adds the integer x to the bitmap and return true if it was added (false if the integer was already present) +func (rb *Bitmap) CheckedAdd(x uint64) bool { + hb := highbits(x) + i := rb.highlowcontainer.getIndex(hb) + if i >= 0 { + c := rb.highlowcontainer.getWritableContainerAtIndex(i) + return c.CheckedAdd(lowbits(x)) + } + newBitmap := roaring.NewBitmap() + newBitmap.Add(lowbits(x)) + rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newBitmap) + return true +} + +// AddInt adds the integer x to the bitmap (convenience method: the parameter is casted to uint32 and we call Add) +func (rb *Bitmap) AddInt(x int) { + rb.Add(uint64(x)) +} + +// Remove the integer x from the bitmap +func (rb *Bitmap) Remove(x uint64) { + hb := highbits(x) + i := rb.highlowcontainer.getIndex(hb) + if i >= 0 { + c := rb.highlowcontainer.getWritableContainerAtIndex(i) + c.Remove(lowbits(x)) + if c.IsEmpty() { + rb.highlowcontainer.removeAtIndex(i) + } + } +} + +// CheckedRemove removes the integer x from the bitmap and return true if the integer was effectively remove (and false if the integer was not present) +func (rb *Bitmap) CheckedRemove(x uint64) bool { + hb := highbits(x) + i := rb.highlowcontainer.getIndex(hb) + if i >= 0 { + c := rb.highlowcontainer.getWritableContainerAtIndex(i) + removed := c.CheckedRemove(lowbits(x)) + if removed && c.IsEmpty() { + rb.highlowcontainer.removeAtIndex(i) + } + return removed + } + return false +} + +// IsEmpty returns true if the Bitmap is empty (it is faster than doing (GetCardinality() == 0)) +func (rb *Bitmap) IsEmpty() bool { + return rb.highlowcontainer.size() == 0 +} + +// GetCardinality returns the number of 
integers contained in the bitmap +func (rb *Bitmap) GetCardinality() uint64 { + size := uint64(0) + for _, c := range rb.highlowcontainer.containers { + size += c.GetCardinality() + } + return size +} + +// Rank returns the number of integers that are smaller or equal to x (Rank(infinity) would be GetCardinality()) +func (rb *Bitmap) Rank(x uint64) uint64 { + size := uint64(0) + for i := 0; i < rb.highlowcontainer.size(); i++ { + key := rb.highlowcontainer.getKeyAtIndex(i) + if key > highbits(x) { + return size + } + if key < highbits(x) { + size += rb.highlowcontainer.getContainerAtIndex(i).GetCardinality() + } else { + return size + rb.highlowcontainer.getContainerAtIndex(i).Rank(lowbits(x)) + } + } + return size +} + +// Select returns the xth integer in the bitmap +func (rb *Bitmap) Select(x uint64) (uint64, error) { + cardinality := rb.GetCardinality() + if cardinality <= x { + return 0, fmt.Errorf("can't find %dth integer in a bitmap with only %d items", x, cardinality) + } + + remaining := x + for i := 0; i < rb.highlowcontainer.size(); i++ { + c := rb.highlowcontainer.getContainerAtIndex(i) + if bitmapSize := c.GetCardinality(); remaining >= bitmapSize { + remaining -= bitmapSize + } else { + key := rb.highlowcontainer.getKeyAtIndex(i) + selected, err := c.Select(uint32(remaining)) + if err != nil { + return 0, err + } + return uint64(key)<<32 + uint64(selected), nil + } + } + return 0, fmt.Errorf("can't find %dth integer in a bitmap with only %d items", x, cardinality) +} + +// And computes the intersection between two bitmaps and stores the result in the current bitmap +func (rb *Bitmap) And(x2 *Bitmap) { + pos1 := 0 + pos2 := 0 + intersectionsize := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + +main: + for { + if pos1 < length1 && pos2 < length2 { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 == s2 { + c1 := rb.highlowcontainer.getWritableContainerAtIndex(pos1) + c2 := x2.highlowcontainer.getContainerAtIndex(pos2) + c1.And(c2) + if !c1.IsEmpty() { + rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, false) + intersectionsize++ + } + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else if s1 < s2 { + pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + rb.highlowcontainer.resize(intersectionsize) +} + +// OrCardinality returns the cardinality of the union between two bitmaps, bitmaps are not modified +func (rb *Bitmap) OrCardinality(x2 *Bitmap) uint64 { + pos1 := 0 + pos2 := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + answer := uint64(0) +main: + for { + if (pos1 < length1) && (pos2 < length2) { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + + for { + if s1 < s2 { + answer += rb.highlowcontainer.getContainerAtIndex(pos1).GetCardinality() + pos1++ + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else if s1 > s2 { + answer += x2.highlowcontainer.getContainerAtIndex(pos2).GetCardinality() + pos2++ + if pos2 == length2 { + 
break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else { + // TODO: could be faster if we did not have to materialize the container + answer += roaring.Or(rb.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)).GetCardinality() + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + for ; pos1 < length1; pos1++ { + answer += rb.highlowcontainer.getContainerAtIndex(pos1).GetCardinality() + } + for ; pos2 < length2; pos2++ { + answer += x2.highlowcontainer.getContainerAtIndex(pos2).GetCardinality() + } + return answer +} + +// AndCardinality returns the cardinality of the intersection between two bitmaps, bitmaps are not modified +func (rb *Bitmap) AndCardinality(x2 *Bitmap) uint64 { + pos1 := 0 + pos2 := 0 + answer := uint64(0) + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + +main: + for { + if pos1 < length1 && pos2 < length2 { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 == s2 { + c1 := rb.highlowcontainer.getContainerAtIndex(pos1) + c2 := x2.highlowcontainer.getContainerAtIndex(pos2) + answer += c1.AndCardinality(c2) + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else if s1 < s2 { + pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + return answer +} + +// Intersects checks whether two bitmap intersects, bitmaps are not modified +func (rb *Bitmap) Intersects(x2 *Bitmap) bool { + pos1 := 0 + pos2 := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + +main: + for { + if pos1 < length1 && pos2 < length2 { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 == s2 { + c1 := rb.highlowcontainer.getContainerAtIndex(pos1) + c2 := x2.highlowcontainer.getContainerAtIndex(pos2) + if c1.Intersects(c2) { + return true + } + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else if s1 < s2 { + pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + return false +} + +// Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap +func (rb *Bitmap) Xor(x2 *Bitmap) { + pos1 := 0 + pos2 := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + for { + if (pos1 < length1) && (pos2 < length2) { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + if s1 < s2 { + pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) + if pos1 == length1 { + break + } + } else if s1 > s2 { + c := 
x2.highlowcontainer.getWritableContainerAtIndex(pos2) + rb.highlowcontainer.insertNewKeyValueAt(pos1, x2.highlowcontainer.getKeyAtIndex(pos2), c) + length1++ + pos1++ + pos2++ + } else { + // TODO: could be computed in-place for reduced memory usage + c := roaring.Xor(rb.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)) + if !c.IsEmpty() { + rb.highlowcontainer.setContainerAtIndex(pos1, c) + pos1++ + } else { + rb.highlowcontainer.removeAtIndex(pos1) + length1-- + } + pos2++ + } + } else { + break + } + } + if pos1 == length1 { + rb.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) + } +} + +// Or computes the union between two bitmaps and stores the result in the current bitmap +func (rb *Bitmap) Or(x2 *Bitmap) { + pos1 := 0 + pos2 := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() +main: + for (pos1 < length1) && (pos2 < length2) { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + + for { + if s1 < s2 { + pos1++ + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else if s1 > s2 { + rb.highlowcontainer.insertNewKeyValueAt(pos1, s2, x2.highlowcontainer.getContainerAtIndex(pos2).Clone()) + pos1++ + length1++ + pos2++ + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else { + rb.highlowcontainer.getContainerAtIndex(pos1).Or(x2.highlowcontainer.getContainerAtIndex(pos2)) + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } + if pos1 == length1 { + rb.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) + } +} + +// AndNot computes the difference between two bitmaps and stores the result in the current bitmap +func (rb *Bitmap) AndNot(x2 *Bitmap) { + pos1 := 0 + pos2 := 0 + intersectionsize := 0 + length1 := rb.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + +main: + for { + if pos1 < length1 && pos2 < length2 { + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 == s2 { + c1 := rb.highlowcontainer.getWritableContainerAtIndex(pos1) + c2 := x2.highlowcontainer.getContainerAtIndex(pos2) + c1.AndNot(c2) + if !c1.IsEmpty() { + rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, false) + intersectionsize++ + } + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else if s1 < s2 { + c1 := rb.highlowcontainer.getContainerAtIndex(pos1) + mustCopyOnWrite := rb.highlowcontainer.needsCopyOnWrite(pos1) + rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, mustCopyOnWrite) + intersectionsize++ + pos1++ + if pos1 == length1 { + break main + } + s1 = rb.highlowcontainer.getKeyAtIndex(pos1) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + // TODO: implement as a copy + for pos1 < length1 { + c1 := rb.highlowcontainer.getContainerAtIndex(pos1) + s1 := rb.highlowcontainer.getKeyAtIndex(pos1) + mustCopyOnWrite := rb.highlowcontainer.needsCopyOnWrite(pos1) + rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, mustCopyOnWrite) + 
intersectionsize++ + pos1++ + } + rb.highlowcontainer.resize(intersectionsize) +} + +// Or computes the union between two bitmaps and returns the result +func Or(x1, x2 *Bitmap) *Bitmap { + answer := NewBitmap() + pos1 := 0 + pos2 := 0 + length1 := x1.highlowcontainer.size() + length2 := x2.highlowcontainer.size() +main: + for (pos1 < length1) && (pos2 < length2) { + s1 := x1.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + + for { + if s1 < s2 { + answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) + pos1++ + if pos1 == length1 { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + } else if s1 > s2 { + answer.highlowcontainer.appendCopy(x2.highlowcontainer, pos2) + pos2++ + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else { + answer.highlowcontainer.appendContainer(s1, + roaring.Or(x1.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)), false) + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } + if pos1 == length1 { + answer.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) + } else if pos2 == length2 { + answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) + } + return answer +} + +// And computes the intersection between two bitmaps and returns the result +func And(x1, x2 *Bitmap) *Bitmap { + answer := NewBitmap() + pos1 := 0 + pos2 := 0 + length1 := x1.highlowcontainer.size() + length2 := x2.highlowcontainer.size() +main: + for pos1 < length1 && pos2 < length2 { + s1 := x1.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 == s2 { + c := roaring.And(x1.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)) + if !c.IsEmpty() { + answer.highlowcontainer.appendContainer(s1, c, false) + } + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else if s1 < s2 { + pos1 = x1.highlowcontainer.advanceUntil(s2, pos1) + if pos1 == length1 { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } + return answer +} + +// Xor computes the symmetric difference between two bitmaps and returns the result +func Xor(x1, x2 *Bitmap) *Bitmap { + answer := NewBitmap() + pos1 := 0 + pos2 := 0 + length1 := x1.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + for { + if (pos1 < length1) && (pos2 < length2) { + s1 := x1.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + if s1 < s2 { + answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) + pos1++ + } else if s1 > s2 { + answer.highlowcontainer.appendCopy(x2.highlowcontainer, pos2) + pos2++ + } else { + c := roaring.Xor(x1.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)) + if !c.IsEmpty() { + answer.highlowcontainer.appendContainer(s1, c, false) + } + pos1++ + pos2++ + } + } else { + break + } + } + if pos1 == length1 { + answer.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) + } else if pos2 == length2 { + 
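+ // x2 is exhausted: the rest of x1 is copied through unchanged,
+ // since XOR with the empty set is the identity.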
answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) + } + return answer +} + +// AndNot computes the difference between two bitmaps and returns the result +func AndNot(x1, x2 *Bitmap) *Bitmap { + answer := NewBitmap() + pos1 := 0 + pos2 := 0 + length1 := x1.highlowcontainer.size() + length2 := x2.highlowcontainer.size() + +main: + for { + if pos1 < length1 && pos2 < length2 { + s1 := x1.highlowcontainer.getKeyAtIndex(pos1) + s2 := x2.highlowcontainer.getKeyAtIndex(pos2) + for { + if s1 < s2 { + answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) + pos1++ + if pos1 == length1 { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + } else if s1 == s2 { + c := roaring.AndNot(x1.highlowcontainer.getContainerAtIndex(pos1), x2.highlowcontainer.getContainerAtIndex(pos2)) + if !c.IsEmpty() { + answer.highlowcontainer.appendContainer(s1, c, false) + } + pos1++ + pos2++ + if (pos1 == length1) || (pos2 == length2) { + break main + } + s1 = x1.highlowcontainer.getKeyAtIndex(pos1) + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } else { // s1 > s2 + pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) + if pos2 == length2 { + break main + } + s2 = x2.highlowcontainer.getKeyAtIndex(pos2) + } + } + } else { + break + } + } + if pos2 == length2 { + answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) + } + return answer +} + +// AddMany adds all of the values in dat +func (rb *Bitmap) AddMany(dat []uint64) { + if len(dat) == 0 { + return + } + + start, batchHighBits := 0, highbits(dat[0]) + for end := 1; end < len(dat); end++ { + hi := highbits(dat[end]) + if hi != batchHighBits { + batch := make([]uint32, end-start) + for i := 0; i < end-start; i++ { + batch[i] = lowbits(dat[start+i]) + } + rb.getOrCreateContainer(batchHighBits).AddMany(batch) + + batchHighBits = hi + start = end + } + } + + batch := make([]uint32, len(dat)-start) + for i := 0; i < len(dat)-start; i++ { + batch[i] = lowbits(dat[start+i]) + } + rb.getOrCreateContainer(batchHighBits).AddMany(batch) +} + +// getOrCreateContainer gets the roaring.Bitmap for key hb, +// or creates an *empty* roaring.Bitmap, inserts it into rb.highlowcontainer, and returns the new roaring.Bitmap. +func (rb *Bitmap) getOrCreateContainer(hb uint32) *roaring.Bitmap { + i := rb.highlowcontainer.getIndex(hb) + if i >= 0 { + return rb.highlowcontainer.getWritableContainerAtIndex(i) + } + c := roaring.NewBitmap() + rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, c) + return c +} + +// BitmapOf generates a new bitmap filled with the specified integers +func BitmapOf(dat ...uint64) *Bitmap { + ans := NewBitmap() + ans.AddMany(dat) + return ans +} + +// Flip negates the bits in the given range (i.e., [rangeStart,rangeEnd)): any integer present in this range and in the bitmap is removed, +// and any integer present in the range and not in the bitmap is added. 
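+//
+// A hedged usage sketch (BitmapOf is defined just above):
+//
+//    rb := BitmapOf(1, 2, 3)
+//    rb.Flip(2, 5) // flips [2,5): 2 and 3 are removed, 4 is added, leaving {1, 4}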
+func (rb *Bitmap) Flip(rangeStart, rangeEnd uint64) { + if rangeStart >= rangeEnd { + return + } + + hbStart := uint64(highbits(rangeStart)) + lbStart := uint64(lowbits(rangeStart)) + hbLast := uint64(highbits(rangeEnd)) + lbLast := uint64(lowbits(rangeEnd)) + + var max uint64 = maxLowBit + 1 + for hb := hbStart; hb <= hbLast; hb++ { + var containerStart uint64 + if hb == hbStart { + containerStart = lbStart + } + containerLast := max + if hb == hbLast { + containerLast = lbLast + } + + i := rb.highlowcontainer.getIndex(uint32(hb)) + + if i >= 0 { + c := rb.highlowcontainer.getWritableContainerAtIndex(i) + c.Flip(containerStart, containerLast) + if c.IsEmpty() { + rb.highlowcontainer.removeAtIndex(i) + } + } else { // flipping within an absent container yields a run of ones, which should never be empty here + c := roaring.NewBitmap() + c.Flip(containerStart, containerLast) + if !c.IsEmpty() { + rb.highlowcontainer.insertNewKeyValueAt(-i-1, uint32(hb), c) + } + } + } +} + +// FlipInt calls Flip after casting the parameters (convenience method) +func (rb *Bitmap) FlipInt(rangeStart, rangeEnd int) { + rb.Flip(uint64(rangeStart), uint64(rangeEnd)) +} + +// AddRange adds the integers in [rangeStart, rangeEnd) to the bitmap. +func (rb *Bitmap) AddRange(rangeStart, rangeEnd uint64) { + if rangeStart >= rangeEnd { + return + } + hbStart := uint64(highbits(rangeStart)) + lbStart := uint64(lowbits(rangeStart)) + hbLast := uint64(highbits(rangeEnd - 1)) + lbLast := uint64(lowbits(rangeEnd - 1)) + + var max uint64 = maxLowBit + for hb := hbStart; hb <= hbLast; hb++ { + containerStart := uint64(0) + if hb == hbStart { + containerStart = lbStart + } + containerLast := max + if hb == hbLast { + containerLast = lbLast + } + + rb.getOrCreateContainer(uint32(hb)).AddRange(containerStart, containerLast+1) + } +} + +// RemoveRange removes the integers in [rangeStart, rangeEnd) from the bitmap. +func (rb *Bitmap) RemoveRange(rangeStart, rangeEnd uint64) { + if rangeStart >= rangeEnd { + return + } + hbStart := uint64(highbits(rangeStart)) + lbStart := uint64(lowbits(rangeStart)) + hbLast := uint64(highbits(rangeEnd - 1)) + lbLast := uint64(lowbits(rangeEnd - 1)) + + var max uint64 = maxLowBit + + if hbStart == hbLast { + i := rb.highlowcontainer.getIndex(uint32(hbStart)) + if i < 0 { + return + } + c := rb.highlowcontainer.getWritableContainerAtIndex(i) + c.RemoveRange(lbStart, lbLast+1) + if c.IsEmpty() { + rb.highlowcontainer.removeAtIndex(i) + } + return + } + ifirst := rb.highlowcontainer.getIndex(uint32(hbStart)) + ilast := rb.highlowcontainer.getIndex(uint32(hbLast)) + + if ifirst >= 0 { + if lbStart != 0 { + c := rb.highlowcontainer.getWritableContainerAtIndex(ifirst) + c.RemoveRange(lbStart, max+1) + if !c.IsEmpty() { + ifirst++ + } + } + } else { + ifirst = -ifirst - 1 + } + if ilast >= 0 { + if lbLast != max { + c := rb.highlowcontainer.getWritableContainerAtIndex(ilast) + c.RemoveRange(0, lbLast+1) + if c.IsEmpty() { + ilast++ + } + } else { + ilast++ + } + } else { + ilast = -ilast - 1 + } + rb.highlowcontainer.removeIndexRange(ifirst, ilast) +} + +// Flip negates the bits in the given range (i.e., [rangeStart,rangeEnd)): any integer present in this range and in the bitmap is removed, +// and any integer present in the range and not in the bitmap is added. A new bitmap is returned, leaving +// the current bitmap unchanged. 
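+//
+// A hedged sketch of this non-mutating form:
+//
+//    out := Flip(BitmapOf(1, 2, 3), 2, 5) // out is {1, 4}; the input bitmap is left unchanged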
+func Flip(rb *Bitmap, rangeStart, rangeEnd uint64) *Bitmap { + if rangeStart >= rangeEnd { + return rb.Clone() + } + + answer := NewBitmap() + hbStart := uint64(highbits(rangeStart)) + lbStart := uint64(lowbits(rangeStart)) + hbLast := uint64(highbits(rangeEnd)) + lbLast := uint64(lowbits(rangeEnd)) + + // copy the containers before the active area + answer.highlowcontainer.appendCopiesUntil(rb.highlowcontainer, uint32(hbStart)) + + var max uint64 = maxLowBit + 1 + for hb := hbStart; hb <= hbLast; hb++ { + var containerStart uint64 + if hb == hbStart { + containerStart = lbStart + } + containerLast := max + if hb == hbLast { + containerLast = lbLast + } + + i := rb.highlowcontainer.getIndex(uint32(hb)) + j := answer.highlowcontainer.getIndex(uint32(hb)) + + if i >= 0 { + c := roaring.Flip(rb.highlowcontainer.getContainerAtIndex(i), containerStart, containerLast) + if !c.IsEmpty() { + answer.highlowcontainer.insertNewKeyValueAt(-j-1, uint32(hb), c) + } + + } else { // flipping within an absent container yields a run of ones, which should never be empty here + c := roaring.NewBitmap() + c.Flip(containerStart, containerLast) + if !c.IsEmpty() { + answer.highlowcontainer.insertNewKeyValueAt(-i-1, uint32(hb), c) + } + } + } + // copy the containers after the active area. + answer.highlowcontainer.appendCopiesAfter(rb.highlowcontainer, uint32(hbLast)) + + return answer +} + +// SetCopyOnWrite sets this bitmap to use copy-on-write so that copies are fast and memory conscious +// if the parameter is true; otherwise we leave the default, where hard copies are made +// (copy-on-write requires extra care in a threaded context). +// Calling SetCopyOnWrite(true) on a bitmap created with FromBuffer is unsafe. +func (rb *Bitmap) SetCopyOnWrite(val bool) { + rb.highlowcontainer.copyOnWrite = val +} + +// GetCopyOnWrite gets this bitmap's copy-on-write property +func (rb *Bitmap) GetCopyOnWrite() (val bool) { + return rb.highlowcontainer.copyOnWrite +} + +// CloneCopyOnWriteContainers clones all containers which have +// needCopyOnWrite set to true. +// This can be used to make sure it is safe to munmap a []byte +// that the roaring array may still have a reference to, after +// calling FromBuffer. +// More generally this function is useful if you call FromBuffer +// to construct a bitmap with a backing array buf +// and then later discard the buf array. Note that you should call +// CloneCopyOnWriteContainers on all bitmaps that were derived +// from the 'FromBuffer' bitmap since they may have dependencies +// on the buf array as well. +func (rb *Bitmap) CloneCopyOnWriteContainers() { + rb.highlowcontainer.cloneCopyOnWriteContainers() +} + +// FlipInt calls Flip after casting the parameters (convenience method) +func FlipInt(bm *Bitmap, rangeStart, rangeEnd int) *Bitmap { + return Flip(bm, uint64(rangeStart), uint64(rangeEnd)) +} + +// Stats returns details on container type usage in a Statistics struct. 
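+//
+// A hedged sketch (the fields shown are among those aggregated below):
+//
+//    st := rb.Stats()
+//    fmt.Printf("cardinality=%d containers=%d\n", st.Cardinality, st.Containers)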
+func (rb *Bitmap) Stats() roaring.Statistics { + stats := roaring.Statistics{} + for _, c := range rb.highlowcontainer.containers { + bitmapStats := c.Stats() + stats.Cardinality += bitmapStats.Cardinality + stats.Containers += bitmapStats.Containers + stats.ArrayContainers += bitmapStats.ArrayContainers + stats.ArrayContainerBytes += bitmapStats.ArrayContainerBytes + stats.ArrayContainerValues += bitmapStats.ArrayContainerValues + stats.BitmapContainers += bitmapStats.BitmapContainers + stats.BitmapContainerBytes += bitmapStats.BitmapContainerBytes + stats.BitmapContainerValues += bitmapStats.BitmapContainerValues + stats.RunContainers += bitmapStats.RunContainers + stats.RunContainerBytes += bitmapStats.RunContainerBytes + stats.RunContainerValues += bitmapStats.RunContainerValues + } + return stats +} + +// GetSerializedSizeInBytes computes the serialized size in bytes +// of the Bitmap. It should correspond to the number +// of bytes written when invoking WriteTo. You can expect +// that this function is much cheaper computationally than WriteTo. +func (rb *Bitmap) GetSerializedSizeInBytes() uint64 { + return rb.highlowcontainer.serializedSizeInBytes() +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go new file mode 100644 index 0000000000..9e3b34bc8d --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go @@ -0,0 +1,398 @@ +package roaring64 + +import "github.com/RoaringBitmap/roaring" + +type roaringArray64 struct { + keys []uint32 + containers []*roaring.Bitmap + needCopyOnWrite []bool + copyOnWrite bool +} + +// runOptimize compresses the element containers to minimize space consumed. +// Q: how does this interact with copyOnWrite and needCopyOnWrite? +// A: since we aren't changing the logical content, just the representation, +// we don't bother to check the needCopyOnWrite bits. We replace +// (possibly all) elements of ra.containers in-place with space +// optimized versions. 
+func (ra *roaringArray64) runOptimize() { + for i := range ra.containers { + ra.containers[i].RunOptimize() + } +} + +func (ra *roaringArray64) appendContainer(key uint32, value *roaring.Bitmap, mustCopyOnWrite bool) { + ra.keys = append(ra.keys, key) + ra.containers = append(ra.containers, value) + ra.needCopyOnWrite = append(ra.needCopyOnWrite, mustCopyOnWrite) +} + +func (ra *roaringArray64) appendWithoutCopy(sa roaringArray64, startingindex int) { + mustCopyOnWrite := sa.needCopyOnWrite[startingindex] + ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], mustCopyOnWrite) +} + +func (ra *roaringArray64) appendCopy(sa roaringArray64, startingindex int) { + // cow only if the two request it, or if we already have a lightweight copy + copyonwrite := (ra.copyOnWrite && sa.copyOnWrite) || sa.needsCopyOnWrite(startingindex) + if !copyonwrite { + // since there is no copy-on-write, we need to clone the container (this is important) + ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].Clone(), copyonwrite) + } else { + ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], copyonwrite) + if !sa.needsCopyOnWrite(startingindex) { + sa.setNeedsCopyOnWrite(startingindex) + } + } +} + +func (ra *roaringArray64) appendWithoutCopyMany(sa roaringArray64, startingindex, end int) { + for i := startingindex; i < end; i++ { + ra.appendWithoutCopy(sa, i) + } +} + +func (ra *roaringArray64) appendCopyMany(sa roaringArray64, startingindex, end int) { + for i := startingindex; i < end; i++ { + ra.appendCopy(sa, i) + } +} + +func (ra *roaringArray64) appendCopiesUntil(sa roaringArray64, stoppingKey uint32) { + // cow only if the two request it, or if we already have a lightweight copy + copyonwrite := ra.copyOnWrite && sa.copyOnWrite + + for i := 0; i < sa.size(); i++ { + if sa.keys[i] >= stoppingKey { + break + } + thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) + if thiscopyonewrite { + ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) + if !sa.needsCopyOnWrite(i) { + sa.setNeedsCopyOnWrite(i) + } + + } else { + // since there is no copy-on-write, we need to clone the container (this is important) + ra.appendContainer(sa.keys[i], sa.containers[i].Clone(), thiscopyonewrite) + } + } +} + +func (ra *roaringArray64) appendCopiesAfter(sa roaringArray64, beforeStart uint32) { + // cow only if the two request it, or if we already have a lightweight copy + copyonwrite := ra.copyOnWrite && sa.copyOnWrite + + startLocation := sa.getIndex(beforeStart) + if startLocation >= 0 { + startLocation++ + } else { + startLocation = -startLocation - 1 + } + + for i := startLocation; i < sa.size(); i++ { + thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) + if thiscopyonewrite { + ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) + if !sa.needsCopyOnWrite(i) { + sa.setNeedsCopyOnWrite(i) + } + } else { + // since there is no copy-on-write, we need to clone the container (this is important) + ra.appendContainer(sa.keys[i], sa.containers[i].Clone(), thiscopyonewrite) + } + } +} + +func (ra *roaringArray64) removeIndexRange(begin, end int) { + if end <= begin { + return + } + + r := end - begin + + copy(ra.keys[begin:], ra.keys[end:]) + copy(ra.containers[begin:], ra.containers[end:]) + copy(ra.needCopyOnWrite[begin:], ra.needCopyOnWrite[end:]) + + ra.resize(len(ra.keys) - r) +} + +func (ra *roaringArray64) resize(newsize int) { + for k := newsize; k < len(ra.containers); k++ { + ra.containers[k] = nil + } + + ra.keys = 
ra.keys[:newsize] + ra.containers = ra.containers[:newsize] + ra.needCopyOnWrite = ra.needCopyOnWrite[:newsize] +} + +func (ra *roaringArray64) clear() { + ra.resize(0) + ra.copyOnWrite = false +} + +func (ra *roaringArray64) clone() *roaringArray64 { + + sa := roaringArray64{} + sa.copyOnWrite = ra.copyOnWrite + + // this is where copyOnWrite is used. + if ra.copyOnWrite { + sa.keys = make([]uint32, len(ra.keys)) + copy(sa.keys, ra.keys) + sa.containers = make([]*roaring.Bitmap, len(ra.containers)) + copy(sa.containers, ra.containers) + sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) + + ra.markAllAsNeedingCopyOnWrite() + sa.markAllAsNeedingCopyOnWrite() + + // the underlying containers are shared between ra and sa; both sides were marked copy-on-write above + } else { + // make a full copy + + sa.keys = make([]uint32, len(ra.keys)) + copy(sa.keys, ra.keys) + + sa.containers = make([]*roaring.Bitmap, len(ra.containers)) + for i := range sa.containers { + sa.containers[i] = ra.containers[i].Clone() + } + + sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) + } + return &sa +} + +// clone all containers which have needCopyOnWrite set to true +// This can be used to make sure it is safe to munmap a []byte +// that the roaring array may still have a reference to. +func (ra *roaringArray64) cloneCopyOnWriteContainers() { + for i, needCopyOnWrite := range ra.needCopyOnWrite { + if needCopyOnWrite { + ra.containers[i] = ra.containers[i].Clone() + ra.needCopyOnWrite[i] = false + } + } +} + +// unused function: +// func (ra *roaringArray64) containsKey(x uint32) bool { +// return (ra.binarySearch(0, int64(len(ra.keys)), x) >= 0) +// } + +func (ra *roaringArray64) getContainer(x uint32) *roaring.Bitmap { + i := ra.binarySearch(0, int64(len(ra.keys)), x) + if i < 0 { + return nil + } + return ra.containers[i] +} + +func (ra *roaringArray64) getContainerAtIndex(i int) *roaring.Bitmap { + return ra.containers[i] +} + +func (ra *roaringArray64) getWritableContainerAtIndex(i int) *roaring.Bitmap { + if ra.needCopyOnWrite[i] { + ra.containers[i] = ra.containers[i].Clone() + ra.needCopyOnWrite[i] = false + } + return ra.containers[i] +} + +func (ra *roaringArray64) getIndex(x uint32) int { + // before the binary search, we optimize for frequent cases + size := len(ra.keys) + if (size == 0) || (ra.keys[size-1] == x) { + return size - 1 + } + return ra.binarySearch(0, int64(size), x) +} + +func (ra *roaringArray64) getKeyAtIndex(i int) uint32 { + return ra.keys[i] +} + +func (ra *roaringArray64) insertNewKeyValueAt(i int, key uint32, value *roaring.Bitmap) { + ra.keys = append(ra.keys, 0) + ra.containers = append(ra.containers, nil) + + copy(ra.keys[i+1:], ra.keys[i:]) + copy(ra.containers[i+1:], ra.containers[i:]) + + ra.keys[i] = key + ra.containers[i] = value + + ra.needCopyOnWrite = append(ra.needCopyOnWrite, false) + copy(ra.needCopyOnWrite[i+1:], ra.needCopyOnWrite[i:]) + ra.needCopyOnWrite[i] = false +} + +func (ra *roaringArray64) remove(key uint32) bool { + i := ra.binarySearch(0, int64(len(ra.keys)), key) + if i >= 0 { // key found + ra.removeAtIndex(i) + return true + } + return false +} + +func (ra *roaringArray64) removeAtIndex(i int) { + copy(ra.keys[i:], ra.keys[i+1:]) + copy(ra.containers[i:], ra.containers[i+1:]) + + copy(ra.needCopyOnWrite[i:], ra.needCopyOnWrite[i+1:]) + + ra.resize(len(ra.keys) - 1) +} + +func (ra *roaringArray64) setContainerAtIndex(i int, c *roaring.Bitmap) { + ra.containers[i] = c +} + +func (ra *roaringArray64) replaceKeyAndContainerAtIndex(i int, key uint32, c *roaring.Bitmap, mustCopyOnWrite bool) { + ra.keys[i] 
= key + ra.containers[i] = c + ra.needCopyOnWrite[i] = mustCopyOnWrite +} + +func (ra *roaringArray64) size() int { + return len(ra.keys) +} + +func (ra *roaringArray64) binarySearch(begin, end int64, ikey uint32) int { + low := begin + high := end - 1 + for low+16 <= high { + middleIndex := low + (high-low)/2 // avoid overflow + middleValue := ra.keys[middleIndex] + + if middleValue < ikey { + low = middleIndex + 1 + } else if middleValue > ikey { + high = middleIndex - 1 + } else { + return int(middleIndex) + } + } + for ; low <= high; low++ { + val := ra.keys[low] + if val >= ikey { + if val == ikey { + return int(low) + } + break + } + } + return -int(low + 1) +} + +func (ra *roaringArray64) equals(o interface{}) bool { + srb, ok := o.(roaringArray64) + if ok { + + if srb.size() != ra.size() { + return false + } + for i, k := range ra.keys { + if k != srb.keys[i] { + return false + } + } + + for i, c := range ra.containers { + if !c.Equals(srb.containers[i]) { + return false + } + } + return true + } + return false +} + +func (ra *roaringArray64) hasRunCompression() bool { + for _, c := range ra.containers { + if c.HasRunCompression() { + return true + } + } + return false +} + +func (ra *roaringArray64) advanceUntil(min uint32, pos int) int { + lower := pos + 1 + + if lower >= len(ra.keys) || ra.keys[lower] >= min { + return lower + } + + spansize := 1 + + for lower+spansize < len(ra.keys) && ra.keys[lower+spansize] < min { + spansize *= 2 + } + var upper int + if lower+spansize < len(ra.keys) { + upper = lower + spansize + } else { + upper = len(ra.keys) - 1 + } + + if ra.keys[upper] == min { + return upper + } + + if ra.keys[upper] < min { + // means the array has no item >= min, i.e. pos = array.length + return len(ra.keys) + } + + // we know that the next-smallest span was too small + lower += (spansize >> 1) + + mid := 0 + for lower+1 != upper { + mid = (lower + upper) >> 1 + if ra.keys[mid] == min { + return mid + } else if ra.keys[mid] < min { + lower = mid + } else { + upper = mid + } + } + return upper +} + +func (ra *roaringArray64) markAllAsNeedingCopyOnWrite() { + for i := range ra.needCopyOnWrite { + ra.needCopyOnWrite[i] = true + } +} + +func (ra *roaringArray64) needsCopyOnWrite(i int) bool { + return ra.needCopyOnWrite[i] +} + +func (ra *roaringArray64) setNeedsCopyOnWrite(i int) { + ra.needCopyOnWrite[i] = true +} + +// should be dirt cheap +func (ra *roaringArray64) serializedSizeInBytes() uint64 { + answer := uint64(8) + for _, c := range ra.containers { + answer += 4 + answer += c.GetSerializedSizeInBytes() + } + return answer +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/util.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/util.go new file mode 100644 index 0000000000..3743cd7db9 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/util.go @@ -0,0 +1,49 @@ +package roaring64 + +import "github.com/RoaringBitmap/roaring" + +func highbits(x uint64) uint32 { + return uint32(x >> 32) +} + +func lowbits(x uint64) uint32 { + return uint32(x & maxLowBit) +} + +const maxLowBit = roaring.MaxUint32 +const maxUint32 = roaring.MaxUint32 + +func minOfInt64(a, b int64) int64 { + if a < b { + return a + } + return b +} + +func minOfInt(a, b int) int { + if a < b { + return a + } + return b +} + +func maxOfInt(a, b int) int { + if a > b { + return a + } + return b +} + +func maxOfUint32(a, b uint32) uint32 { + if a > b { + return a + } + return b +} + +func minOfUint32(a, b uint32) uint32 { + if a < b { + return a + } + 
return b +} diff --git a/vendor/github.com/blevesearch/bleve/v2/README.md b/vendor/github.com/blevesearch/bleve/v2/README.md index a89be4dd9a..dbe5b78989 100644 --- a/vendor/github.com/blevesearch/bleve/v2/README.md +++ b/vendor/github.com/blevesearch/bleve/v2/README.md @@ -22,7 +22,8 @@ A modern text indexing library in go * Conjunction, Disjunction, Boolean (must/should/must_not) * Term Range, Numeric Range, Date Range * [Geo Spatial](https://github.com/blevesearch/bleve/blob/master/geo/README.md) - * Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/) for human entry + * Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/) + * [Vector Search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md) * [tf-idf](https://en.wikipedia.org/wiki/Tf-idf) Scoring * Query time boosting * Search result match highlighting with document fragments @@ -101,7 +102,7 @@ Use "bleve [command] --help" for more information about a command. Bleve includes general-purpose analyzers (customizable) as well as pre-built text analyzers for the following languages: -Arabic (ar), Bulgarian (bg), Catalan (ca), Chinese-Japanese-Korean (cjk), Kurdish (ckb), Danish (da), German (de), Greek (el), English (en), Spanish - Castilian (es), Basque (eu), Persian (fa), Finnish (fi), French (fr), Gaelic (ga), Spanish - Galician (gl), Hindi (hi), Croatian (hr), Hungarian (hu), Armenian (hy), Indonesian (id, in), Italian (it), Dutch (nl), Norwegian (no), Portuguese (pt), Romanian (ro), Russian (ru), Swedish (sv), Turkish (tr) +Arabic (ar), Bulgarian (bg), Catalan (ca), Chinese-Japanese-Korean (cjk), Kurdish (ckb), Danish (da), German (de), Greek (el), English (en), Spanish - Castilian (es), Basque (eu), Persian (fa), Finnish (fi), French (fr), Gaelic (ga), Spanish - Galician (gl), Hindi (hi), Croatian (hr), Hungarian (hu), Armenian (hy), Indonesian (id, in), Italian (it), Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Swedish (sv), Turkish (tr) ## Text Analysis Wizard diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go new file mode 100644 index 0000000000..b019361cbf --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go @@ -0,0 +1,145 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package document + +import ( + "fmt" + "reflect" + + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeVectorField int + +func init() { + var f VectorField + reflectStaticSizeVectorField = int(reflect.TypeOf(f).Size()) +} + +const DefaultVectorIndexingOptions = index.IndexField + +type VectorField struct { + name string + dims int // Dimensionality of the vector + similarity string // Similarity metric to use for scoring + options index.FieldIndexingOptions + value []float32 + numPlainTextBytes uint64 + vectorIndexOptimizedFor string // Optimization applied to this index. +} + +func (n *VectorField) Size() int { + return reflectStaticSizeVectorField + size.SizeOfPtr + + len(n.name) + + int(numBytesFloat32s(n.value)) +} + +func (n *VectorField) Name() string { + return n.name +} + +func (n *VectorField) ArrayPositions() []uint64 { + return nil +} + +func (n *VectorField) Options() index.FieldIndexingOptions { + return n.options +} + +func (n *VectorField) NumPlainTextBytes() uint64 { + return n.numPlainTextBytes +} + +func (n *VectorField) AnalyzedLength() int { + // vectors aren't analyzed + return 0 +} + +func (n *VectorField) EncodedFieldType() byte { + return 'v' +} + +func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies { + // vectors aren't analyzed + return nil +} + +func (n *VectorField) Analyze() { + // vectors aren't analyzed +} + +func (n *VectorField) Value() []byte { + return nil +} + +func (n *VectorField) GoString() string { + return fmt.Sprintf("&document.VectorField{Name:%s, Options: %s, "+ + "Value: %+v}", n.name, n.options, n.value) +} + +// For the sake of not polluting the API, we are keeping arrayPositions as a +// parameter, but it is not used. +func NewVectorField(name string, arrayPositions []uint64, + vector []float32, dims int, similarity, vectorIndexOptimizedFor string) *VectorField { + return NewVectorFieldWithIndexingOptions(name, arrayPositions, + vector, dims, similarity, vectorIndexOptimizedFor, + DefaultVectorIndexingOptions) +} + +// For the sake of not polluting the API, we are keeping arrayPositions as a +// parameter, but it is not used. +func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, + vector []float32, dims int, similarity, vectorIndexOptimizedFor string, + options index.FieldIndexingOptions) *VectorField { + options = options | DefaultVectorIndexingOptions + + return &VectorField{ + name: name, + dims: dims, + similarity: similarity, + options: options, + value: vector, + numPlainTextBytes: numBytesFloat32s(vector), + vectorIndexOptimizedFor: vectorIndexOptimizedFor, + } +} + +func numBytesFloat32s(value []float32) uint64 { + return uint64(len(value) * size.SizeOfFloat32) +} + +// ----------------------------------------------------------------------------- +// Following methods help in implementing the bleve_index_api's VectorField +// interface. 
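+//
+// A hedged compile-time check, assuming the bleve_index_api package (imported
+// as index above) exports a VectorField interface matching the methods below:
+//
+//    var _ index.VectorField = (*VectorField)(nil)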
+ +func (n *VectorField) Vector() []float32 { + return n.value +} + +func (n *VectorField) Dims() int { + return n.dims +} + +func (n *VectorField) Similarity() string { + return n.similarity +} + +func (n *VectorField) IndexOptimizedFor() string { + return n.vectorIndexOptimizedFor +} diff --git a/vendor/github.com/blevesearch/bleve/v2/error.go b/vendor/github.com/blevesearch/bleve/v2/error.go index 7dd21194c8..2d2751cd42 100644 --- a/vendor/github.com/blevesearch/bleve/v2/error.go +++ b/vendor/github.com/blevesearch/bleve/v2/error.go @@ -26,6 +26,7 @@ const ( ErrorUnknownIndexType ErrorEmptyID ErrorIndexReadInconsistency + ErrorTwoPhaseSearchInconsistency ) // Error represents a more strongly typed bleve error for detecting @@ -37,14 +38,15 @@ func (e Error) Error() string { } var errorMessages = map[Error]string{ - ErrorIndexPathExists: "cannot create new index, path already exists", - ErrorIndexPathDoesNotExist: "cannot open index, path does not exist", - ErrorIndexMetaMissing: "cannot open index, metadata missing", - ErrorIndexMetaCorrupt: "cannot open index, metadata corrupt", - ErrorIndexClosed: "index is closed", - ErrorAliasMulti: "cannot perform single index operation on multiple index alias", - ErrorAliasEmpty: "cannot perform operation on empty alias", - ErrorUnknownIndexType: "unknown index type", - ErrorEmptyID: "document ID cannot be empty", - ErrorIndexReadInconsistency: "index read inconsistency detected", + ErrorIndexPathExists: "cannot create new index, path already exists", + ErrorIndexPathDoesNotExist: "cannot open index, path does not exist", + ErrorIndexMetaMissing: "cannot open index, metadata missing", + ErrorIndexMetaCorrupt: "cannot open index, metadata corrupt", + ErrorIndexClosed: "index is closed", + ErrorAliasMulti: "cannot perform single index operation on multiple index alias", + ErrorAliasEmpty: "cannot perform operation on empty alias", + ErrorUnknownIndexType: "unknown index type", + ErrorEmptyID: "document ID cannot be empty", + ErrorIndexReadInconsistency: "index read inconsistency detected", + ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change", } diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/parse.go b/vendor/github.com/blevesearch/bleve/v2/geo/parse.go index 01ec1dd81f..34f731a9ee 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/parse.go +++ b/vendor/github.com/blevesearch/bleve/v2/geo/parse.go @@ -18,6 +18,8 @@ import ( "reflect" "strconv" "strings" + + "github.com/blevesearch/bleve/v2/util" ) // ExtractGeoPoint takes an arbitrary interface{} and tries it's best to @@ -61,12 +63,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { first := thingVal.Index(0) if first.CanInterface() { firstVal := first.Interface() - lon, foundLon = extractNumericVal(firstVal) + lon, foundLon = util.ExtractNumericValFloat64(firstVal) } second := thingVal.Index(1) if second.CanInterface() { secondVal := second.Interface() - lat, foundLat = extractNumericVal(secondVal) + lat, foundLat = util.ExtractNumericValFloat64(secondVal) } } } @@ -105,12 +107,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { // is it a map if l, ok := thing.(map[string]interface{}); ok { if lval, ok := l["lon"]; ok { - lon, foundLon = extractNumericVal(lval) + lon, foundLon = util.ExtractNumericValFloat64(lval) } else if lval, ok := l["lng"]; ok { - lon, foundLon = extractNumericVal(lval) + lon, foundLon = util.ExtractNumericValFloat64(lval) } if lval, ok := l["lat"]; ok 
{ - lat, foundLat = extractNumericVal(lval) + lat, foundLat = util.ExtractNumericValFloat64(lval) } } @@ -121,19 +123,19 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { if strings.HasPrefix(strings.ToLower(fieldName), "lon") { if thingVal.Field(i).CanInterface() { fieldVal := thingVal.Field(i).Interface() - lon, foundLon = extractNumericVal(fieldVal) + lon, foundLon = util.ExtractNumericValFloat64(fieldVal) } } if strings.HasPrefix(strings.ToLower(fieldName), "lng") { if thingVal.Field(i).CanInterface() { fieldVal := thingVal.Field(i).Interface() - lon, foundLon = extractNumericVal(fieldVal) + lon, foundLon = util.ExtractNumericValFloat64(fieldVal) } } if strings.HasPrefix(strings.ToLower(fieldName), "lat") { if thingVal.Field(i).CanInterface() { fieldVal := thingVal.Field(i).Interface() - lat, foundLat = extractNumericVal(fieldVal) + lat, foundLat = util.ExtractNumericValFloat64(fieldVal) } } } @@ -157,25 +159,6 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { return lon, lat, foundLon && foundLat } -// extract numeric value (if possible) and returns a float64 -func extractNumericVal(v interface{}) (float64, bool) { - val := reflect.ValueOf(v) - if !val.IsValid() { - return 0, false - } - typ := val.Type() - switch typ.Kind() { - case reflect.Float32, reflect.Float64: - return val.Float(), true - case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: - return float64(val.Int()), true - case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: - return float64(val.Uint()), true - } - - return 0, false -} - // various support interfaces which can be used to find lat/lon type loner interface { Lon() float64 @@ -209,12 +192,12 @@ func extractCoordinates(thing interface{}) []float64 { first := thingVal.Index(0) if first.CanInterface() { firstVal := first.Interface() - lon, foundLon = extractNumericVal(firstVal) + lon, foundLon = util.ExtractNumericValFloat64(firstVal) } second := thingVal.Index(1) if second.CanInterface() { secondVal := second.Interface() - lat, foundLat = extractNumericVal(secondVal) + lat, foundLat = util.ExtractNumericValFloat64(secondVal) } if !foundLon || !foundLat { diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go index 123e71d63d..2cb1398ece 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go @@ -30,6 +30,7 @@ type segmentIntroduction struct { obsoletes map[uint64]*roaring.Bitmap ids []string internal map[string][]byte + stats *fieldStats applied chan error persisted chan error @@ -146,7 +147,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newss := &SegmentSnapshot{ id: root.segment[i].id, segment: root.segment[i].segment, + stats: root.segment[i].stats, cachedDocs: root.segment[i].cachedDocs, + cachedMeta: root.segment[i].cachedMeta, creator: root.segment[i].creator, } @@ -154,7 +157,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if root.segment[i].deleted == nil { newss.deleted = delta } else { - newss.deleted = roaring.Or(root.segment[i].deleted, delta) + if delta.IsEmpty() { + newss.deleted = root.segment[i].deleted + } else { + newss.deleted = roaring.Or(root.segment[i].deleted, delta) + } } if newss.deleted.IsEmpty() { newss.deleted = nil @@ -188,7 +195,9 @@ func (s *Scorch) introduceSegment(next 
*segmentIntroduction) error { newSegmentSnapshot := &SegmentSnapshot{ id: next.id, segment: next.data, // take ownership of next.data's ref-count + stats: next.stats, cachedDocs: &cachedDocs{cache: nil}, + cachedMeta: &cachedMeta{meta: nil}, creator: "introduceSegment", } newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) @@ -275,7 +284,9 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { id: segmentSnapshot.id, segment: replacement, deleted: segmentSnapshot.deleted, + stats: segmentSnapshot.stats, cachedDocs: segmentSnapshot.cachedDocs, + cachedMeta: segmentSnapshot.cachedMeta, creator: "introducePersist", mmaped: 1, } @@ -374,7 +385,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { id: root.segment[i].id, segment: root.segment[i].segment, deleted: root.segment[i].deleted, + stats: root.segment[i].stats, cachedDocs: root.segment[i].cachedDocs, + cachedMeta: root.segment[i].cachedMeta, creator: root.segment[i].creator, }) root.segment[i].segment.AddRef() @@ -394,7 +407,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } - // before the newMerge introduction, need to clean the newly // merged segment wrt the current root segments, hence // applying the obsolete segment contents to newly merged segment @@ -415,12 +427,19 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { if nextMerge.new != nil && nextMerge.new.Count() > newSegmentDeleted.GetCardinality() { + stats := newFieldStats() + if fsr, ok := nextMerge.new.(segment.FieldStatsReporter); ok { + fsr.UpdateFieldStats(stats) + } + // put new segment at end newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: nextMerge.id, segment: nextMerge.new, // take ownership for nextMerge.new's ref-count deleted: newSegmentDeleted, + stats: stats, cachedDocs: &cachedDocs{cache: nil}, + cachedMeta: &cachedMeta{meta: nil}, creator: "introduceMerge", mmaped: nextMerge.mmaped, }) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go index 92adc3fd4b..339ec5969e 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go @@ -290,7 +290,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) - oldMap := make(map[uint64]*SegmentSnapshot) + oldMap := make(map[uint64]*SegmentSnapshot, len(task.Segments)) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]segment.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) @@ -357,7 +357,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, totalBytesRead := seg.BytesRead() + prevBytesReadTotal seg.ResetBytesRead(totalBytesRead) - oldNewDocNums = make(map[uint64][]uint64) + oldNewDocNums = make(map[uint64][]uint64, len(newDocNums)) for i, segNewDocNums := range newDocNums { oldNewDocNums[task.Segments[i].Id()] = segNewDocNums } @@ -485,8 +485,8 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, sm := &segmentMerge{ id: newSegmentID, - old: make(map[uint64]*SegmentSnapshot), - oldNewDocNums: make(map[uint64][]uint64), + old: make(map[uint64]*SegmentSnapshot, len(sbsIndexes)), + oldNewDocNums: make(map[uint64][]uint64, len(sbsIndexes)), new: seg, notifyCh: make(chan *mergeTaskIntroStatus), } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go 
b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go index 3c7969fa9e..968a744ac0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go @@ -16,10 +16,11 @@ package scorch import ( "fmt" + "sync/atomic" + "github.com/RoaringBitmap/roaring" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" - "sync/atomic" ) var OptimizeConjunction = true diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go new file mode 100644 index 0000000000..330e214f3e --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go @@ -0,0 +1,187 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package scorch + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" + segment_api "github.com/blevesearch/scorch_segment_api/v2" +) + +type OptimizeVR struct { + ctx context.Context + snapshot *IndexSnapshot + totalCost uint64 + // maps field to vector readers + vrs map[string][]*IndexSnapshotVectorReader +} + +// This setting _MUST_ only be changed during init and not after. +var BleveMaxKNNConcurrency = 10 + +func (o *OptimizeVR) invokeSearcherEndCallback() { + if o.ctx != nil { + if cb := o.ctx.Value(search.SearcherEndCallbackKey); cb != nil { + if cbF, ok := cb.(search.SearcherEndCallbackFn); ok { + if o.totalCost > 0 { + // notify the callback that the searcher creation etc. is finished + // and report back the total cost for it to track and take actions + // appropriately. + _ = cbF(o.totalCost) + } + } + } + } +} + +func (o *OptimizeVR) Finish() error { + // for each field, get the vector index --> invoke the zap func. + // for each VR, populate postings list and iterators + // by passing the obtained vector index and getting similar vectors. + // defer close index - just once. 
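+ // A note on the pattern below: the buffered channel acts as a counting
+ // semaphore, capping concurrent per-segment vector searches at
+ // BleveMaxKNNConcurrency, and the WaitGroup joins all workers before
+ // the collected errors are inspected.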
+ var errorsM sync.Mutex + var errors []error + + defer o.invokeSearcherEndCallback() + + wg := sync.WaitGroup{} + semaphore := make(chan struct{}, BleveMaxKNNConcurrency) + // Launch goroutines to get vector index for each segment + for i, seg := range o.snapshot.segment { + if sv, ok := seg.segment.(segment_api.VectorSegment); ok { + wg.Add(1) + semaphore <- struct{}{} // Acquire a semaphore slot + go func(index int, segment segment_api.VectorSegment, origSeg *SegmentSnapshot) { + defer func() { + <-semaphore // Release the semaphore slot + wg.Done() + }() + for field, vrs := range o.vrs { + vecIndex, err := segment.InterpretVectorIndex(field) + if err != nil { + errorsM.Lock() + errors = append(errors, err) + errorsM.Unlock() + return + } + + // update the vector index size as a meta value in the segment snapshot + vectorIndexSize := vecIndex.Size() + origSeg.cachedMeta.updateMeta(field, vectorIndexSize) + for _, vr := range vrs { + // for each VR, populate postings list and iterators + // by passing the obtained vector index and getting similar vectors. + pl, err := vecIndex.Search(vr.vector, vr.k, origSeg.deleted) + if err != nil { + errorsM.Lock() + errors = append(errors, err) + errorsM.Unlock() + go vecIndex.Close() + return + } + + atomic.AddUint64(&o.snapshot.parent.stats.TotKNNSearches, uint64(1)) + + // postings and iterators are already alloc'ed when + // IndexSnapshotVectorReader is created + vr.postings[index] = pl + vr.iterators[index] = pl.Iterator(vr.iterators[index]) + } + go vecIndex.Close() + } + }(i, sv, seg) + } + } + wg.Wait() + close(semaphore) + if len(errors) > 0 { + return errors[0] + } + return nil +} + +func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context, + octx index.VectorOptimizableContext) (index.VectorOptimizableContext, error) { + + if s.snapshot.parent.segPlugin.Version() < VectorSearchSupportedSegmentVersion { + return nil, fmt.Errorf("vector search not supported for this index, "+ + "index's segment version %v, supported segment version for vector search %v", + s.snapshot.parent.segPlugin.Version(), VectorSearchSupportedSegmentVersion) + } + + if octx == nil { + octx = &OptimizeVR{snapshot: s.snapshot, + vrs: make(map[string][]*IndexSnapshotVectorReader), + } + } + + o, ok := octx.(*OptimizeVR) + if !ok { + return octx, nil + } + o.ctx = ctx + + if o.snapshot != s.snapshot { + o.invokeSearcherEndCallback() + return nil, fmt.Errorf("tried to optimize KNN across different snapshots") + } + + // For every searcher creation, consult the segment snapshot for the vector + // index size. Since that vector index is what the downstream Finish() logic + // will search anyway, check up front whether the searcher creation (and + // Finish() itself) should be allowed to occur at all. 
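+ // A hedged sketch of how a caller opts into this accounting via context;
+ // SearcherStartCallbackKey and SearcherStartCallbackFn are the bleve search
+ // package names referenced below, while trackQuota is a hypothetical
+ // caller-side function of type func(uint64) error:
+ //
+ //    ctx = context.WithValue(ctx, search.SearcherStartCallbackKey,
+ //        search.SearcherStartCallbackFn(trackQuota))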
+ var sumVectorIndexSize uint64 + for _, seg := range o.snapshot.segment { + vecIndexSize := seg.cachedMeta.fetchMeta(s.field) + if vecIndexSize != nil { + sumVectorIndexSize += vecIndexSize.(uint64) + } + } + + if o.ctx != nil { + if cb := o.ctx.Value(search.SearcherStartCallbackKey); cb != nil { + if cbF, ok := cb.(search.SearcherStartCallbackFn); ok { + err := cbF(sumVectorIndexSize) + if err != nil { + // It's important to invoke the end callback at this point: if the + // earlier searchers of this optimize struct were successful, their + // cost has already been incremented, and since the current searcher + // failing the check errors out the overall optimized searcher + // creation, that accumulated cost needs to be handled appropriately. + o.invokeSearcherEndCallback() + return nil, err + } + } + } + } + + // total cost is essentially the sum of the vector indexes' size across all the + // searchers - all of them end up reading and maintaining a vector index. + // Misaccounting this value would end up calling the "end" callback with a value + // not equal to the value passed to the "start" callback. + o.totalCost += sumVectorIndexSize + o.vrs[s.field] = append(o.vrs[s.field], s) + return o, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go index 217582fe1b..afd518dde9 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go @@ -17,6 +17,7 @@ package scorch import ( "bytes" "encoding/binary" + "encoding/json" "fmt" "io" "log" @@ -424,6 +425,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( id: newSegmentID, segment: segment.segment, deleted: nil, // nil since merging handled deletions + stats: nil, }) break } @@ -602,6 +604,18 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, return nil, nil, err } } + + // store segment stats + if segmentSnapshot.stats != nil { + b, err := json.Marshal(segmentSnapshot.stats.Fetch()) + if err != nil { + return nil, nil, err + } + err = snapshotSegmentBucket.Put(boltStatsKey, b) + if err != nil { + return nil, nil, err + } + } } return filenames, newSegmentPaths, nil @@ -634,7 +648,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { // the newly populated boltdb snapshotBucket above if len(newSegmentPaths) > 0 { // now try to open all the new snapshots - newSegments := make(map[uint64]segment.Segment) + newSegments := make(map[uint64]segment.Segment, len(newSegmentPaths)) defer func() { for _, s := range newSegments { if s != nil { @@ -704,6 +718,7 @@ var boltMetaDataKey = []byte{'m'} var boltMetaDataSegmentTypeKey = []byte("type") var boltMetaDataSegmentVersionKey = []byte("version") var boltMetaDataTimeStamp = []byte("timeStamp") +var boltStatsKey = []byte("stats") var TotBytesWrittenKey = []byte("TotBytesWritten") func (s *Scorch) loadFromBolt() error { @@ -858,6 +873,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro rv := &SegmentSnapshot{ segment: segment, cachedDocs: &cachedDocs{cache: nil}, + cachedMeta: &cachedMeta{meta: nil}, } deletedBytes := segmentBucket.Get(boltDeletedKey) if deletedBytes != nil { @@ -872,6 +888,18 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro rv.deleted = deletedBitmap } } + statBytes := segmentBucket.Get(boltStatsKey) + if statBytes != nil { + var statsMap map[string]map[string]uint64 + + err := json.Unmarshal(statBytes, &statsMap) + if err != nil { + _ = segment.Close() + return nil, fmt.Errorf("error reading stat bytes: %v", err) + } + rv.stats = &fieldStats{statMap: statsMap} + } return rv, nil } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go index f30d795e95..2e6435ee01 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go @@ -428,6 +428,8 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { var newSegment segment.Segment var bufBytes uint64 + stats := newFieldStats() + if len(analysisResults) > 0 { newSegment, bufBytes, err = s.segPlugin.New(analysisResults) if err != nil { @@ -438,11 +440,14 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { segB.BytesWritten()) } atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes) + if fsr, ok := newSegment.(segment.FieldStatsReporter); ok { + fsr.UpdateFieldStats(stats) + } } else { atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } - err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback()) + err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback(), stats) if err != nil { if newSegment != nil { _ = newSegment.Close() @@ -462,15 +467,15 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte, persistedCallback index.BatchCallback) error { + internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats) error { // new introduction introduction := &segmentIntroduction{ id: atomic.AddUint64(&s.nextSegmentID, 1), data: newSegment, ids: ids, - obsoletes: make(map[uint64]*roaring.Bitmap), internal: internalOps, + stats: stats, applied: make(chan error), persistedCallback: persistedCallback, } @@ -487,6 +492,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, defer func() { _ = root.DecRef() }() + introduction.obsoletes = make(map[uint64]*roaring.Bitmap, len(root.segment)) + for _, seg := range root.segment { delta, err := seg.segment.DocNumbers(ids) if err != nil { @@ -617,6 +624,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["index_time"] = m["TotIndexTime"] m["term_searchers_started"] = m["TotTermSearchersStarted"] m["term_searchers_finished"] = m["TotTermSearchersFinished"] + m["knn_searches"] = m["TotKNNSearches"] + m["num_bytes_read_at_query_time"] = m["TotBytesReadAtQueryTime"] m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] m["num_bytes_written_at_index_time"] = m["TotBytesWrittenAtIndexTime"] @@ -638,6 +647,20 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] + // calculate the aggregate of all the segments' field stats + aggFieldStats := newFieldStats() + for _, segmentSnapshot := range indexSnapshot.Segments() { + if segmentSnapshot.stats != nil { + aggFieldStats.Aggregate(segmentSnapshot.stats) + } + } + + aggFieldStatsMap := aggFieldStats.Fetch() + for statName, stats := range aggFieldStatsMap { + for fieldName, val := range stats { + m["field:"+fieldName+":"+statName] = val + } + } return m } @@ -762,3 +785,50 @@ func parseToInteger(i interface{}) (int, error) { return 0, fmt.Errorf("expects 
int or float64 value") } } + +// Holds Zap's field level stats at a segment level +type fieldStats struct { + // StatName -> FieldName -> value + statMap map[string]map[string]uint64 +} + +// Add the data into the map after checking if the statname is valid +func (fs *fieldStats) Store(statName, fieldName string, value uint64) { + if _, exists := fs.statMap[statName]; !exists { + fs.statMap[statName] = make(map[string]uint64) + } + fs.statMap[statName][fieldName] = value +} + +// Combine the given stats map with the existing map +func (fs *fieldStats) Aggregate(stats segment.FieldStats) { + + statMap := stats.Fetch() + if statMap == nil { + return + } + for statName, statMap := range statMap { + if _, exists := fs.statMap[statName]; !exists { + fs.statMap[statName] = make(map[string]uint64) + } + for fieldName, val := range statMap { + if _, exists := fs.statMap[statName][fieldName]; !exists { + fs.statMap[statName][fieldName] = 0 + } + fs.statMap[statName][fieldName] += val + } + } +} + +// Returns the stats map +func (fs *fieldStats) Fetch() map[string]map[string]uint64 { + return fs.statMap +} + +// Initializes an empty stats map +func newFieldStats() *fieldStats { + rv := &fieldStats{ + statMap: map[string]map[string]uint64{}, + } + return rv +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go index a84d2d55ff..b3b9ba01f8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go @@ -28,6 +28,7 @@ import ( zapv13 "github.com/blevesearch/zapx/v13" zapv14 "github.com/blevesearch/zapx/v14" zapv15 "github.com/blevesearch/zapx/v15" + zapv16 "github.com/blevesearch/zapx/v16" ) // SegmentPlugin represents the essential functions required by a package to plug in @@ -73,7 +74,8 @@ var defaultSegmentPlugin SegmentPlugin func init() { ResetSegmentPlugins() - RegisterSegmentPlugin(&zapv15.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv16.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv15.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv14.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv13.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv12.ZapPlugin{}, false) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go new file mode 100644 index 0000000000..04a9e0e6d2 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go @@ -0,0 +1,158 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package scorch + +import ( + "bytes" + "context" + "fmt" + "reflect" + + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" + segment_api "github.com/blevesearch/scorch_segment_api/v2" +) + +const VectorSearchSupportedSegmentVersion = 16 + +var reflectStaticSizeIndexSnapshotVectorReader int + +func init() { + var istfr IndexSnapshotVectorReader + reflectStaticSizeIndexSnapshotVectorReader = int(reflect.TypeOf(istfr).Size()) +} + +type IndexSnapshotVectorReader struct { + vector []float32 + field string + k int64 + snapshot *IndexSnapshot + postings []segment_api.VecPostingsList + iterators []segment_api.VecPostingsIterator + segmentOffset int + currPosting segment_api.VecPosting + currID index.IndexInternalID + ctx context.Context +} + +func (i *IndexSnapshotVectorReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr + + len(i.vector) + len(i.field) + len(i.currID) + + for _, entry := range i.postings { + sizeInBytes += entry.Size() + } + + for _, entry := range i.iterators { + sizeInBytes += entry.Size() + } + + if i.currPosting != nil { + sizeInBytes += i.currPosting.Size() + } + + return sizeInBytes +} + +func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( + *index.VectorDoc, error) { + rv := preAlloced + if rv == nil { + rv = &index.VectorDoc{} + } + + for i.segmentOffset < len(i.iterators) { + next, err := i.iterators[i.segmentOffset].Next() + if err != nil { + return nil, err + } + if next != nil { + // make segment number into global number by adding offset + globalOffset := i.snapshot.offsets[i.segmentOffset] + nnum := next.Number() + rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.Score = float64(next.Score()) + + i.currID = rv.ID + i.currPosting = next + + return rv, nil + } + i.segmentOffset++ + } + + return nil, nil +} + +func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, + preAlloced *index.VectorDoc) (*index.VectorDoc, error) { + + if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k) + if err != nil { + return nil, err + } + // close the current term field reader before replacing it with a new one + _ = i.Close() + *i = *(i2.(*IndexSnapshotVectorReader)) + } + + num, err := docInternalToNumber(ID) + if err != nil { + return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) + } + segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) + if segIndex >= len(i.snapshot.segment) { + return nil, fmt.Errorf("computed segment index %d out of bounds %d", + segIndex, len(i.snapshot.segment)) + } + // skip directly to the target segment + i.segmentOffset = segIndex + next, err := i.iterators[i.segmentOffset].Advance(ldocNum) + if err != nil { + return nil, err + } + if next == nil { + // we jumped directly to the segment that should have contained it + // but it wasn't there, so reuse Next() which should correctly + // get the next hit after it (we moved i.segmentOffset) + return i.Next(preAlloced) + } + + if preAlloced == nil { + preAlloced = &index.VectorDoc{} + } + preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + i.snapshot.offsets[segIndex]) + i.currID = preAlloced.ID + i.currPosting = next + return preAlloced, nil +} + +func (i *IndexSnapshotVectorReader) Count() uint64 { + var rv uint64 + for _, posting := range i.postings { + rv += posting.Count() + } + return rv +} + +func (i 
*IndexSnapshotVectorReader) Close() error {
+	// TODO: consider whether there is any scope for recycling here.
+	return nil
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
index 0b76ec7465..1c14af7265 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
@@ -39,6 +39,9 @@ type SegmentSnapshot struct {
 	segment segment.Segment
 	deleted *roaring.Bitmap
 	creator string
+	stats   *fieldStats
+
+	cachedMeta *cachedMeta
 
 	cachedDocs *cachedDocs
 }
@@ -282,3 +285,30 @@ func (c *cachedDocs) visitDoc(localDocNum uint64,
 
 	c.m.Unlock()
 }
+
+// cachedMeta allows the user of this type to record and cache certain
+// segment-specific metadata, so that it can be reused across calls and
+// the same information does not have to be recomputed.
+// For example, searcher creations on the same index snapshot can use this
+// struct to fetch the backing index size, which feeds into the memory
+// usage estimate that decides whether a query should be admitted.
+type cachedMeta struct {
+	m    sync.RWMutex
+	meta map[string]interface{}
+}
+
+func (c *cachedMeta) updateMeta(field string, val interface{}) {
+	c.m.Lock()
+	if c.meta == nil {
+		c.meta = make(map[string]interface{})
+	}
+	c.meta[field] = val
+	c.m.Unlock()
+}
+
+func (c *cachedMeta) fetchMeta(field string) (rv interface{}) {
+	c.m.RLock()
+	rv = c.meta[field]
+	c.m.RUnlock()
+	return rv
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go
new file mode 100644
index 0000000000..9d6f0700e5
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go
@@ -0,0 +1,48 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
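
Reviewer note: cachedMeta above is just a lazily initialized map behind a sync.RWMutex. A minimal stand-alone sketch of the intended usage (the key name "backing_index_size" is hypothetical):

package main

import (
	"fmt"
	"sync"
)

// stand-alone copy of the cachedMeta pattern for illustration
type cachedMeta struct {
	m    sync.RWMutex
	meta map[string]interface{}
}

func (c *cachedMeta) updateMeta(field string, val interface{}) {
	c.m.Lock()
	defer c.m.Unlock()
	if c.meta == nil {
		c.meta = make(map[string]interface{})
	}
	c.meta[field] = val
}

func (c *cachedMeta) fetchMeta(field string) interface{} {
	c.m.RLock()
	defer c.m.RUnlock()
	return c.meta[field]
}

func main() {
	var cm cachedMeta
	// first searcher on a snapshot computes and caches the value;
	// later searchers read it back instead of recomputing it
	if cm.fetchMeta("backing_index_size") == nil { // hypothetical key
		cm.updateMeta("backing_index_size", uint64(1<<20))
	}
	fmt.Println(cm.fetchMeta("backing_index_size")) // 1048576
}
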
+
+//go:build vectors
+// +build vectors
+
+package scorch
+
+import (
+	"context"
+
+	index "github.com/blevesearch/bleve_index_api"
+	segment_api "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
+	field string, k int64) (
+	index.VectorReader, error) {
+
+	rv := &IndexSnapshotVectorReader{
+		vector:   vector,
+		field:    field,
+		k:        k,
+		snapshot: is,
+	}
+
+	if rv.postings == nil {
+		rv.postings = make([]segment_api.VecPostingsList, len(is.segment))
+	}
+	if rv.iterators == nil {
+		rv.iterators = make([]segment_api.VecPostingsIterator, len(is.segment))
+	}
+
+	// postings and iterators are populated within OptimizeVR's Finish()
+
+	return rv, nil
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
index dc74d9f29e..269ae2f639 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
@@ -51,6 +51,8 @@ type Stats struct {
 	TotTermSearchersStarted  uint64
 	TotTermSearchersFinished uint64
 
+	TotKNNSearches uint64
+
 	TotEventTriggerStarted   uint64
 	TotEventTriggerCompleted uint64
 
diff --git a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
index a73dd6b8fe..3c7cdcd32d 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
@@ -21,6 +21,8 @@ import (
 
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search"
+	"github.com/blevesearch/bleve/v2/search/collector"
+	"github.com/blevesearch/bleve/v2/search/query"
 	index "github.com/blevesearch/bleve_index_api"
 )
 
@@ -160,13 +162,92 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 	if len(i.indexes) < 1 {
 		return nil, ErrorAliasEmpty
 	}
+	if _, ok := ctx.Value(search.PreSearchKey).(bool); ok {
+		// since preSearchKey is set, it means that the request
+		// is being executed as part of a preSearch, which
+		// indicates that this index alias is set as an Index
+		// in another alias, so we need to run a preSearch here
+		// and NOT a real search
+		return preSearchDataSearch(ctx, req, i.indexes...)
+	}
+
+	// at this point we know we are doing a real search
+	// either after a preSearch is done, or directly
+	// on the alias
+
+	// check if the request carries preSearchData, which indicates that
+	// the request has already been preSearched, so the preSearch step
+	// can be skipped; in that case an optional function is called to
+	// redistribute the preSearchData to the individual indexes,
+	// if necessary
+	var preSearchData map[string]map[string]interface{}
+	if req.PreSearchData != nil {
+		if requestHasKNN(req) {
+			var err error
+			preSearchData, err = redistributeKNNPreSearchData(req, i.indexes)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
 
 	// short circuit the simple case
 	if len(i.indexes) == 1 {
+		if preSearchData != nil {
+			req.PreSearchData = preSearchData[i.indexes[0].Name()]
+		}
 		return i.indexes[0].SearchInContext(ctx, req)
 	}
 
-	return MultiSearch(ctx, req, i.indexes...)
+ // at this stage we know we have multiple indexes + // check if preSearchData needs to be gathered from all indexes + // before executing the query + var err error + // only perform preSearch if + // - the request does not already have preSearchData + // - the request requires preSearch + var preSearchDuration time.Duration + var sr *SearchResult + if req.PreSearchData == nil && preSearchRequired(req) { + searchStart := time.Now() + preSearchResult, err := preSearch(ctx, req, i.indexes...) + if err != nil { + return nil, err + } + // check if the preSearch result has any errors and if so + // return the search result as is without executing the query + // so that the errors are not lost + if preSearchResult.Status.Failed > 0 || len(preSearchResult.Status.Errors) > 0 { + return preSearchResult, nil + } + // finalize the preSearch result now + finalizePreSearchResult(req, preSearchResult) + + // if there are no errors, then merge the data in the preSearch result + // and construct the preSearchData to be used in the actual search + // if the request is satisfied by the preSearch result, then we can + // directly return the preSearch result as the final result + if requestSatisfiedByPreSearch(req) { + sr = finalizeSearchResult(req, preSearchResult) + // no need to run the 2nd phase MultiSearch(..) + } else { + preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes) + if err != nil { + return nil, err + } + } + preSearchDuration = time.Since(searchStart) + } + + // check if search result was generated as part of preSearch itself + if sr == nil { + sr, err = MultiSearch(ctx, req, preSearchData, i.indexes...) + if err != nil { + return nil, err + } + } + sr.Took += preSearchDuration + return sr, nil } func (i *indexAliasImpl) Fields() ([]string, error) { @@ -429,22 +510,8 @@ func (i *indexAliasImpl) Swap(in, out []Index) { // the actual final results. // Perhaps that part needs to be optional, // could be slower in remote usages. -func createChildSearchRequest(req *SearchRequest) *SearchRequest { - rv := SearchRequest{ - Query: req.Query, - Size: req.Size + req.From, - From: 0, - Highlight: req.Highlight, - Fields: req.Fields, - Facets: req.Facets, - Explain: req.Explain, - Sort: req.Sort.Copy(), - IncludeLocations: req.IncludeLocations, - Score: req.Score, - SearchAfter: req.SearchAfter, - SearchBefore: req.SearchBefore, - } - return &rv +func createChildSearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest { + return copySearchRequest(req, preSearchData) } type asyncSearchResult struct { @@ -453,9 +520,195 @@ type asyncSearchResult struct { Err error } +func preSearchRequired(req *SearchRequest) bool { + return requestHasKNN(req) +} + +func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { + // create a dummy request with a match none query + // since we only care about the preSearchData in PreSearch + dummyRequest := &SearchRequest{ + Query: query.NewMatchNoneQuery(), + } + newCtx := context.WithValue(ctx, search.PreSearchKey, true) + if requestHasKNN(req) { + addKnnToDummyRequest(dummyRequest, req) + } + return preSearchDataSearch(newCtx, dummyRequest, indexes...) 
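+	// Reviewer note (annotation, not upstream text): the MatchNone
+	// dummy query above keeps child indexes from scoring regular hits;
+	// with PreSearchKey set on the context, each child only runs its
+	// KNN collector and returns those hits for merging.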
+} + +// if the request is satisfied by just the preSearch result, +// finalize the result and return it directly without +// performing multi search +func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *SearchResult { + if preSearchResult == nil { + return nil + } + + // global values across all hits irrespective of pagination settings + preSearchResult.Total = uint64(preSearchResult.Hits.Len()) + maxScore := float64(0) + for i, hit := range preSearchResult.Hits { + // since we are now using the preSearch result as the final result + // we can discard the indexNames from the hits as they are no longer + // relevant. + hit.IndexNames = nil + if hit.Score > maxScore { + maxScore = hit.Score + } + hit.HitNumber = uint64(i) + } + preSearchResult.MaxScore = maxScore + // now apply pagination settings + var reverseQueryExecution bool + if req.SearchBefore != nil { + reverseQueryExecution = true + req.Sort.Reverse() + req.SearchAfter = req.SearchBefore + } + if req.SearchAfter != nil { + preSearchResult.Hits = collector.FilterHitsBySearchAfter(preSearchResult.Hits, req.Sort, req.SearchAfter) + } + preSearchResult.Hits = hitsInCurrentPage(req, preSearchResult.Hits) + if reverseQueryExecution { + // reverse the sort back to the original + req.Sort.Reverse() + // resort using the original order + mhs := newSearchHitSorter(req.Sort, preSearchResult.Hits) + req.SortFunc()(mhs) + req.SearchAfter = nil + } + + if req.Explain { + preSearchResult.Request = req + } + return preSearchResult +} + +func requestSatisfiedByPreSearch(req *SearchRequest) bool { + if requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req) { + return true + } + return false +} + +func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { + mergedOut := make(map[string]map[string]interface{}, len(indexes)) + for _, index := range indexes { + mergedOut[index.Name()] = make(map[string]interface{}) + } + var err error + if requestHasKNN(req) { + mergedOut, err = constructKnnPreSearchData(mergedOut, preSearchResult, indexes) + if err != nil { + return nil, err + } + } + return mergedOut, nil +} + +func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { + asyncResults := make(chan *asyncSearchResult, len(indexes)) + // run search on each index in separate go routine + var waitGroup sync.WaitGroup + var searchChildIndex = func(in Index, childReq *SearchRequest) { + rv := asyncSearchResult{Name: in.Name()} + rv.Result, rv.Err = in.SearchInContext(ctx, childReq) + asyncResults <- &rv + waitGroup.Done() + } + waitGroup.Add(len(indexes)) + for _, in := range indexes { + go searchChildIndex(in, createChildSearchRequest(req, nil)) + } + // on another go routine, close after finished + go func() { + waitGroup.Wait() + close(asyncResults) + }() + // the final search result to be returned after combining the preSearch results + var sr *SearchResult + // the preSearch result processor + var prp preSearchResultProcessor + // error map + indexErrors := make(map[string]error) + for asr := range asyncResults { + if asr.Err == nil { + // a valid preSearch result + if prp == nil { + // first valid preSearch result + // create a new preSearch result processor + prp = createPreSearchResultProcessor(req) + } + prp.add(asr.Result, asr.Name) + if sr == nil { + // first result + sr = &SearchResult{ + Status: asr.Result.Status, + Cost: asr.Result.Cost, + } + } else { + // merge with previous + 
sr.Status.Merge(asr.Result.Status) + sr.Cost += asr.Result.Cost + } + } else { + indexErrors[asr.Name] = asr.Err + } + } + // handle case where no results were successful + if sr == nil { + sr = &SearchResult{ + Status: &SearchStatus{ + Errors: make(map[string]error), + }, + } + } + // in preSearch, partial results are not allowed as it can lead to + // the real search giving incorrect results, and hence the search + // result is not populated with any of the processed data from + // the preSearch result processor if there are any errors + // or the preSearch result status has any failures + if len(indexErrors) > 0 || sr.Status.Failed > 0 { + if sr.Status.Errors == nil { + sr.Status.Errors = make(map[string]error) + } + for indexName, indexErr := range indexErrors { + sr.Status.Errors[indexName] = indexErr + sr.Status.Total++ + sr.Status.Failed++ + } + } else { + prp.finalize(sr) + } + return sr, nil +} + +// hitsInCurrentPage returns the hits in the current page +// using the From and Size parameters in the request +func hitsInCurrentPage(req *SearchRequest, hits []*search.DocumentMatch) []*search.DocumentMatch { + sortFunc := req.SortFunc() + // sort all hits with the requested order + if len(req.Sort) > 0 { + sorter := newSearchHitSorter(req.Sort, hits) + sortFunc(sorter) + } + // now skip over the correct From + if req.From > 0 && len(hits) > req.From { + hits = hits[req.From:] + } else if req.From > 0 { + hits = search.DocumentMatchCollection{} + } + // now trim to the correct size + if req.Size > 0 && len(hits) > req.Size { + hits = hits[0:req.Size] + } + return hits +} + // MultiSearch executes a SearchRequest across multiple Index objects, // then merges the results. The indexes must honor any ctx deadline. -func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { +func MultiSearch(ctx context.Context, req *SearchRequest, preSearchData map[string]map[string]interface{}, indexes ...Index) (*SearchResult, error) { searchStart := time.Now() asyncResults := make(chan *asyncSearchResult, len(indexes)) @@ -480,7 +733,11 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se waitGroup.Add(len(indexes)) for _, in := range indexes { - go searchChildIndex(in, createChildSearchRequest(req)) + var payload map[string]interface{} + if preSearchData != nil { + payload = preSearchData[in.Name()] + } + go searchChildIndex(in, createChildSearchRequest(req, payload)) } // on another go routine, close after finished @@ -518,24 +775,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se } } - sortFunc := req.SortFunc() - // sort all hits with the requested order - if len(req.Sort) > 0 { - sorter := newSearchHitSorter(req.Sort, sr.Hits) - sortFunc(sorter) - } - - // now skip over the correct From - if req.From > 0 && len(sr.Hits) > req.From { - sr.Hits = sr.Hits[req.From:] - } else if req.From > 0 { - sr.Hits = search.DocumentMatchCollection{} - } - - // now trim to the correct size - if req.Size > 0 && len(sr.Hits) > req.Size { - sr.Hits = sr.Hits[0:req.Size] - } + sr.Hits = hitsInCurrentPage(req, sr.Hits) // fix up facets for name, fr := range req.Facets { @@ -547,14 +787,16 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, sr.Hits) - sortFunc(mhs) + req.SortFunc()(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil } // fix up original request - 
sr.Request = req + if req.Explain { + sr.Request = req + } searchDuration := time.Since(searchStart) sr.Took = searchDuration diff --git a/vendor/github.com/blevesearch/bleve/v2/index_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_impl.go index d5f34a2a3d..a52547352b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_impl.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_impl.go @@ -433,6 +433,25 @@ func memNeededForSearch(req *SearchRequest, return uint64(estimate) } +func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader index.IndexReader) (*SearchResult, error) { + var knnHits []*search.DocumentMatch + var err error + if requestHasKNN(req) { + knnHits, err = i.runKnnCollector(ctx, req, reader, true) + if err != nil { + return nil, err + } + } + + return &SearchResult{ + Status: &SearchStatus{ + Total: 1, + Successful: 1, + }, + Hits: knnHits, + }, nil +} + // SearchInContext executes a search request operation within the provided // Context. Returns a SearchResult object or an error. func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { @@ -445,6 +464,25 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return nil, ErrorIndexClosed } + // open a reader for this search + indexReader, err := i.i.Reader() + if err != nil { + return nil, fmt.Errorf("error opening index reader %v", err) + } + defer func() { + if cerr := indexReader.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + if _, ok := ctx.Value(search.PreSearchKey).(bool); ok { + preSearchResult, err := i.preSearch(ctx, req, indexReader) + if err != nil { + return nil, err + } + return preSearchResult, nil + } + var reverseQueryExecution bool if req.SearchBefore != nil { reverseQueryExecution = true @@ -460,16 +498,31 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr coll = collector.NewTopNCollector(req.Size, req.From, req.Sort) } - // open a reader for this search - indexReader, err := i.i.Reader() - if err != nil { - return nil, fmt.Errorf("error opening index reader %v", err) - } - defer func() { - if cerr := indexReader.Close(); err == nil && cerr != nil { - err = cerr + var knnHits []*search.DocumentMatch + var ok bool + var skipKnnCollector bool + if req.PreSearchData != nil { + for k, v := range req.PreSearchData { + switch k { + case search.KnnPreSearchDataKey: + if v != nil { + knnHits, ok = v.([]*search.DocumentMatch) + if !ok { + return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch") + } + } + skipKnnCollector = true + } } - }() + } + if !skipKnnCollector && requestHasKNN(req) { + knnHits, err = i.runKnnCollector(ctx, req, indexReader, false) + if err != nil { + return nil, err + } + } + + setKnnHitsInCollector(knnHits, req, coll) // This callback and variable handles the tracking of bytes read // 1. 
as part of creation of tfr and its Next() calls which is @@ -540,14 +593,14 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if dateTimeParser == nil { return nil, fmt.Errorf("no date time parser named `%s` registered", dateTimeParserName) } - start, end, startLayout, endLayout, err := dr.ParseDates(dateTimeParser) + start, end, err := dr.ParseDates(dateTimeParser) if err != nil { return nil, fmt.Errorf("ParseDates err: %v, using date time parser named %s", err, dateTimeParserName) } if start.IsZero() && end.IsZero() { return nil, fmt.Errorf("date range query must specify either start, end or both for date range name '%s'", dr.Name) } - facetBuilder.AddRange(dr.Name, start, end, startLayout, endLayout) + facetBuilder.AddRange(dr.Name, start, end) } facetsBuilder.Add(facetName, facetBuilder) } else { @@ -605,7 +658,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr var storedFieldsCost uint64 for _, hit := range hits { - if i.name != "" { + // KNN documents will already have their Index value set as part of the knn collector output + // so check if the index is empty and set it to the current index name + if i.name != "" && hit.Index == "" { hit.Index = i.name } err, storedFieldsBytes := LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter) @@ -638,18 +693,23 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.SearchAfter = nil } - return &SearchResult{ + rv := &SearchResult{ Status: &SearchStatus{ Total: 1, Successful: 1, }, - Request: req, Hits: hits, Total: coll.Total(), MaxScore: coll.MaxScore(), Took: searchDuration, Facets: coll.FacetResults(), - }, nil + } + + if req.Explain { + rv.Request = req + } + + return rv, nil } func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, @@ -658,9 +718,9 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, var totalStoredFieldsBytes uint64 if len(req.Fields) > 0 || highlighter != nil { doc, err := r.Document(hit.ID) - totalStoredFieldsBytes = doc.StoredFieldsBytes() if err == nil && doc != nil { - if len(req.Fields) > 0 { + if len(req.Fields) > 0 && hit.Fields == nil { + totalStoredFieldsBytes = doc.StoredFieldsBytes() fieldsToLoad := deDuplicate(req.Fields) for _, f := range fieldsToLoad { doc.VisitFields(func(docF index.Field) { diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go index aacaa0a551..73bb124db2 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go @@ -50,7 +50,8 @@ type DocumentMapping struct { StructTagKey string `json:"struct_tag_key,omitempty"` } -func (dm *DocumentMapping) Validate(cache *registry.Cache) error { +func (dm *DocumentMapping) Validate(cache *registry.Cache, + parentName string, fieldAliasCtx map[string]*FieldMapping) error { var err error if dm.DefaultAnalyzer != "" { _, err := cache.AnalyzerNamed(dm.DefaultAnalyzer) @@ -58,8 +59,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache) error { return err } } - for _, property := range dm.Properties { - err = property.Validate(cache) + for propertyName, property := range dm.Properties { + newParent := propertyName + if parentName != "" { + newParent = fmt.Sprintf("%s.%s", parentName, propertyName) + } + err = property.Validate(cache, newParent, fieldAliasCtx) if err != nil { return err } @@ -77,15 +82,25 @@ func (dm *DocumentMapping) 
Validate(cache *registry.Cache) error { return err } } - switch field.Type { - case "text", "datetime", "number", "boolean", "geopoint", "geoshape", "IP": - default: - return fmt.Errorf("unknown field type: '%s'", field.Type) + + err := validateFieldMapping(field, parentName, fieldAliasCtx) + if err != nil { + return err } } return nil } +func validateFieldType(field *FieldMapping) error { + switch field.Type { + case "text", "datetime", "number", "boolean", "geopoint", "geoshape", "IP": + return nil + default: + return fmt.Errorf("field: '%s', unknown field type: '%s'", + field.Name, field.Type) + } +} + // analyzerNameForPath attempts to first find the field // described by this path, then returns the analyzer // configured for that field @@ -141,15 +156,20 @@ func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping { return nil } -// documentMappingForPath returns the EXACT and closest matches for a sub +// documentMappingForPathElements returns the EXACT and closest matches for a sub // document or for an explicitly mapped field; the closest most specific // document mapping could be one that matches part of the provided path. -func (dm *DocumentMapping) documentMappingForPath(path string) ( +func (dm *DocumentMapping) documentMappingForPathElements(pathElements []string) ( *DocumentMapping, *DocumentMapping) { - pathElements := decodePath(path) + var pathElementsCopy []string + if len(pathElements) == 0 { + pathElementsCopy = []string{""} + } else { + pathElementsCopy = pathElements + } current := dm OUTER: - for i, pathElement := range pathElements { + for i, pathElement := range pathElementsCopy { if subDocMapping, exists := current.Properties[pathElement]; exists { current = subDocMapping continue OUTER @@ -157,7 +177,7 @@ OUTER: // no subDocMapping matches this pathElement // only if this is the last element check for field name - if i == len(pathElements)-1 { + if i == len(pathElementsCopy)-1 { for _, field := range current.Fields { if field.Name == pathElement { break @@ -170,6 +190,15 @@ OUTER: return current, current } +// documentMappingForPath returns the EXACT and closest matches for a sub +// document or for an explicitly mapped field; the closest most specific +// document mapping could be one that matches part of the provided path. +func (dm *DocumentMapping) documentMappingForPath(path string) ( + *DocumentMapping, *DocumentMapping) { + pathElements := decodePath(path) + return dm.documentMappingForPathElements(pathElements) +} + // NewDocumentMapping returns a new document mapping // with all the default values. 
func NewDocumentMapping() *DocumentMapping { @@ -388,9 +417,8 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } func (dm *DocumentMapping) processProperty(property interface{}, path []string, indexes []uint64, context *walkContext) { - pathString := encodePath(path) // look to see if there is a mapping for this field - subDocMapping, closestDocMapping := dm.documentMappingForPath(pathString) + subDocMapping, closestDocMapping := dm.documentMappingForPathElements(path) // check to see if we even need to do further processing if subDocMapping != nil && !subDocMapping.Enabled { @@ -402,6 +430,8 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, // cannot do anything with the zero value return } + + pathString := encodePath(path) propertyType := propertyValue.Type() switch propertyType.Kind() { case reflect.String: @@ -502,9 +532,20 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, dm.walkDocument(property, path, indexes, context) } case reflect.Map, reflect.Slice: + var isPropertyVector bool + var isPropertyVectorInitialized bool if subDocMapping != nil { for _, fieldMapping := range subDocMapping.Fields { switch fieldMapping.Type { + case "vector": + processed := fieldMapping.processVector(property, pathString, path, + indexes, context) + if !isPropertyVectorInitialized { + isPropertyVector = processed + isPropertyVectorInitialized = true + } else { + isPropertyVector = isPropertyVector && processed + } case "geopoint": fieldMapping.processGeoPoint(property, pathString, path, indexes, context) case "IP": @@ -517,7 +558,9 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, } } } - dm.walkDocument(property, path, indexes, context) + if !isPropertyVector { + dm.walkDocument(property, path, indexes, context) + } case reflect.Ptr: if !propertyValue.IsNil() { switch property := property.(type) { diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go index 82d51f317f..f4339b3845 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go @@ -69,6 +69,17 @@ type FieldMapping struct { // the processing of freq/norm details when the default score based relevancy // isn't needed. SkipFreqNorm bool `json:"skip_freq_norm,omitempty"` + + // Dimensionality of the vector + Dims int `json:"dims,omitempty"` + + // Similarity is the similarity algorithm used for scoring + // vector fields. 
+ // See: index.DefaultSimilarityMetric & index.SupportedSimilarityMetrics + Similarity string `json:"similarity,omitempty"` + + // Applicable to vector fields only - optimization string + VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"` } // NewTextFieldMapping returns a default field mapping for text @@ -448,6 +459,21 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "dims": + err := json.Unmarshal(v, &fm.Dims) + if err != nil { + return err + } + case "similarity": + err := json.Unmarshal(v, &fm.Similarity) + if err != nil { + return err + } + case "vector_index_optimized_for": + err := json.Unmarshal(v, &fm.VectorIndexOptimizedFor) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go index 0de4147a47..171ee1a728 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go @@ -174,12 +174,14 @@ func (im *IndexMappingImpl) Validate() error { if err != nil { return err } - err = im.DefaultMapping.Validate(im.cache) + + fieldAliasCtx := make(map[string]*FieldMapping) + err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { return err } for _, docMapping := range im.TypeMapping { - err = docMapping.Validate(im.cache) + err = docMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { return err } @@ -431,6 +433,33 @@ func (im *IndexMappingImpl) FieldAnalyzer(field string) string { return im.AnalyzerNameForPath(field) } +// FieldMappingForPath returns the mapping for a specific field 'path'. +func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping { + if im.TypeMapping != nil { + for _, v := range im.TypeMapping { + for field, property := range v.Properties { + for _, v1 := range property.Fields { + if field == path { + // Return field mapping if the name matches the path param. + return *v1 + } + } + } + } + } + + for field, property := range im.DefaultMapping.Properties { + for _, v1 := range property.Fields { + if field == path { + // Return field mapping if the name matches the path param. + return *v1 + } + } + } + + return FieldMapping{} +} + // wrapper to satisfy new interface func (im *IndexMappingImpl) DefaultSearchField() string { diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go index a3e5a54e05..cbfc98faa0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go @@ -55,4 +55,6 @@ type IndexMapping interface { AnalyzerNameForPath(path string) string AnalyzerNamed(name string) analysis.Analyzer + + FieldMappingForPath(path string) FieldMapping } diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go new file mode 100644 index 0000000000..f9f35f57cc --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go @@ -0,0 +1,35 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !vectors +// +build !vectors + +package mapping + +func NewVectorFieldMapping() *FieldMapping { + return nil +} + +func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, + pathString string, path []string, indexes []uint64, context *walkContext) bool { + return false +} + +// ----------------------------------------------------------------------------- +// document validation functions + +func validateFieldMapping(field *FieldMapping, parentName string, + fieldAliasCtx map[string]*FieldMapping) error { + return validateFieldType(field) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go new file mode 100644 index 0000000000..a0b7126089 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go @@ -0,0 +1,220 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package mapping + +import ( + "fmt" + "reflect" + + "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" +) + +// Min and Max allowed dimensions for a vector field +const ( + MinVectorDims = 1 + MaxVectorDims = 2048 +) + +func NewVectorFieldMapping() *FieldMapping { + return &FieldMapping{ + Type: "vector", + Store: false, + Index: true, + IncludeInAll: false, + DocValues: false, + SkipFreqNorm: true, + } +} + +// validate and process a flat vector +func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) { + if vecV.Len() != dims { + return nil, false + } + + rv := make([]float32, dims) + for i := 0; i < vecV.Len(); i++ { + item := vecV.Index(i) + if !item.CanInterface() { + return nil, false + } + itemI := item.Interface() + itemFloat, ok := util.ExtractNumericValFloat32(itemI) + if !ok { + return nil, false + } + rv[i] = itemFloat + } + + return rv, true +} + +// validate and process a vector +// max supported depth of nesting is 2 ([][]float32) +func processVector(vecI interface{}, dims int) ([]float32, bool) { + vecV := reflect.ValueOf(vecI) + if !vecV.IsValid() || vecV.Kind() != reflect.Slice || vecV.Len() == 0 { + return nil, false + } + + // Let's examine the first element (head) of the vector. + // If head is a slice, then vector is nested, otherwise flat. 
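+	// Reviewer note: e.g. with dims = 2, the nested vector
+	// [][]float32{{0.1, 0.2}, {0.3, 0.4}} flattens to
+	// []float32{0.1, 0.2, 0.3, 0.4}, while a flat []float32{0.1, 0.2}
+	// is handled by processFlatVector above.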
+ head := vecV.Index(0) + if !head.CanInterface() { + return nil, false + } + headI := head.Interface() + headV := reflect.ValueOf(headI) + if !headV.IsValid() { + return nil, false + } + if headV.Kind() != reflect.Slice { // vector is flat + return processFlatVector(vecV, dims) + } + + // # process nested vector + + // pre-allocate memory for the flattened vector + // so that we can use copy() later + rv := make([]float32, dims*vecV.Len()) + + for i := 0; i < vecV.Len(); i++ { + subVec := vecV.Index(i) + if !subVec.CanInterface() { + return nil, false + } + subVecI := subVec.Interface() + subVecV := reflect.ValueOf(subVecI) + if !subVecV.IsValid() { + return nil, false + } + + if subVecV.Kind() != reflect.Slice { + return nil, false + } + + flatVector, ok := processFlatVector(subVecV, dims) + if !ok { + return nil, false + } + + copy(rv[i*dims:(i+1)*dims], flatVector) + } + + return rv, true +} + +func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, + pathString string, path []string, indexes []uint64, context *walkContext) bool { + vector, ok := processVector(propertyMightBeVector, fm.Dims) + // Don't add field to document if vector is invalid + if !ok { + return false + } + + fieldName := getFieldName(pathString, path, fm) + options := fm.Options() + field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector, + fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options) + context.doc.AddField(field) + + // "_all" composite field is not applicable for vector field + context.excludedFromAll = append(context.excludedFromAll, fieldName) + return true +} + +// ----------------------------------------------------------------------------- +// document validation functions + +func validateFieldMapping(field *FieldMapping, parentName string, + fieldAliasCtx map[string]*FieldMapping) error { + switch field.Type { + case "vector": + return validateVectorFieldAlias(field, parentName, fieldAliasCtx) + default: // non-vector field + return validateFieldType(field) + } +} + +func validateVectorFieldAlias(field *FieldMapping, parentName string, + fieldAliasCtx map[string]*FieldMapping) error { + + if field.Name == "" { + field.Name = parentName + } + + if field.Similarity == "" { + field.Similarity = index.DefaultSimilarityMetric + } + + if field.VectorIndexOptimizedFor == "" { + field.VectorIndexOptimizedFor = index.DefaultIndexOptimization + } + if _, exists := index.SupportedVectorIndexOptimizations[field.VectorIndexOptimizedFor]; !exists { + // if an unsupported config is provided, override to default + field.VectorIndexOptimizedFor = index.DefaultIndexOptimization + } + + // following fields are not applicable for vector + // thus, we set them to default values + field.IncludeInAll = false + field.IncludeTermVectors = false + field.Store = false + field.DocValues = false + field.SkipFreqNorm = true + + // # If alias is present, validate the field options as per the alias + // note: reading from a nil map is safe + if fieldAlias, ok := fieldAliasCtx[field.Name]; ok { + if field.Dims != fieldAlias.Dims { + return fmt.Errorf("field: '%s', invalid alias "+ + "(different dimensions %d and %d)", fieldAlias.Name, field.Dims, + fieldAlias.Dims) + } + + if field.Similarity != fieldAlias.Similarity { + return fmt.Errorf("field: '%s', invalid alias "+ + "(different similarity values %s and %s)", fieldAlias.Name, + field.Similarity, fieldAlias.Similarity) + } + + return nil + } + + // # Validate field options + + if field.Dims < MinVectorDims || field.Dims > 
MaxVectorDims {
+		return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+
+			" value should be in range [%d, %d]", field.Name, field.Dims,
+			MinVectorDims, MaxVectorDims)
+	}
+
+	if _, ok := index.SupportedSimilarityMetrics[field.Similarity]; !ok {
+		return fmt.Errorf("field: '%s', invalid similarity "+
+			"metric: '%s', valid metrics are: %+v", field.Name, field.Similarity,
+			reflect.ValueOf(index.SupportedSimilarityMetrics).MapKeys())
+	}
+
+	if fieldAliasCtx != nil { // writing to a nil map is unsafe
+		fieldAliasCtx[field.Name] = field
+	}
+
+	return nil
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go b/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go
new file mode 100644
index 0000000000..594313861e
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go
@@ -0,0 +1,24 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package bleve
+
+import "github.com/blevesearch/bleve/v2/mapping"
+
+func NewVectorFieldMapping() *mapping.FieldMapping {
+	return mapping.NewVectorFieldMapping()
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/pre_search.go b/vendor/github.com/blevesearch/bleve/v2/pre_search.go
new file mode 100644
index 0000000000..c8c55bfbc5
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/pre_search.go
@@ -0,0 +1,59 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
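
Reviewer note: a minimal sketch of wiring up the new vector mapping options, assuming a build with the vectors tag (without it, NewVectorFieldMapping returns nil, per mapping_no_vectors.go above). The field name, the dimension, and the "l2_norm" metric are illustrative; valid metrics are whatever index.SupportedSimilarityMetrics contains:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/mapping"
)

func main() {
	// hypothetical vector field: 3 dimensions, L2 distance
	vecField := mapping.NewVectorFieldMapping() // requires -tags vectors
	vecField.Dims = 3               // checked against [MinVectorDims, MaxVectorDims]
	vecField.Similarity = "l2_norm" // checked against index.SupportedSimilarityMetrics

	docMapping := mapping.NewDocumentMapping()
	docMapping.AddFieldMappingsAt("embedding", vecField)

	indexMapping := mapping.NewIndexMapping()
	indexMapping.DefaultMapping = docMapping

	// Validate() walks the mapping tree; for vector fields it runs the
	// validateVectorFieldAlias checks added in this change
	fmt.Println(indexMapping.Validate())
}
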
+ +package bleve + +// A preSearchResultProcessor processes the data in +// the preSearch result from multiple +// indexes in an alias and merges them together to +// create the final preSearch result +type preSearchResultProcessor interface { + // adds the preSearch result to the processor + add(*SearchResult, string) + // updates the final search result with the finalized + // data from the processor + finalize(*SearchResult) +} + +type knnPreSearchResultProcessor struct { + addFn func(sr *SearchResult, indexName string) + finalizeFn func(sr *SearchResult) +} + +func (k *knnPreSearchResultProcessor) add(sr *SearchResult, indexName string) { + if k.addFn != nil { + k.addFn(sr, indexName) + } +} + +func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) { + if k.finalizeFn != nil { + k.finalizeFn(sr) + } +} + +// ----------------------------------------------------------------------------- + +func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) { + if requestHasKNN(req) { + preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) + } +} + +func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor { + if requestHasKNN(req) { + return newKnnPreSearchResultProcessor(req) + } + return &knnPreSearchResultProcessor{} // equivalent to nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search.go b/vendor/github.com/blevesearch/bleve/v2/search.go index 8ca0310fbf..7861d24b8f 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search.go @@ -15,7 +15,6 @@ package bleve import ( - "encoding/json" "fmt" "reflect" "sort" @@ -32,19 +31,19 @@ import ( "github.com/blevesearch/bleve/v2/util" ) -const defaultDateTimeParser = optional.Name +var reflectStaticSizeSearchResult int +var reflectStaticSizeSearchStatus int + +func init() { + var sr SearchResult + reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) + var ss SearchStatus + reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) +} var cache = registry.NewCache() -var ( - reflectStaticSizeSearchResult int - reflectStaticSizeSearchStatus int -) - -func init() { - reflectStaticSizeSearchResult = int(reflect.TypeOf(SearchResult{}).Size()) - reflectStaticSizeSearchStatus = int(reflect.TypeOf(SearchStatus{}).Size()) -} +const defaultDateTimeParser = optional.Name type dateTimeRange struct { Name string `json:"name,omitempty"` @@ -55,28 +54,24 @@ type dateTimeRange struct { endString *string } -func (dr *dateTimeRange) ParseDates(dateTimeParser analysis.DateTimeParser) (start, end time.Time, startLayout, endLayout string, err error) { +func (dr *dateTimeRange) ParseDates(dateTimeParser analysis.DateTimeParser) (start, end time.Time, err error) { start = dr.Start - startLayout = time.RFC3339Nano if dr.Start.IsZero() && dr.startString != nil { - s, layout, parseError := dateTimeParser.ParseDateTime(*dr.startString) + s, _, parseError := dateTimeParser.ParseDateTime(*dr.startString) if parseError != nil { - return start, end, startLayout, endLayout, fmt.Errorf("error parsing start date '%s' for date range name '%s': %v", *dr.startString, dr.Name, parseError) + return start, end, fmt.Errorf("error parsing start date '%s' for date range name '%s': %v", *dr.startString, dr.Name, parseError) } start = s - startLayout = layout } end = dr.End - endLayout = time.RFC3339Nano if dr.End.IsZero() && dr.endString != nil { - e, layout, parseError := dateTimeParser.ParseDateTime(*dr.endString) + e, _, parseError := 
dateTimeParser.ParseDateTime(*dr.endString) if parseError != nil { - return start, end, startLayout, endLayout, fmt.Errorf("error parsing end date '%s' for date range name '%s': %v", *dr.endString, dr.Name, parseError) + return start, end, fmt.Errorf("error parsing end date '%s' for date range name '%s': %v", *dr.endString, dr.Name, parseError) } end = e - endLayout = layout } - return start, end, startLayout, endLayout, err + return start, end, err } func (dr *dateTimeRange) UnmarshalJSON(input []byte) error { @@ -187,7 +182,7 @@ func (fr *FacetRequest) Validate() error { if dr.DateTimeParser == "" { // cannot parse the date range dates as the defaultDateTimeParser is overridden // so perform this validation at query time - start, end, _, _, err := dr.ParseDates(dateTimeParser) + start, end, err := dr.ParseDates(dateTimeParser) if err != nil { return fmt.Errorf("ParseDates err: %v, using date time parser named %s", err, defaultDateTimeParser) } @@ -285,51 +280,10 @@ func (h *HighlightRequest) AddField(field string) { h.Fields = append(h.Fields, field) } -// A SearchRequest describes all the parameters -// needed to search the index. -// Query is required. -// Size/From describe how much and which part of the -// result set to return. -// Highlight describes optional search result -// highlighting. -// Fields describes a list of field values which -// should be retrieved for result documents, provided they -// were stored while indexing. -// Facets describe the set of facets to be computed. -// Explain triggers inclusion of additional search -// result score explanations. -// Sort describes the desired order for the results to be returned. -// Score controls the kind of scoring performed -// SearchAfter supports deep paging by providing a minimum sort key -// SearchBefore supports deep paging by providing a maximum sort key -// sortFunc specifies the sort implementation to use for sorting results. -// -// A special field named "*" can be used to return all fields. 
-type SearchRequest struct { - ClientContextID string `json:"client_context_id,omitempty"` - Query query.Query `json:"query"` - Size int `json:"size"` - From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` - Explain bool `json:"explain"` - Sort search.SortOrder `json:"sort"` - IncludeLocations bool `json:"includeLocations"` - Score string `json:"score,omitempty"` - SearchAfter []string `json:"search_after"` - SearchBefore []string `json:"search_before"` - - sortFunc func(sort.Interface) -} - -func (r *SearchRequest) SetClientContextID(id string) { - r.ClientContextID = id -} - func (r *SearchRequest) Validate() error { if srq, ok := r.Query.(query.ValidatableQuery); ok { - if err := srq.Validate(); err != nil { + err := srq.Validate() + if err != nil { return err } } @@ -355,6 +309,10 @@ func (r *SearchRequest) Validate() error { } } + err := validateKNN(r) + if err != nil { + return err + } return r.Facets.Validate() } @@ -393,69 +351,6 @@ func (r *SearchRequest) SetSearchBefore(before []string) { r.SearchBefore = before } -// UnmarshalJSON deserializes a JSON representation of -// a SearchRequest -func (r *SearchRequest) UnmarshalJSON(input []byte) error { - var ( - temp struct { - ClientContextID string `json:"client_context_id"` - Q json.RawMessage `json:"query"` - Size *int `json:"size"` - From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` - Explain bool `json:"explain"` - Sort []json.RawMessage `json:"sort"` - IncludeLocations bool `json:"includeLocations"` - Score string `json:"score"` - SearchAfter []string `json:"search_after"` - SearchBefore []string `json:"search_before"` - } - err error - ) - - if err = util.UnmarshalJSON(input, &temp); err != nil { - return err - } - - if temp.Size == nil { - r.Size = 10 - } else { - r.Size = *temp.Size - } - if temp.Sort == nil { - r.Sort = search.SortOrder{&search.SortScore{Desc: true}} - } else { - if r.Sort, err = search.ParseSortOrderJSON(temp.Sort); err != nil { - return err - } - } - r.ClientContextID = temp.ClientContextID - r.From = temp.From - r.Explain = temp.Explain - r.Highlight = temp.Highlight - r.Fields = temp.Fields - r.Facets = temp.Facets - r.IncludeLocations = temp.IncludeLocations - r.Score = temp.Score - r.SearchAfter = temp.SearchAfter - r.SearchBefore = temp.SearchBefore - if r.Query, err = query.ParseQuery(temp.Q); err != nil { - return err - } - - if r.Size < 0 { - r.Size = 10 - } - if r.From < 0 { - r.From = 0 - } - - return nil - -} - // NewSearchRequest creates a new SearchRequest // for the Query, using default values for all // other search parameters. @@ -491,7 +386,8 @@ func (iem IndexErrMap) MarshalJSON() ([]byte, error) { func (iem IndexErrMap) UnmarshalJSON(data []byte) error { var tmp map[string]string - if err := util.UnmarshalJSON(data, &tmp); err != nil { + err := util.UnmarshalJSON(data, &tmp) + if err != nil { return err } for k, v := range tmp { @@ -541,7 +437,7 @@ func (ss *SearchStatus) Merge(other *SearchStatus) { // Facets - The facet results for the search. 
type SearchResult struct { Status *SearchStatus `json:"status"` - Request *SearchRequest `json:"request"` + Request *SearchRequest `json:"request,omitempty"` Hits search.DocumentMatchCollection `json:"hits"` Total uint64 `json:"total_hits"` Cost uint64 `json:"cost"` @@ -571,7 +467,7 @@ func (sr *SearchResult) Size() int { func (sr *SearchResult) String() string { rv := "" if sr.Total > 0 { - if sr.Request.Size > 0 { + if sr.Request != nil && sr.Request.Size > 0 { rv = fmt.Sprintf("%d matches, showing %d through %d, took %s\n", sr.Total, sr.Request.From+1, sr.Request.From+len(sr.Hits), sr.Took) for i, hit := range sr.Hits { rv += fmt.Sprintf("%5d. %s (%f)\n", i+sr.Request.From+1, hit.ID, hit.Score) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector.go b/vendor/github.com/blevesearch/bleve/v2/search/collector.go index 38e34fe7c0..e81219e540 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector.go @@ -44,9 +44,15 @@ type MakeDocumentMatchHandlerKeyType string var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( "MakeDocumentMatchHandlerKey") +var MakeKNNDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( + "MakeKNNDocumentMatchHandlerKey") + // MakeDocumentMatchHandler is an optional DocumentMatchHandler // builder function which the applications can pass to bleve. // These builder methods gives a DocumentMatchHandler function // to bleve, which it will invoke on every document matches. type MakeDocumentMatchHandler func(ctx *SearchContext) ( callback DocumentMatchHandler, loadID bool, err error) + +type MakeKNNDocumentMatchHandler func(ctx *SearchContext) ( + callback DocumentMatchHandler, err error) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/heap.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/heap.go index 9503f00603..cd662bcf9b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector/heap.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/heap.go @@ -69,6 +69,10 @@ func (c *collectStoreHeap) Final(skip int, fixup collectorFixup) (search.Documen return rv, nil } +func (c *collectStoreHeap) Internal() search.DocumentMatchCollection { + return c.heap +} + // heap interface implementation func (c *collectStoreHeap) Len() int { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/knn.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/knn.go new file mode 100644 index 0000000000..465bf69272 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/knn.go @@ -0,0 +1,262 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
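
Reviewer note: one consumer-visible effect of the request,omitempty change above is that SearchResult.Request is now populated only when the request had Explain set, which is why String() gained a nil guard. A simplified stand-alone sketch of the pattern (local stand-in types, not bleve's):

package main

import "fmt"

// local stand-ins for the bleve types touched above (illustration only)
type SearchRequest struct{ Size, From int }

type SearchResult struct {
	Request *SearchRequest // nil unless the originating request set Explain
	Hits    []string
}

// mirrors the nil guard added to SearchResult.String()
func describe(sr *SearchResult) string {
	if sr.Request != nil && sr.Request.Size > 0 {
		return fmt.Sprintf("showing %d through %d",
			sr.Request.From+1, sr.Request.From+len(sr.Hits))
	}
	return fmt.Sprintf("%d hits", len(sr.Hits))
}

func main() {
	// common case: Request omitted from the result (and from its JSON)
	fmt.Println(describe(&SearchResult{Hits: []string{"a", "b"}}))
	// Explain case: the request is echoed back as before
	fmt.Println(describe(&SearchResult{
		Request: &SearchRequest{Size: 10},
		Hits:    []string{"a"},
	}))
}
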
+
+//go:build vectors
+// +build vectors
+
+package collector
+
+import (
+	"context"
+	"time"
+
+	"github.com/blevesearch/bleve/v2/search"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+type collectStoreKNN struct {
+	internalHeaps []collectorStore
+	kValues       []int64
+	allHits       map[*search.DocumentMatch]struct{}
+	ejectedDocs   map[*search.DocumentMatch]struct{}
+}
+
+func newStoreKNN(internalHeaps []collectorStore, kValues []int64) *collectStoreKNN {
+	return &collectStoreKNN{
+		internalHeaps: internalHeaps,
+		kValues:       kValues,
+		ejectedDocs:   make(map[*search.DocumentMatch]struct{}),
+		allHits:       make(map[*search.DocumentMatch]struct{}),
+	}
+}
+
+// AddDocument adds a document to the collector store and returns the documents
+// that were ejected from the store, i.e. those that were not in the top K
+// documents for any of the heaps.
+// These documents are put back into the DocumentMatch pool in the KNN collector.
+func (c *collectStoreKNN) AddDocument(doc *search.DocumentMatch) []*search.DocumentMatch {
+	for heapIdx := 0; heapIdx < len(c.internalHeaps); heapIdx++ {
+		if _, ok := doc.ScoreBreakdown[heapIdx]; !ok {
+			continue
+		}
+		ejectedDoc := c.internalHeaps[heapIdx].AddNotExceedingSize(doc, int(c.kValues[heapIdx]))
+		if ejectedDoc != nil {
+			delete(ejectedDoc.ScoreBreakdown, heapIdx)
+			c.ejectedDocs[ejectedDoc] = struct{}{}
+		}
+	}
+	var rv []*search.DocumentMatch
+	for doc := range c.ejectedDocs {
+		if len(doc.ScoreBreakdown) == 0 {
+			rv = append(rv, doc)
+		}
+		// clear out the ejectedDocs map to reuse it in the next AddDocument call
+		delete(c.ejectedDocs, doc)
+	}
+	return rv
+}
+
+func (c *collectStoreKNN) Final(fixup collectorFixup) (search.DocumentMatchCollection, error) {
+	for _, heap := range c.internalHeaps {
+		for _, doc := range heap.Internal() {
+			// duplicates may be present across the internal heaps,
+			// meaning the same document match may be in the top K
+			// for multiple KNN queries.
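+			// e.g. with two KNN clauses (k=2 and k=3), a document ranking in
+			// the top K of both heaps is seen twice in this loop but stored
+			// only once in allHits, with one ScoreBreakdown entry per clause.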
+ c.allHits[doc] = struct{}{} + } + } + size := len(c.allHits) + if size <= 0 { + return make(search.DocumentMatchCollection, 0), nil + } + rv := make(search.DocumentMatchCollection, size) + i := 0 + for doc := range c.allHits { + if fixup != nil { + err := fixup(doc) + if err != nil { + return nil, err + } + } + rv[i] = doc + i++ + } + return rv, nil +} + +func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHandler, error) { + var hc *KNNCollector + var ok bool + if hc, ok = ctx.Collector.(*KNNCollector); ok { + return func(d *search.DocumentMatch) error { + if d == nil { + return nil + } + toRelease := hc.knnStore.AddDocument(d) + for _, doc := range toRelease { + ctx.DocumentMatchPool.Put(doc) + } + return nil + }, nil + } + return nil, nil +} + +func GetNewKNNCollectorStore(kArray []int64) *collectStoreKNN { + internalHeaps := make([]collectorStore, len(kArray)) + for knnIdx, k := range kArray { + // TODO - Check if the datatype of k can be made into an int instead of int64 + idx := knnIdx + internalHeaps[idx] = getOptimalCollectorStore(int(k), 0, func(i, j *search.DocumentMatch) int { + if i.ScoreBreakdown[idx] < j.ScoreBreakdown[idx] { + return 1 + } + return -1 + }) + } + return newStoreKNN(internalHeaps, kArray) +} + +// implements Collector interface +type KNNCollector struct { + knnStore *collectStoreKNN + size int + total uint64 + took time.Duration + results search.DocumentMatchCollection + maxScore float64 +} + +func NewKNNCollector(kArray []int64, size int64) *KNNCollector { + return &KNNCollector{ + knnStore: GetNewKNNCollectorStore(kArray), + size: int(size), + } +} + +func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { + startTime := time.Now() + var err error + var next *search.DocumentMatch + + // pre-allocate enough space in the DocumentMatchPool + // unless the sum of K is too large, then cap it + // everything should still work, just allocates DocumentMatches on demand + backingSize := hc.size + if backingSize > PreAllocSizeSkipCap { + backingSize = PreAllocSizeSkipCap + 1 + } + searchContext := &search.SearchContext{ + DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), 0), + Collector: hc, + IndexReader: reader, + } + + dmHandlerMakerKNN := MakeKNNDocMatchHandler + if cv := ctx.Value(search.MakeKNNDocumentMatchHandlerKey); cv != nil { + dmHandlerMakerKNN = cv.(search.MakeKNNDocumentMatchHandler) + } + // use the application given builder for making the custom document match + // handler and perform callbacks/invocations on the newly made handler. + dmHandler, err := dmHandlerMakerKNN(searchContext) + if err != nil { + return err + } + select { + case <-ctx.Done(): + search.RecordSearchCost(ctx, search.AbortM, 0) + return ctx.Err() + default: + next, err = searcher.Next(searchContext) + } + for err == nil && next != nil { + if hc.total%CheckDoneEvery == 0 { + select { + case <-ctx.Done(): + search.RecordSearchCost(ctx, search.AbortM, 0) + return ctx.Err() + default: + } + } + hc.total++ + + err = dmHandler(next) + if err != nil { + break + } + + next, err = searcher.Next(searchContext) + } + if err != nil { + return err + } + + // help finalize/flush the results in case + // of custom document match handlers. 
+ err = dmHandler(nil) + if err != nil { + return err + } + + // compute search duration + hc.took = time.Since(startTime) + + // finalize actual results + err = hc.finalizeResults(reader) + if err != nil { + return err + } + return nil +} + +func (hc *KNNCollector) finalizeResults(r index.IndexReader) error { + var err error + hc.results, err = hc.knnStore.Final(func(doc *search.DocumentMatch) error { + if doc.ID == "" { + // look up the id since we need it for lookup + var err error + doc.ID, err = r.ExternalID(doc.IndexInternalID) + if err != nil { + return err + } + } + return nil + }) + return err +} + +func (hc *KNNCollector) Results() search.DocumentMatchCollection { + return hc.results +} + +func (hc *KNNCollector) Total() uint64 { + return hc.total +} + +func (hc *KNNCollector) MaxScore() float64 { + return hc.maxScore +} + +func (hc *KNNCollector) Took() time.Duration { + return hc.took +} + +func (hc *KNNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { + // facet unsupported for vector search +} + +func (hc *KNNCollector) FacetResults() search.FacetResults { + // facet unsupported for vector search + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/list.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/list.go index 20d4c9d01b..f73505e7dd 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector/list.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/list.go @@ -81,6 +81,16 @@ func (c *collectStoreList) Final(skip int, fixup collectorFixup) (search.Documen return search.DocumentMatchCollection{}, nil } +func (c *collectStoreList) Internal() search.DocumentMatchCollection { + rv := make(search.DocumentMatchCollection, c.results.Len()) + i := 0 + for e := c.results.Front(); e != nil; e = e.Next() { + rv[i] = e.Value.(*search.DocumentMatch) + i++ + } + return rv +} + func (c *collectStoreList) len() int { return c.results.Len() } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/slice.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/slice.go index b38d9abc4f..07534e6934 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector/slice.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/slice.go @@ -72,6 +72,10 @@ func (c *collectStoreSlice) Final(skip int, fixup collectorFixup) (search.Docume return search.DocumentMatchCollection{}, nil } +func (c *collectStoreSlice) Internal() search.DocumentMatchCollection { + return c.slice +} + func (c *collectStoreSlice) len() int { return len(c.slice) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go index 270d5f924f..fc338f54ed 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go @@ -39,6 +39,9 @@ type collectorStore interface { AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error) + + // Provide access the internal heap implementation + Internal() search.DocumentMatchCollection } // PreAllocSizeSkipCap will cap preallocation to this amount when @@ -72,6 +75,9 @@ type TopNCollector struct { updateFieldVisitor index.DocValueVisitor dvReader index.DocValueReader searchAfter *search.DocumentMatch + + knnHits map[string]*search.DocumentMatch + computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc } // CheckDoneEvery 
controls how frequently we check the context deadline @@ -89,44 +95,16 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector // ordering hits by the provided sort order func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector { rv := newTopNCollector(size, 0, sort) - rv.searchAfter = &search.DocumentMatch{ - Sort: after, - } - - for pos, ss := range sort { - if ss.RequiresDocID() { - rv.searchAfter.ID = after[pos] - } - if ss.RequiresScoring() { - if score, err := strconv.ParseFloat(after[pos], 64); err == nil { - rv.searchAfter.Score = score - } - } - } - + rv.searchAfter = createSearchAfterDocument(sort, after) return rv } func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { hc := &TopNCollector{size: size, skip: skip, sort: sort} - // pre-allocate space on the store to avoid reslicing - // unless the size + skip is too large, then cap it - // everything should still work, just reslices as necessary - backingSize := size + skip + 1 - if size+skip > PreAllocSizeSkipCap { - backingSize = PreAllocSizeSkipCap + 1 - } - - if size+skip > 10 { - hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int { - return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) - }) - } else { - hc.store = newStoreSlice(backingSize, func(i, j *search.DocumentMatch) int { - return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) - }) - } + hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int { + return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) + }) // these lookups traverse an interface, so do once up-front if sort.RequiresDocID() { @@ -139,6 +117,59 @@ func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector return hc } +func createSearchAfterDocument(sort search.SortOrder, after []string) *search.DocumentMatch { + rv := &search.DocumentMatch{ + Sort: after, + } + for pos, ss := range sort { + if ss.RequiresDocID() { + rv.ID = after[pos] + } + if ss.RequiresScoring() { + if score, err := strconv.ParseFloat(after[pos], 64); err == nil { + rv.Score = score + } + } + } + return rv +} + +// Filter document matches based on the SearchAfter field in the SearchRequest. 
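+// For example, with a descending-score sort and after = ["0.85"], only hits
+// that sort strictly after the 0.85 marker (i.e. lower-scoring ones) are
+// kept; filtering happens in place, reusing the backing array of hits.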
+func FilterHitsBySearchAfter(hits []*search.DocumentMatch, sort search.SortOrder, after []string) []*search.DocumentMatch { + if len(hits) == 0 { + return hits + } + // create a search after document + searchAfter := createSearchAfterDocument(sort, after) + // filter the hits + idx := 0 + cachedScoring := sort.CacheIsScore() + cachedDesc := sort.CacheDescending() + for _, hit := range hits { + if sort.Compare(cachedScoring, cachedDesc, hit, searchAfter) > 0 { + hits[idx] = hit + idx++ + } + } + return hits[:idx] +} + +func getOptimalCollectorStore(size, skip int, comparator collectorCompare) collectorStore { + // pre-allocate space on the store to avoid reslicing + // unless the size + skip is too large, then cap it + // everything should still work, just reslices as necessary + backingSize := size + skip + 1 + if size+skip > PreAllocSizeSkipCap { + backingSize = PreAllocSizeSkipCap + 1 + } + + if size+skip > 10 { + return newStoreHeap(backingSize, comparator) + } else { + return newStoreSlice(backingSize, comparator) + } +} + func (hc *TopNCollector) Size() int { sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr @@ -215,7 +246,12 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } } - err = hc.prepareDocumentMatch(searchContext, reader, next) + err = hc.adjustDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + + err = hc.prepareDocumentMatch(searchContext, reader, next, false) if err != nil { break } @@ -227,6 +263,23 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, next, err = searcher.Next(searchContext) } + if err != nil { + return err + } + if hc.knnHits != nil { + // we may have some knn hits left that did not match any of the top N tf-idf hits + // we need to add them to the collector store to consider them as well. + for _, knnDoc := range hc.knnHits { + err = hc.prepareDocumentMatch(searchContext, reader, knnDoc, true) + if err != nil { + return err + } + err = dmHandler(knnDoc) + if err != nil { + return err + } + } + } statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey) if statsCallbackFn != nil { @@ -258,12 +311,40 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, var sortByScoreOpt = []string{"_score"} -func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, +func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) (err error) { + if hc.knnHits != nil { + d.ID, err = reader.ExternalID(d.IndexInternalID) + if err != nil { + return err + } + if knnHit, ok := hc.knnHits[d.ID]; ok { + d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit) + delete(hc.knnHits, d.ID) + } + } + return nil +} + +func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, + reader index.IndexReader, d *search.DocumentMatch, isKnnDoc bool) (err error) { // visit field terms for features that require it (sort, facets) - if len(hc.neededFields) > 0 { - err = hc.visitFieldTerms(reader, d) + if !isKnnDoc && len(hc.neededFields) > 0 { + err = hc.visitFieldTerms(reader, d, hc.updateFieldVisitor) + if err != nil { + return err + } + } else if isKnnDoc && hc.facetsBuilder != nil { + // we need to visit the field terms for the knn document + // only for those fields that are required for faceting + // and not for sorting. This is because the knn document's + // sort value is already computed in the knn collector. 
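+			// e.g. a "category" term facet still needs this hit's doc values,
+			// while its sort key (the knn score) is already final.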
+ err = hc.visitFieldTerms(reader, d, func(field string, term []byte) { + if hc.facetsBuilder != nil { + hc.facetsBuilder.UpdateVisitor(field, term) + } + }) if err != nil { return err } @@ -277,9 +358,14 @@ func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, if d.Score > hc.maxScore { hc.maxScore = d.Score } + // early exit as the document match had its sort value calculated in the knn + // collector itself + if isKnnDoc { + return nil + } // see if we need to load ID (at this early stage, for example to sort on it) - if hc.needDocIds { + if hc.needDocIds && d.ID == "" { d.ID, err = reader.ExternalID(d.IndexInternalID) if err != nil { return err @@ -314,6 +400,7 @@ func MakeTopNDocumentMatchHandler( // but we want to allow for exact match, so we pretend hc.searchAfter.HitNumber = d.HitNumber if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 { + ctx.DocumentMatchPool.Put(d) return nil } } @@ -353,12 +440,21 @@ func MakeTopNDocumentMatchHandler( // visitFieldTerms is responsible for visiting the field terms of the // search hit, and passing visited terms to the sort and facet builder -func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error { +func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch, v index.DocValueVisitor) error { if hc.facetsBuilder != nil { hc.facetsBuilder.StartDoc() } + if d.ID != "" && d.IndexInternalID == nil { + // this document may have been sent over as preSearchData and + // we need to look up the internal id to visit the doc values for it + var err error + d.IndexInternalID, err = reader.InternalID(d.ID) + if err != nil { + return err + } + } - err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) + err := hc.dvReader.VisitDocValues(d.IndexInternalID, v) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } @@ -435,3 +531,11 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { } return nil } + +func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) { + hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits)) + for _, hit := range knnHits { + hc.knnHits[hit.ID] = hit + } + hc.computeNewScoreExpl = newScoreExplComputer +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go index c272396b71..ff5167f21b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go @@ -17,7 +17,6 @@ package facet import ( "reflect" "sort" - "strconv" "time" "github.com/blevesearch/bleve/v2/numeric" @@ -36,10 +35,8 @@ func init() { } type dateTimeRange struct { - start time.Time - end time.Time - startLayout string - endLayout string + start time.Time + end time.Time } type DateTimeFacetBuilder struct { @@ -78,12 +75,10 @@ func (fb *DateTimeFacetBuilder) Size() int { return sizeInBytes } -func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time, startLayout string, endLayout string) { +func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { r := dateTimeRange{ - start: start, - end: end, - startLayout: startLayout, - endLayout: endLayout, + start: start, + end: end, } fb.ranges[name] = &r } @@ -139,23 +134,11 @@ func (fb *DateTimeFacetBuilder) Result() *search.FacetResult 
{ Count: count, } if !dateRange.start.IsZero() { - var start string - if dateRange.startLayout == "" { - // layout not set probably means it is probably a timestamp - start = strconv.FormatInt(dateRange.start.UnixNano(), 10) - } else { - start = dateRange.start.Format(dateRange.startLayout) - } + start := dateRange.start.Format(time.RFC3339Nano) tf.Start = &start } if !dateRange.end.IsZero() { - var end string - if dateRange.endLayout == "" { - // layout not set probably means it is probably a timestamp - end = strconv.FormatInt(dateRange.end.UnixNano(), 10) - } else { - end = dateRange.end.Format(dateRange.endLayout) - } + end := dateRange.end.Format(time.RFC3339Nano) tf.End = &end } rv.DateRanges = append(rv.DateRanges, tf) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/facets_builder.go b/vendor/github.com/blevesearch/bleve/v2/search/facets_builder.go index ebe785c02d..4b1f2db788 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/facets_builder.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/facets_builder.go @@ -321,17 +321,29 @@ func (fr *FacetResult) Merge(other *FacetResult) { fr.Total += other.Total fr.Missing += other.Missing fr.Other += other.Other - if fr.Terms != nil && other.Terms != nil { + if other.Terms != nil { + if fr.Terms == nil { + fr.Terms = other.Terms + return + } for _, term := range other.Terms.termFacets { fr.Terms.Add(term) } } - if fr.NumericRanges != nil && other.NumericRanges != nil { + if other.NumericRanges != nil { + if fr.NumericRanges == nil { + fr.NumericRanges = other.NumericRanges + return + } for _, nr := range other.NumericRanges { fr.NumericRanges = fr.NumericRanges.Add(nr) } } - if fr.DateRanges != nil && other.DateRanges != nil { + if other.DateRanges != nil { + if fr.DateRanges == nil { + fr.DateRanges = other.DateRanges + return + } for _, dr := range other.DateRanges { fr.DateRanges = fr.DateRanges.Add(dr) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go index f8573d081f..b307865f3f 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go @@ -27,10 +27,15 @@ import ( ) type DisjunctionQuery struct { - Disjuncts []Query `json:"disjuncts"` - BoostVal *Boost `json:"boost,omitempty"` - Min float64 `json:"min"` - queryStringMode bool + Disjuncts []Query `json:"disjuncts"` + BoostVal *Boost `json:"boost,omitempty"` + Min float64 `json:"min"` + retrieveScoreBreakdown bool + queryStringMode bool +} + +func (q *DisjunctionQuery) RetrieveScoreBreakdown(b bool) { + q.retrieveScoreBreakdown = b } // NewDisjunctionQuery creates a new compound Query. 
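The new retrieveScoreBreakdown flag is what lets a disjunction over several KNN clauses report per-clause scores instead of a single combined score. A minimal sketch of how a caller might wire it up, using the KNNQuery constructors added later in this diff (requires the vectors build tag; the field name "vec" and the vectors themselves are hypothetical):

package sketch

import "github.com/blevesearch/bleve/v2/search/query"

// buildKNNDisjunction sketches how per-clause KNN scores are requested.
func buildKNNDisjunction() *query.DisjunctionQuery {
	knnA := query.NewKNNQuery([]float32{0.1, 0.2, 0.3})
	knnA.SetFieldVal("vec") // hypothetical vector field name
	knnA.SetK(3)

	knnB := query.NewKNNQuery([]float32{0.9, 0.8, 0.7})
	knnB.SetFieldVal("vec")
	knnB.SetK(3)

	dq := query.NewDisjunctionQuery([]query.Query{knnA, knnB})
	// each hit's DocumentMatch.ScoreBreakdown is keyed by the clause's
	// position in the disjunction (0 for knnA, 1 for knnB)
	dq.RetrieveScoreBreakdown(true)
	return dq
}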
@@ -73,18 +78,22 @@ func (q *DisjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m } return nil, err } - if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode { - // in query string mode, skip match none - continue + if sr != nil { + if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode { + // in query string mode, skip match none + continue + } + ss = append(ss, sr) } - ss = append(ss, sr) } if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) } - return searcher.NewDisjunctionSearcher(ctx, i, ss, q.Min, options) + nctx := context.WithValue(ctx, search.IncludeScoreBreakdownKey, q.retrieveScoreBreakdown) + + return searcher.NewDisjunctionSearcher(nctx, i, ss, q.Min, options) } func (q *DisjunctionQuery) Validate() error { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go new file mode 100644 index 0000000000..030483e546 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go @@ -0,0 +1,74 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package query + +import ( + "context" + "fmt" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" + index "github.com/blevesearch/bleve_index_api" +) + +type KNNQuery struct { + VectorField string `json:"field"` + Vector []float32 `json:"vector"` + K int64 `json:"k"` + BoostVal *Boost `json:"boost,omitempty"` +} + +func NewKNNQuery(vector []float32) *KNNQuery { + return &KNNQuery{Vector: vector} +} + +func (q *KNNQuery) Field() string { + return q.VectorField +} + +func (q *KNNQuery) SetK(k int64) { + q.K = k +} + +func (q *KNNQuery) SetFieldVal(field string) { + q.VectorField = field +} + +func (q *KNNQuery) SetBoost(b float64) { + boost := Boost(b) + q.BoostVal = &boost +} + +func (q *KNNQuery) Boost() float64 { + return q.BoostVal.Value() +} + +func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader, + m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { + fieldMapping := m.FieldMappingForPath(q.VectorField) + similarityMetric := fieldMapping.Similarity + if similarityMetric == "" { + similarityMetric = index.DefaultSimilarityMetric + } + if q.K <= 0 || len(q.Vector) == 0 { + return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty") + } + return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField, + q.Vector, q.K, q.BoostVal.Value(), similarityMetric) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go index eb7b34adbd..26ab656e74 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go @@ -65,14 +65,55 @@ type ValidatableQuery interface { Validate() error } +// ParseQuery 
deserializes a JSON representation of +// a PreSearchData object. +func ParsePreSearchData(input []byte) (map[string]interface{}, error) { + var rv map[string]interface{} + + var tmp map[string]json.RawMessage + err := util.UnmarshalJSON(input, &tmp) + if err != nil { + return nil, err + } + + for k, v := range tmp { + switch k { + case search.KnnPreSearchDataKey: + var value []*search.DocumentMatch + if v != nil { + err := util.UnmarshalJSON(v, &value) + if err != nil { + return nil, err + } + } + if rv == nil { + rv = make(map[string]interface{}) + } + rv[search.KnnPreSearchDataKey] = value + } + } + return rv, nil +} + // ParseQuery deserializes a JSON representation of // a Query object. func ParseQuery(input []byte) (Query, error) { + if len(input) == 0 { + // interpret as a match_none query + return NewMatchNoneQuery(), nil + } + var tmp map[string]interface{} err := util.UnmarshalJSON(input, &tmp) if err != nil { return nil, err } + + if len(tmp) == 0 { + // interpret as a match_none query + return NewMatchNoneQuery(), nil + } + _, hasFuzziness := tmp["fuzziness"] _, isMatchQuery := tmp["match"] _, isMatchPhraseQuery := tmp["match_phrase"] diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go index fc36fd5bfa..10190bd85b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go @@ -37,6 +37,7 @@ type ConstantScorer struct { queryNorm float64 queryWeight float64 queryWeightExplanation *search.Explanation + includeScore bool } func (s *ConstantScorer) Size() int { @@ -51,10 +52,11 @@ func (s *ConstantScorer) Size() int { func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { rv := ConstantScorer{ - options: options, - queryWeight: 1.0, - constant: constant, - boost: boost, + options: options, + queryWeight: 1.0, + constant: constant, + boost: boost, + includeScore: options.Score != "none", } return &rv @@ -92,35 +94,38 @@ func (s *ConstantScorer) SetQueryNorm(qnorm float64) { func (s *ConstantScorer) Score(ctx *search.SearchContext, id index.IndexInternalID) *search.DocumentMatch { var scoreExplanation *search.Explanation - score := s.constant - - if s.options.Explain { - scoreExplanation = &search.Explanation{ - Value: score, - Message: fmt.Sprintf("ConstantScore()"), - } - } - - // if the query weight isn't 1, multiply - if s.queryWeight != 1.0 { - score = score * s.queryWeight - if s.options.Explain { - childExplanations := make([]*search.Explanation, 2) - childExplanations[0] = s.queryWeightExplanation - childExplanations[1] = scoreExplanation - scoreExplanation = &search.Explanation{ - Value: score, - Message: fmt.Sprintf("weight(^%f), product of:", s.boost), - Children: childExplanations, - } - } - } - rv := ctx.DocumentMatchPool.Get() rv.IndexInternalID = id - rv.Score = score - if s.options.Explain { - rv.Expl = scoreExplanation + + if s.includeScore { + score := s.constant + + if s.options.Explain { + scoreExplanation = &search.Explanation{ + Value: score, + Message: fmt.Sprintf("ConstantScore()"), + } + } + + // if the query weight isn't 1, multiply + if s.queryWeight != 1.0 { + score = score * s.queryWeight + if s.options.Explain { + childExplanations := make([]*search.Explanation, 2) + childExplanations[0] = s.queryWeightExplanation + childExplanations[1] = scoreExplanation + scoreExplanation = &search.Explanation{ + 
Value: score, + Message: fmt.Sprintf("weight(^%f), product of:", s.boost), + Children: childExplanations, + } + } + } + + rv.Score = score + if s.options.Explain { + rv.Expl = scoreExplanation + } } return rv diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go index 054e76fd42..fe319bbebf 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go @@ -81,3 +81,43 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ return rv } + +// This method is used only when disjunction searcher is used over multiple +// KNN searchers, where only the score breakdown and the optional explanation breakdown +// is required. The final score and explanation is set when we finalize the KNN hits. +func (s *DisjunctionQueryScorer) ScoreAndExplBreakdown(ctx *search.SearchContext, constituents []*search.DocumentMatch, + matchingIdxs []int, originalPositions []int, countTotal int) *search.DocumentMatch { + + scoreBreakdown := make(map[int]float64) + var childrenExplanations []*search.Explanation + if s.options.Explain { + // since we want to notify which expl belongs to which matched searcher within the disjunction searcher + childrenExplanations = make([]*search.Explanation, countTotal) + } + + for i, docMatch := range constituents { + var index int + if originalPositions != nil { + // scorer used in disjunction slice searcher + index = originalPositions[matchingIdxs[i]] + } else { + // scorer used in disjunction heap searcher + index = matchingIdxs[i] + } + scoreBreakdown[index] = docMatch.Score + if s.options.Explain { + childrenExplanations[index] = docMatch.Expl + } + } + var explBreakdown *search.Explanation + if s.options.Explain { + explBreakdown = &search.Explanation{Children: childrenExplanations} + } + + rv := constituents[0] + rv.ScoreBreakdown = scoreBreakdown + rv.Expl = explBreakdown + rv.FieldTermLocations = search.MergeFieldTermLocations( + rv.FieldTermLocations, constituents[1:]) + return rv +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go new file mode 100644 index 0000000000..326b435d61 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go @@ -0,0 +1,156 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package scorer + +import ( + "fmt" + "math" + "reflect" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeKNNQueryScorer int + +func init() { + var sqs KNNQueryScorer + reflectStaticSizeKNNQueryScorer = int(reflect.TypeOf(sqs).Size()) +} + +type KNNQueryScorer struct { + queryVector []float32 + queryField string + queryWeight float64 + queryBoost float64 + queryNorm float64 + options search.SearcherOptions + similarityMetric string + queryWeightExplanation *search.Explanation +} + +func (s *KNNQueryScorer) Size() int { + sizeInBytes := reflectStaticSizeKNNQueryScorer + size.SizeOfPtr + + (len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField) + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + +func NewKNNQueryScorer(queryVector []float32, queryField string, queryBoost float64, + options search.SearcherOptions, + similarityMetric string) *KNNQueryScorer { + return &KNNQueryScorer{ + queryVector: queryVector, + queryField: queryField, + queryBoost: queryBoost, + queryWeight: 1.0, + options: options, + similarityMetric: similarityMetric, + } +} + +// Score used when the knnMatch.Score = 0 -> +// the query and indexed vector are exactly the same. +const maxKNNScore = math.MaxFloat32 + +func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext, + knnMatch *index.VectorDoc) *search.DocumentMatch { + rv := ctx.DocumentMatchPool.Get() + var scoreExplanation *search.Explanation + score := knnMatch.Score + if sqs.similarityMetric == index.EuclideanDistance { + // in case of euclidean distance being the distance metric, + // an exact vector (perfect match), would return distance = 0 + if score == 0 { + score = maxKNNScore + } else { + // euclidean distances need to be inverted to work with + // tf-idf scoring + score = 1.0 / score + } + } + if sqs.options.Explain { + scoreExplanation = &search.Explanation{ + Value: score, + Message: fmt.Sprintf("fieldWeight(%s in doc %s), score of:", + sqs.queryField, knnMatch.ID), + Children: []*search.Explanation{ + { + Value: score, + Message: fmt.Sprintf("vector(field(%s:%s) with similarity_metric(%s)=%e", + sqs.queryField, knnMatch.ID, sqs.similarityMetric, score), + }, + }, + } + } + // if the query weight isn't 1, multiply + if sqs.queryWeight != 1.0 && score != maxKNNScore { + score = score * sqs.queryWeight + if sqs.options.Explain { + scoreExplanation = &search.Explanation{ + Value: score, + // Product of score * weight + // Avoid adding the query vector to the explanation since vectors + // can get quite large. + Message: fmt.Sprintf("weight(%s:query Vector^%f in %s), product of:", + sqs.queryField, sqs.queryBoost, knnMatch.ID), + Children: []*search.Explanation{sqs.queryWeightExplanation, scoreExplanation}, + } + } + } + rv.Score = score + if sqs.options.Explain { + rv.Expl = scoreExplanation + } + rv.IndexInternalID = append(rv.IndexInternalID, knnMatch.ID...) 
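+	// (the append above copies the reader-owned ID bytes into the pooled
+	// DocumentMatch rather than aliasing the VectorReader's buffer)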
+ return rv +} + +func (sqs *KNNQueryScorer) Weight() float64 { + return sqs.queryBoost * sqs.queryBoost +} + +func (sqs *KNNQueryScorer) SetQueryNorm(qnorm float64) { + sqs.queryNorm = qnorm + + // update the query weight + sqs.queryWeight = sqs.queryBoost * sqs.queryNorm + + if sqs.options.Explain { + childrenExplanations := make([]*search.Explanation, 2) + childrenExplanations[0] = &search.Explanation{ + Value: sqs.queryBoost, + Message: "boost", + } + childrenExplanations[1] = &search.Explanation{ + Value: sqs.queryNorm, + Message: "queryNorm", + } + sqs.queryWeightExplanation = &search.Explanation{ + Value: sqs.queryWeight, + Message: fmt.Sprintf("queryWeight(%s:query Vector^%f), product of:", + sqs.queryField, sqs.queryBoost), + Children: childrenExplanations, + } + } +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/search.go b/vendor/github.com/blevesearch/bleve/v2/search/search.go index b7a3c42ae2..515a320f79 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/search.go @@ -147,7 +147,7 @@ type DocumentMatch struct { Index string `json:"index,omitempty"` ID string `json:"id"` IndexInternalID index.IndexInternalID `json:"-"` - Score float64 `json:"score"` + Score float64 `json:"score,omitempty"` Expl *Explanation `json:"explanation,omitempty"` Locations FieldTermLocationMap `json:"locations,omitempty"` Fragments FieldFragmentMap `json:"fragments,omitempty"` @@ -173,6 +173,22 @@ type DocumentMatch struct { // not all sub-queries matched // if false, all the sub-queries matched PartialMatch bool `json:"partial_match,omitempty"` + + // used to indicate the sub-scores that combined to form the + // final score for this document match. This is only populated + // when the search request's query is a DisjunctionQuery + // or a ConjunctionQuery. The map key is the index of the sub-query + // in the DisjunctionQuery or ConjunctionQuery. The map value is the + // sub-score for that sub-query. + ScoreBreakdown map[int]float64 `json:"score_breakdown,omitempty"` + + // internal variable used in PreSearch phase of search in alias + // to indicate the name of the index that this match came from. + // used in knn search. + // it is a stack of index names, the top of the stack is the name + // of the index that this match came from + // of the current alias view, used in alias of aliases scenario + IndexNames []string `json:"index_names,omitempty"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -334,7 +350,7 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location { } func (dm *DocumentMatch) String() string { - return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) + return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score) } type DocumentMatchCollection []*DocumentMatch diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_knn.go new file mode 100644 index 0000000000..efe262b5ba --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_knn.go @@ -0,0 +1,53 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package searcher + +import ( + "context" + + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +func optimizeKNN(ctx context.Context, indexReader index.IndexReader, + qsearchers []search.Searcher) error { + var octx index.VectorOptimizableContext + var err error + + for _, searcher := range qsearchers { + // Only applicable to KNN Searchers. + o, ok := searcher.(index.VectorOptimizable) + if !ok { + continue + } + + octx, err = o.VectorOptimize(ctx, octx) + if err != nil { + return err + } + } + + // No KNN searchers. + if octx == nil { + return nil + } + + // Postings lists and iterators replaced in the pointer to the + // vector reader + return octx.Finish() +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_no_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_no_knn.go new file mode 100644 index 0000000000..bd5d91fb90 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/optimize_no_knn.go @@ -0,0 +1,31 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
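+
+// This file is the !vectors counterpart of optimize_knn.go: it compiles when
+// the "vectors" build tag is absent, so callers can invoke optimizeKNN
+// unconditionally and get a no-op in non-vector builds.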
+ +//go:build !vectors +// +build !vectors + +package searcher + +import ( + "context" + + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +func optimizeKNN(ctx context.Context, indexReader index.IndexReader, + qsearchers []search.Searcher) error { + // No-op + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/ordered_searchers_list.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/ordered_searchers_list.go index f3e646e9da..ac9da563d9 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/ordered_searchers_list.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/ordered_searchers_list.go @@ -33,3 +33,23 @@ func (otrl OrderedSearcherList) Less(i, j int) bool { func (otrl OrderedSearcherList) Swap(i, j int) { otrl[i], otrl[j] = otrl[j], otrl[i] } + +type OrderedPositionalSearcherList struct { + searchers []search.Searcher + index []int +} + +// sort.Interface + +func (otrl OrderedPositionalSearcherList) Len() int { + return len(otrl.searchers) +} + +func (otrl OrderedPositionalSearcherList) Less(i, j int) bool { + return otrl.searchers[i].Count() < otrl.searchers[j].Count() +} + +func (otrl OrderedPositionalSearcherList) Swap(i, j int) { + otrl.searchers[i], otrl.searchers[j] = otrl.searchers[j], otrl.searchers[i] + otrl.index[i], otrl.index[j] = otrl.index[j], otrl.index[i] +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go index 19ef199ac4..25e6610752 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go @@ -35,7 +35,7 @@ func init() { type ConjunctionSearcher struct { indexReader index.IndexReader - searchers OrderedSearcherList + searchers []search.Searcher queryNorm float64 currs []*search.DocumentMatch maxIDIdx int @@ -88,6 +88,20 @@ func NewConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, return &rv, nil } +func (s *ConjunctionSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + func (s *ConjunctionSearcher) Size() int { sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr + s.scorer.Size() @@ -105,20 +119,6 @@ func (s *ConjunctionSearcher) Size() int { return sizeInBytes } -func (s *ConjunctionSearcher) computeQueryNorm() { - // first calculate sum of squared weights - sumOfSquaredWeights := 0.0 - for _, searcher := range s.searchers { - sumOfSquaredWeights += searcher.Weight() - } - // now compute query norm from this - s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) - // finally tell all the downstream searchers the norm - for _, searcher := range s.searchers { - searcher.SetQueryNorm(s.queryNorm) - } -} - func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error { var err error // get all searchers pointing at their first match diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go index 
606a157aed..d165ec0273 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go @@ -46,15 +46,31 @@ func optionsDisjunctionOptimizable(options search.SearcherOptions) bool { func newDisjunctionSearcher(ctx context.Context, indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) (search.Searcher, error) { - // attempt the "unadorned" disjunction optimization only when we - // do not need extra information like freq-norm's or term vectors - // and the requested min is simple - if len(qsearchers) > 1 && min <= 1 && - optionsDisjunctionOptimizable(options) { - rv, err := optimizeCompositeSearcher(ctx, "disjunction:unadorned", - indexReader, qsearchers, options) - if err != nil || rv != nil { - return rv, err + + var disjOverKNN bool + if ctx != nil { + disjOverKNN, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool) + } + if disjOverKNN { + // The KNN Searcher optimization is a necessary pre-req for the KNN Searchers, + // not an optional optimization like for, say term searchers. + // It's an optimization to repeat search an open vector index when applicable, + // rather than individually opening and searching a vector index. + err := optimizeKNN(ctx, indexReader, qsearchers) + if err != nil { + return nil, err + } + } else { + // attempt the "unadorned" disjunction optimization only when we + // do not need extra information like freq-norm's or term vectors + // and the requested min is simple + if len(qsearchers) > 1 && min <= 1 && + optionsDisjunctionOptimizable(options) { + rv, err := optimizeCompositeSearcher(ctx, "disjunction:unadorned", + indexReader, qsearchers, options) + if err != nil || rv != nil { + return rv, err + } } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go index d36e301314..89bcd498fb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go @@ -39,22 +39,25 @@ func init() { } type SearcherCurr struct { - searcher search.Searcher - curr *search.DocumentMatch + searcher search.Searcher + curr *search.DocumentMatch + matchingIdx int } type DisjunctionHeapSearcher struct { indexReader index.IndexReader - numSearchers int - scorer *scorer.DisjunctionQueryScorer - min int - queryNorm float64 - initialized bool - searchers []search.Searcher - heap []*SearcherCurr + numSearchers int + scorer *scorer.DisjunctionQueryScorer + min int + queryNorm float64 + retrieveScoreBreakdown bool + initialized bool + searchers []search.Searcher + heap []*SearcherCurr matching []*search.DocumentMatch + matchingIdxs []int matchingCurrs []*SearcherCurr bytesRead uint64 @@ -67,22 +70,42 @@ func newDisjunctionHeapSearcher(ctx context.Context, indexReader index.IndexRead if limit && tooManyClauses(len(searchers)) { return nil, tooManyClausesErr("", len(searchers)) } + var retrieveScoreBreakdown bool + if ctx != nil { + retrieveScoreBreakdown, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool) + } // build our searcher rv := DisjunctionHeapSearcher{ - indexReader: indexReader, - searchers: searchers, - numSearchers: len(searchers), - scorer: scorer.NewDisjunctionQueryScorer(options), - min: int(min), - matching: make([]*search.DocumentMatch, len(searchers)), - 
matchingCurrs: make([]*SearcherCurr, len(searchers)), - heap: make([]*SearcherCurr, 0, len(searchers)), + indexReader: indexReader, + searchers: searchers, + numSearchers: len(searchers), + scorer: scorer.NewDisjunctionQueryScorer(options), + min: int(min), + matching: make([]*search.DocumentMatch, len(searchers)), + matchingCurrs: make([]*SearcherCurr, len(searchers)), + matchingIdxs: make([]int, len(searchers)), + retrieveScoreBreakdown: retrieveScoreBreakdown, + heap: make([]*SearcherCurr, 0, len(searchers)), } rv.computeQueryNorm() return &rv, nil } +func (s *DisjunctionHeapSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + func (s *DisjunctionHeapSearcher) Size() int { sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + s.scorer.Size() @@ -101,24 +124,11 @@ func (s *DisjunctionHeapSearcher) Size() int { // since searchers and document matches already counted above sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr + sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt return sizeInBytes } -func (s *DisjunctionHeapSearcher) computeQueryNorm() { - // first calculate sum of squared weights - sumOfSquaredWeights := 0.0 - for _, searcher := range s.searchers { - sumOfSquaredWeights += searcher.Weight() - } - // now compute query norm from this - s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) - // finally tell all the downstream searchers the norm - for _, searcher := range s.searchers { - searcher.SetQueryNorm(s.queryNorm) - } -} - func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { // alloc a single block of SearcherCurrs block := make([]SearcherCurr, len(s.searchers)) @@ -132,6 +142,7 @@ func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error if curr != nil { block[i].searcher = searcher block[i].curr = curr + block[i].matchingIdx = i heap.Push(s, &block[i]) } } @@ -147,6 +158,7 @@ func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error func (s *DisjunctionHeapSearcher) updateMatches() error { matching := s.matching[:0] matchingCurrs := s.matchingCurrs[:0] + matchingIdxs := s.matchingIdxs[:0] if len(s.heap) > 0 { @@ -154,17 +166,20 @@ func (s *DisjunctionHeapSearcher) updateMatches() error { next := heap.Pop(s).(*SearcherCurr) matching = append(matching, next.curr) matchingCurrs = append(matchingCurrs, next) + matchingIdxs = append(matchingIdxs, next.matchingIdx) // now as long as top of heap matches, keep popping for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { next = heap.Pop(s).(*SearcherCurr) matching = append(matching, next.curr) matchingCurrs = append(matchingCurrs, next) + matchingIdxs = append(matchingIdxs, next.matchingIdx) } } s.matching = matching s.matchingCurrs = matchingCurrs + s.matchingIdxs = matchingIdxs return nil } @@ -197,10 +212,16 @@ func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( for !found && len(s.matching) > 0 { if len(s.matching) >= s.min { found = true - partialMatch := len(s.matching) != len(s.searchers) - // score this match - rv = 
s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - rv.PartialMatch = partialMatch + if s.retrieveScoreBreakdown { + // just return score and expl breakdown here, since it is a disjunction over knn searchers, + // and the final score and expl is calculated in the knn collector + rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, nil, s.numSearchers) + } else { + // score this match + partialMatch := len(s.matching) != len(s.searchers) + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + rv.PartialMatch = partialMatch + } } // invoke next on all the matching searchers diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go index 0969c8cf3c..81b00cc22d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go @@ -34,17 +34,19 @@ func init() { } type DisjunctionSliceSearcher struct { - indexReader index.IndexReader - searchers OrderedSearcherList - numSearchers int - queryNorm float64 - currs []*search.DocumentMatch - scorer *scorer.DisjunctionQueryScorer - min int - matching []*search.DocumentMatch - matchingIdxs []int - initialized bool - bytesRead uint64 + indexReader index.IndexReader + searchers []search.Searcher + originalPos []int + numSearchers int + queryNorm float64 + retrieveScoreBreakdown bool + currs []*search.DocumentMatch + scorer *scorer.DisjunctionQueryScorer + min int + matching []*search.DocumentMatch + matchingIdxs []int + initialized bool + bytesRead uint64 } func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexReader, @@ -54,21 +56,45 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexRea if limit && tooManyClauses(len(qsearchers)) { return nil, tooManyClausesErr("", len(qsearchers)) } - // build the downstream searchers - searchers := make(OrderedSearcherList, len(qsearchers)) - for i, searcher := range qsearchers { - searchers[i] = searcher + + var searchers OrderedSearcherList + var originalPos []int + var retrieveScoreBreakdown bool + if ctx != nil { + retrieveScoreBreakdown, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool) } - // sort the searchers - sort.Sort(sort.Reverse(searchers)) - // build our searcher + + if retrieveScoreBreakdown { + // needed only when kNN is in picture + sortedSearchers := &OrderedPositionalSearcherList{ + searchers: make([]search.Searcher, len(qsearchers)), + index: make([]int, len(qsearchers)), + } + for i, searcher := range qsearchers { + sortedSearchers.searchers[i] = searcher + sortedSearchers.index[i] = i + } + sort.Sort(sortedSearchers) + searchers = sortedSearchers.searchers + originalPos = sortedSearchers.index + } else { + searchers = make(OrderedSearcherList, len(qsearchers)) + for i, searcher := range qsearchers { + searchers[i] = searcher + } + sort.Sort(searchers) + } + rv := DisjunctionSliceSearcher{ - indexReader: indexReader, - searchers: searchers, - numSearchers: len(searchers), - currs: make([]*search.DocumentMatch, len(searchers)), - scorer: scorer.NewDisjunctionQueryScorer(options), - min: int(min), + indexReader: indexReader, + searchers: searchers, + originalPos: originalPos, + numSearchers: len(searchers), + currs: make([]*search.DocumentMatch, len(searchers)), + scorer: scorer.NewDisjunctionQueryScorer(options), + min: int(min), + retrieveScoreBreakdown: 
retrieveScoreBreakdown, + matching: make([]*search.DocumentMatch, len(searchers)), matchingIdxs: make([]int, len(searchers)), } @@ -76,6 +102,20 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexRea return &rv, nil } +func (s *DisjunctionSliceSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + func (s *DisjunctionSliceSearcher) Size() int { sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + s.scorer.Size() @@ -97,24 +137,11 @@ func (s *DisjunctionSliceSearcher) Size() int { } sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt + sizeInBytes += len(s.originalPos) * size.SizeOfInt return sizeInBytes } -func (s *DisjunctionSliceSearcher) computeQueryNorm() { - // first calculate sum of squared weights - sumOfSquaredWeights := 0.0 - for _, searcher := range s.searchers { - sumOfSquaredWeights += searcher.Weight() - } - // now compute query norm from this - s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) - // finally tell all the downstream searchers the norm - for _, searcher := range s.searchers { - searcher.SetQueryNorm(s.queryNorm) - } -} - func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { var err error // get all searchers pointing at their first match @@ -197,10 +224,16 @@ func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( for !found && len(s.matching) > 0 { if len(s.matching) >= s.min { found = true - partialMatch := len(s.matching) != len(s.searchers) - // score this match - rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - rv.PartialMatch = partialMatch + if s.retrieveScoreBreakdown { + // just return score and expl breakdown here, since it is a disjunction over knn searchers, + // and the final score and expl is calculated in the knn collector + rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, s.originalPos, s.numSearchers) + } else { + // score this match + partialMatch := len(s.matching) != len(s.searchers) + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + rv.PartialMatch = partialMatch + } } // invoke next on all the matching searchers diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go new file mode 100644 index 0000000000..8f146b3e81 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go @@ -0,0 +1,142 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
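+
+// This file provides KNNSearcher, a search.Searcher backed by an
+// index.VectorReader: each Next() call yields one of the K nearest
+// neighbours for the query vector, scored by KNNQueryScorer.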
+ +//go:build vectors +// +build vectors + +package searcher + +import ( + "context" + "reflect" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/scorer" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeKNNSearcher int + +func init() { + var ks KNNSearcher + reflectStaticSizeKNNSearcher = int(reflect.TypeOf(ks).Size()) +} + +type KNNSearcher struct { + field string + vector []float32 + k int64 + indexReader index.IndexReader + vectorReader index.VectorReader + scorer *scorer.KNNQueryScorer + count uint64 + vd index.VectorDoc +} + +func NewKNNSearcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, + options search.SearcherOptions, field string, vector []float32, k int64, + boost float64, similarityMetric string) (search.Searcher, error) { + if vr, ok := i.(index.VectorIndexReader); ok { + vectorReader, err := vr.VectorReader(ctx, vector, field, k) + if err != nil { + return nil, err + } + + knnScorer := scorer.NewKNNQueryScorer(vector, field, boost, + options, similarityMetric) + return &KNNSearcher{ + indexReader: i, + vectorReader: vectorReader, + field: field, + vector: vector, + k: k, + scorer: knnScorer, + }, nil + } + return nil, nil +} + +func (s *KNNSearcher) VectorOptimize(ctx context.Context, octx index.VectorOptimizableContext) ( + index.VectorOptimizableContext, error) { + o, ok := s.vectorReader.(index.VectorOptimizable) + if ok { + return o.VectorOptimize(ctx, octx) + } + + return nil, nil +} + +func (s *KNNSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) ( + *search.DocumentMatch, error) { + knnMatch, err := s.vectorReader.Next(s.vd.Reset()) + if err != nil { + return nil, err + } + + if knnMatch == nil { + return nil, nil + } + + docMatch := s.scorer.Score(ctx, knnMatch) + + return docMatch, nil +} + +func (s *KNNSearcher) Close() error { + return s.vectorReader.Close() +} + +func (s *KNNSearcher) Count() uint64 { + return s.vectorReader.Count() +} + +func (s *KNNSearcher) DocumentMatchPoolSize() int { + return 1 +} + +func (s *KNNSearcher) Min() int { + return 0 +} + +func (s *KNNSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + knnMatch, err := s.vectorReader.Next(s.vd.Reset()) + if err != nil { + return nil, err + } + + if knnMatch == nil { + return nil, nil + } + + docMatch := s.scorer.Score(ctx, knnMatch) + + return docMatch, nil +} + +func (s *KNNSearcher) SetQueryNorm(qnorm float64) { + s.scorer.SetQueryNorm(qnorm) +} + +func (s *KNNSearcher) Size() int { + return reflectStaticSizeKNNSearcher + size.SizeOfPtr + + s.vectorReader.Size() + + s.vd.Size() + + s.scorer.Size() +} + +func (s *KNNSearcher) Weight() float64 { + return s.scorer.Weight() +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/util.go b/vendor/github.com/blevesearch/bleve/v2/search/util.go index b2cb62a2d1..6472803d1c 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/util.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/util.go @@ -106,6 +106,7 @@ const ( const SearchIncrementalCostKey = "_search_incremental_cost_key" const QueryTypeKey = "_query_type_key" const FuzzyMatchPhraseKey = "_fuzzy_match_phrase_key" +const IncludeScoreBreakdownKey = "_include_score_breakdown_key" func RecordSearchCost(ctx context.Context, msg SearchIncrementalCostCallbackMsg, bytes uint64) { @@ -133,3 +134,15 @@ const MaxGeoBufPoolSize = 24 * 1024 const MinGeoBufPoolSize = 24 type 
GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool + +const KnnPreSearchDataKey = "_knn_pre_search_data_key" + +const PreSearchKey = "_presearch_key" + +type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) + +type SearcherStartCallbackFn func(size uint64) error +type SearcherEndCallbackFn func(size uint64) error + +const SearcherStartCallbackKey = "_searcher_start_callback_key" +const SearcherEndCallbackKey = "_searcher_end_callback_key" diff --git a/vendor/github.com/blevesearch/bleve/v2/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_knn.go new file mode 100644 index 0000000000..683771418d --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search_knn.go @@ -0,0 +1,524 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package bleve + +import ( + "context" + "encoding/json" + "fmt" + "sort" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/collector" + "github.com/blevesearch/bleve/v2/search/query" + index "github.com/blevesearch/bleve_index_api" +) + +type knnOperator string + +// Must be updated only at init +var BleveMaxK = int64(10000) + +type SearchRequest struct { + Query query.Query `json:"query"` + Size int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort search.SortOrder `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score,omitempty"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + + KNN []*KNNRequest `json:"knn"` + KNNOperator knnOperator `json:"knn_operator"` + + // PreSearchData will be a map that will be used + // in the second phase of any 2-phase search, to provide additional + // context to the second phase. This is useful in the case of index + // aliases where the first phase will gather the PreSearchData from all + // the indexes in the alias, and the second phase will use that + // PreSearchData to perform the actual search. 
+ // The currently accepted map configuration is: + // + // "_knn_pre_search_data_key": []*search.DocumentMatch + + PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"` + + sortFunc func(sort.Interface) +} + +type KNNRequest struct { + Field string `json:"field"` + Vector []float32 `json:"vector"` + K int64 `json:"k"` + Boost *query.Boost `json:"boost,omitempty"` +} + +func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) { + b := query.Boost(boost) + r.KNN = append(r.KNN, &KNNRequest{ + Field: field, + Vector: vector, + K: k, + Boost: &b, + }) +} + +func (r *SearchRequest) AddKNNOperator(operator knnOperator) { + r.KNNOperator = operator +} + +// UnmarshalJSON deserializes a JSON representation of +// a SearchRequest +func (r *SearchRequest) UnmarshalJSON(input []byte) error { + var temp struct { + Q json.RawMessage `json:"query"` + Size *int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort []json.RawMessage `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + KNN []*KNNRequest `json:"knn"` + KNNOperator knnOperator `json:"knn_operator"` + PreSearchData json.RawMessage `json:"pre_search_data"` + } + + err := json.Unmarshal(input, &temp) + if err != nil { + return err + } + + if temp.Size == nil { + r.Size = 10 + } else { + r.Size = *temp.Size + } + if temp.Sort == nil { + r.Sort = search.SortOrder{&search.SortScore{Desc: true}} + } else { + r.Sort, err = search.ParseSortOrderJSON(temp.Sort) + if err != nil { + return err + } + } + r.From = temp.From + r.Explain = temp.Explain + r.Highlight = temp.Highlight + r.Fields = temp.Fields + r.Facets = temp.Facets + r.IncludeLocations = temp.IncludeLocations + r.Score = temp.Score + r.SearchAfter = temp.SearchAfter + r.SearchBefore = temp.SearchBefore + r.Query, err = query.ParseQuery(temp.Q) + if err != nil { + return err + } + + if r.Size < 0 { + r.Size = 10 + } + if r.From < 0 { + r.From = 0 + } + + r.KNN = temp.KNN + r.KNNOperator = temp.KNNOperator + if r.KNNOperator == "" { + r.KNNOperator = knnOperatorOr + } + + if temp.PreSearchData != nil { + r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData) + if err != nil { + return err + } + } + + return nil + +} + +// ----------------------------------------------------------------------------- + +func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest { + rv := SearchRequest{ + Query: req.Query, + Size: req.Size + req.From, + From: 0, + Highlight: req.Highlight, + Fields: req.Fields, + Facets: req.Facets, + Explain: req.Explain, + Sort: req.Sort.Copy(), + IncludeLocations: req.IncludeLocations, + Score: req.Score, + SearchAfter: req.SearchAfter, + SearchBefore: req.SearchBefore, + KNN: req.KNN, + KNNOperator: req.KNNOperator, + PreSearchData: preSearchData, + } + return &rv + +} + +var ( + knnOperatorAnd = knnOperator("and") + knnOperatorOr = knnOperator("or") +) + +func createKNNQuery(req *SearchRequest) (query.Query, []int64, int64, error) { + if requestHasKNN(req) { + // first perform validation + err := validateKNN(req) + if err != nil { + return nil, nil, 0, err + } + var subQueries []query.Query + kArray := make([]int64, 0, len(req.KNN)) + sumOfK := int64(0) + for _, knn := range req.KNN { + knnQuery := 
query.NewKNNQuery(knn.Vector) + knnQuery.SetFieldVal(knn.Field) + knnQuery.SetK(knn.K) + knnQuery.SetBoost(knn.Boost.Value()) + subQueries = append(subQueries, knnQuery) + kArray = append(kArray, knn.K) + sumOfK += knn.K + } + rv := query.NewDisjunctionQuery(subQueries) + rv.RetrieveScoreBreakdown(true) + return rv, kArray, sumOfK, nil + } + return nil, nil, 0, nil +} + +func validateKNN(req *SearchRequest) error { + if req.KNN != nil && + req.KNNOperator != "" && + req.KNNOperator != knnOperatorOr && + req.KNNOperator != knnOperatorAnd { + return fmt.Errorf("unknown knn operator: %s", req.KNNOperator) + } + for _, q := range req.KNN { + if q == nil { + return fmt.Errorf("knn query cannot be nil") + } + if q.K <= 0 || len(q.Vector) == 0 { + return fmt.Errorf("k must be greater than 0 and vector must be non-empty") + } + if q.K > BleveMaxK { + return fmt.Errorf("k must be less than %d", BleveMaxK) + } + } + switch req.KNNOperator { + case knnOperatorAnd, knnOperatorOr, "": + // Valid cases, do nothing + default: + return fmt.Errorf("knn_operator must be either 'and' / 'or'") + } + return nil +} + +func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMatch, reader index.IndexReader, name string) (err error) { + requiredSortFields := req.Sort.RequiredFields() + var dvReader index.DocValueReader + var updateFieldVisitor index.DocValueVisitor + if len(requiredSortFields) > 0 { + dvReader, err = reader.DocValueReader(requiredSortFields) + if err != nil { + return err + } + updateFieldVisitor = func(field string, term []byte) { + req.Sort.UpdateVisitor(field, term) + } + } + for _, hit := range knnHits { + if len(requiredSortFields) > 0 { + err = dvReader.VisitDocValues(hit.IndexInternalID, updateFieldVisitor) + if err != nil { + return err + } + } + req.Sort.Value(hit) + err, _ = LoadAndHighlightFields(hit, req, "", reader, nil) + if err != nil { + return err + } + hit.Index = name + } + return nil +} + +func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) { + KNNQuery, kArray, sumOfK, err := createKNNQuery(req) + if err != nil { + return nil, err + } + knnSearcher, err := KNNQuery.Searcher(ctx, reader, i.m, search.SearcherOptions{ + Explain: req.Explain, + }) + if err != nil { + return nil, err + } + knnCollector := collector.NewKNNCollector(kArray, sumOfK) + err = knnCollector.Collect(ctx, knnSearcher, reader) + if err != nil { + return nil, err + } + knnHits := knnCollector.Results() + if !preSearch { + knnHits = finalizeKNNResults(req, knnHits) + } + // at this point, irrespective of whether it is a preSearch or not, + // the knn hits are populated with Sort and Fields. + // it must be ensured downstream that the Sort and Fields are not + // re-evaluated, for these hits. + // also add the index names to the hits, so that when early + // exit takes place after the first phase, the hits will have + // a valid value for Index. 
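As a usage sketch (not vendored code): this is how a caller might exercise the request-side API above in a build with the `vectors` tag, assuming an existing index at an illustrative path. Per `validateKNN`, k must be positive and at most `BleveMaxK`, the vector must be non-empty, and `knn_operator` is either `and` (a hit must match every kNN clause) or `or` (the default).

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
)

func main() {
	idx, err := bleve.Open("example.bleve") // illustrative index path
	if err != nil {
		panic(err)
	}
	defer idx.Close()

	// hybrid search: a text query plus one kNN clause over a vector field
	req := bleve.NewSearchRequest(bleve.NewMatchQuery("grumpy wizard"))
	req.AddKNN("content_embedding", []float32{0.12, -0.53, 0.98}, 5, 1.0)
	req.AddKNNOperator("or") // "and" keeps only hits matching every kNN clause

	res, err := idx.Search(req)
	if err != nil {
		panic(err)
	}
	fmt.Println(res)
}
```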
+	err = addSortAndFieldsToKNNHits(req, knnHits, reader, i.name)
+	if err != nil {
+		return nil, err
+	}
+	return knnHits, nil
+}
+
+func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
+	if len(knnHits) > 0 {
+		newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) {
+			totalScore := queryMatch.Score + knnMatch.Score
+			if !req.Explain {
+				// exit early as we don't need to compute the explanation
+				return totalScore, nil
+			}
+			return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}}
+		}
+		coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer))
+	}
+}
+
+func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch {
+	// if the KNN operator is AND, then we need to filter out the hits that
+	// do not match all of the KNN queries.
+	if req.KNNOperator == knnOperatorAnd {
+		idx := 0
+		for _, hit := range knnHits {
+			if len(hit.ScoreBreakdown) == len(req.KNN) {
+				knnHits[idx] = hit
+				idx++
+			}
+		}
+		knnHits = knnHits[:idx]
+	}
+	// fix the score using the score breakdown now.
+	// if the score is "none", then we need to set the score to 0.0.
+	// if req.Explain is true, then we need to use the expl breakdown to
+	// finalize the correct explanation.
+	for _, hit := range knnHits {
+		hit.Score = 0.0
+		if req.Score != "none" {
+			for _, score := range hit.ScoreBreakdown {
+				hit.Score += score
+			}
+		}
+		if req.Explain {
+			childrenExpl := make([]*search.Explanation, 0, len(hit.ScoreBreakdown))
+			for i := range hit.ScoreBreakdown {
+				childrenExpl = append(childrenExpl, hit.Expl.Children[i])
+			}
+			hit.Expl = &search.Explanation{Value: hit.Score, Message: "sum of:", Children: childrenExpl}
+		}
+		// we don't need the score breakdown anymore,
+		// so we can set it to nil
+		hit.ScoreBreakdown = nil
+	}
+	return knnHits
+}
+
+// when we are setting KNN hits in the preSearchData, we need to make sure that
+// each KNN hit goes to the right index. This is because the KNN hits are
+// collected from all the indexes in the alias, but the preSearchData is
+// specific to each index. If alias A1 contains indexes I1 and I2, and the
+// KNN hits are collected from both I1 and I2 and merged to get the top K
+// hits, then the top K hits need to be distributed back to I1 and I2,
+// so that the preSearchData for I1 contains the top K hits from I1 and
+// the preSearchData for I2 contains the top K hits from I2.
+func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) {
+	// create a set of all the index names of this alias
+	indexNames := make(map[string]struct{}, len(indexes))
+	for _, index := range indexes {
+		indexNames[index.Name()] = struct{}{}
+	}
+	segregatedKnnHits := make(map[string][]*search.DocumentMatch)
+	for _, hit := range knnHits {
+		// for each hit, we need to perform a validation check to ensure that the stack
+		// is still valid.
+		//
+		// if the stack is empty, then we have an inconsistency/abnormality,
+		// since any hit with an empty stack is supposed to land on a leaf index,
+		// and not an alias. This cannot happen in normal circumstances, but we
+		// perform this check to be safe, since we extract the stack top
+		// in the following steps.
+ if len(hit.IndexNames) == 0 { + return nil, ErrorTwoPhaseSearchInconsistency + } + // since the stack is not empty, we need to check if the top of the stack + // is a valid index name, of an index that is part of this alias. If not, + // then we have an inconsistency that could be caused due to a topology + // change. + stackTopIdx := len(hit.IndexNames) - 1 + top := hit.IndexNames[stackTopIdx] + if _, exists := indexNames[top]; !exists { + return nil, ErrorTwoPhaseSearchInconsistency + } + if stackTopIdx == 0 { + // if the stack consists of only one index, then popping the top + // would result in an empty slice, and handle this case by setting + // indexNames to nil. So that the final search results will not + // contain the indexNames field. + hit.IndexNames = nil + } else { + hit.IndexNames = hit.IndexNames[:stackTopIdx] + } + segregatedKnnHits[top] = append(segregatedKnnHits[top], hit) + } + return segregatedKnnHits, nil +} + +func requestHasKNN(req *SearchRequest) bool { + return len(req.KNN) > 0 +} + +// returns true if the search request contains a KNN request that can be +// satisfied by just performing a preSearch, completely bypassing the +// actual search. +func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { + // if req.Query is not match_none => then we need to go to phase 2 + // to perform the actual query. + if _, ok := req.Query.(*query.MatchNoneQuery); !ok { + return false + } + // req.Query is a match_none query + // + // if request contains facets, we need to perform phase 2 to calculate + // the facet result. Since documents were removed as part of the + // merging process after phase 1, if the facet results were to be calculated + // during phase 1, then they will be now be incorrect, since merging would + // remove some documents. + if req.Facets != nil { + return false + } + // the request is a match_none query and does not contain any facets + // so we can satisfy the request using just the preSearch result. + return true +} + +func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult, + indexes []Index) (map[string]map[string]interface{}, error) { + + distributedHits, err := validateAndDistributeKNNHits([]*search.DocumentMatch(preSearchResult.Hits), indexes) + if err != nil { + return nil, err + } + for _, index := range indexes { + mergedOut[index.Name()][search.KnnPreSearchDataKey] = distributedHits[index.Name()] + } + return mergedOut, nil +} + +func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { + dummyReq.KNN = realReq.KNN + dummyReq.KNNOperator = knnOperatorOr + dummyReq.Explain = realReq.Explain + dummyReq.Fields = realReq.Fields + dummyReq.Sort = realReq.Sort +} + +// the preSearchData for KNN is a list of DocumentMatch objects +// that need to be redistributed to the right index. +// This is used only in the case of an alias tree, where the indexes +// are at the leaves of the tree, and the master alias is at the root. +// At each level of the tree, the preSearchData needs to be redistributed +// to the indexes/aliases at that level. Because the preSearchData is +// specific to each final index at the leaf. 
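A compact sketch of that distribution step, with hypothetical stand-ins for `*search.DocumentMatch` and the alias's children: the last entry of `IndexNames` is the stack top, and each alias level pops it to route the hit one step closer to its leaf index.

```go
package main

import "fmt"

type hit struct {
	ID         string
	IndexNames []string // stack: leaf index pushed first, enclosing aliases after
}

// routeOneLevel mimics one level of validateAndDistributeKNNHits.
func routeOneLevel(hits []hit, children map[string]bool) (map[string][]hit, error) {
	routed := make(map[string][]hit)
	for _, h := range hits {
		if len(h.IndexNames) == 0 || !children[h.IndexNames[len(h.IndexNames)-1]] {
			return nil, fmt.Errorf("two-phase search inconsistency")
		}
		top := h.IndexNames[len(h.IndexNames)-1]
		h.IndexNames = h.IndexNames[:len(h.IndexNames)-1] // pop the stack top
		routed[top] = append(routed[top], h)
	}
	return routed, nil
}

func main() {
	hits := []hit{
		{ID: "doc1", IndexNames: []string{"I1"}},
		{ID: "doc2", IndexNames: []string{"I2"}},
	}
	routed, err := routeOneLevel(hits, map[string]bool{"I1": true, "I2": true})
	if err != nil {
		panic(err)
	}
	fmt.Println(routed) // each hit lands at its leaf index
}
```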
+func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { + knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch) + if !ok { + return nil, fmt.Errorf("request does not have knn preSearchData for redistribution") + } + segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes) + if err != nil { + return nil, err + } + + rv := make(map[string]map[string]interface{}) + for _, index := range indexes { + rv[index.Name()] = make(map[string]interface{}) + } + + for _, index := range indexes { + for k, v := range req.PreSearchData { + switch k { + case search.KnnPreSearchDataKey: + rv[index.Name()][k] = segregatedKnnHits[index.Name()] + default: + rv[index.Name()][k] = v + } + } + } + return rv, nil +} + +func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor { + kArray := make([]int64, len(req.KNN)) + for i, knnReq := range req.KNN { + kArray[i] = knnReq.K + } + knnStore := collector.GetNewKNNCollectorStore(kArray) + return &knnPreSearchResultProcessor{ + addFn: func(sr *SearchResult, indexName string) { + for _, hit := range sr.Hits { + // tag the hit with the index name, so that when the + // final search result is constructed, the hit will have + // a valid path to follow along the alias tree to reach + // the index. + hit.IndexNames = append(hit.IndexNames, indexName) + knnStore.AddDocument(hit) + } + }, + finalizeFn: func(sr *SearchResult) { + // passing nil as the document fixup function, because we don't need to + // fixup the document, since this was already done in the first phase, + // hence error is always nil. + // the merged knn hits are finalized and set in the search result. + sr.Hits, _ = knnStore.Final(nil) + }, + } +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go new file mode 100644 index 0000000000..aff8261155 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go @@ -0,0 +1,207 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !vectors +// +build !vectors + +package bleve + +import ( + "context" + "encoding/json" + "sort" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/collector" + "github.com/blevesearch/bleve/v2/search/query" + index "github.com/blevesearch/bleve_index_api" +) + +// A SearchRequest describes all the parameters +// needed to search the index. +// Query is required. +// Size/From describe how much and which part of the +// result set to return. +// Highlight describes optional search result +// highlighting. +// Fields describes a list of field values which +// should be retrieved for result documents, provided they +// were stored while indexing. +// Facets describe the set of facets to be computed. +// Explain triggers inclusion of additional search +// result score explanations. 
+// Sort describes the desired order for the results to be returned. +// Score controls the kind of scoring performed +// SearchAfter supports deep paging by providing a minimum sort key +// SearchBefore supports deep paging by providing a maximum sort key +// sortFunc specifies the sort implementation to use for sorting results. +// +// A special field named "*" can be used to return all fields. +type SearchRequest struct { + ClientContextID string `json:"client_context_id,omitempty"` + Query query.Query `json:"query"` + Size int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort search.SortOrder `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score,omitempty"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + + // PreSearchData will be a map that will be used + // in the second phase of any 2-phase search, to provide additional + // context to the second phase. This is useful in the case of index + // aliases where the first phase will gather the PreSearchData from all + // the indexes in the alias, and the second phase will use that + // PreSearchData to perform the actual search. + // The currently accepted map configuration is: + // + // "_knn_pre_search_data_key": []*search.DocumentMatch + + PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"` + + sortFunc func(sort.Interface) +} + +// UnmarshalJSON deserializes a JSON representation of +// a SearchRequest +func (r *SearchRequest) UnmarshalJSON(input []byte) error { + var temp struct { + Q json.RawMessage `json:"query"` + Size *int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort []json.RawMessage `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + PreSearchData json.RawMessage `json:"pre_search_data"` + } + + err := json.Unmarshal(input, &temp) + if err != nil { + return err + } + + if temp.Size == nil { + r.Size = 10 + } else { + r.Size = *temp.Size + } + if temp.Sort == nil { + r.Sort = search.SortOrder{&search.SortScore{Desc: true}} + } else { + r.Sort, err = search.ParseSortOrderJSON(temp.Sort) + if err != nil { + return err + } + } + r.From = temp.From + r.Explain = temp.Explain + r.Highlight = temp.Highlight + r.Fields = temp.Fields + r.Facets = temp.Facets + r.IncludeLocations = temp.IncludeLocations + r.Score = temp.Score + r.SearchAfter = temp.SearchAfter + r.SearchBefore = temp.SearchBefore + r.Query, err = query.ParseQuery(temp.Q) + if err != nil { + return err + } + + if r.Size < 0 { + r.Size = 10 + } + if r.From < 0 { + r.From = 0 + } + if temp.PreSearchData != nil { + r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData) + if err != nil { + return err + } + } + + return nil + +} + +// ----------------------------------------------------------------------------- + +func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest { + rv := SearchRequest{ + Query: req.Query, + Size: req.Size + req.From, + From: 0, + Highlight: req.Highlight, + Fields: req.Fields, + Facets: req.Facets, + Explain: req.Explain, + Sort: req.Sort.Copy(), + 
IncludeLocations: req.IncludeLocations, + Score: req.Score, + SearchAfter: req.SearchAfter, + SearchBefore: req.SearchBefore, + PreSearchData: preSearchData, + } + return &rv +} + +func validateKNN(req *SearchRequest) error { + return nil +} + +func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) { + return nil, nil +} + +func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) { +} + +func requestHasKNN(req *SearchRequest) bool { + return false +} + +func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { +} + +func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { + return nil, nil +} + +func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { + return false +} + +func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult, + indexes []Index) (map[string]map[string]interface{}, error) { + return mergedOut, nil +} + +func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { + return knnHits +} + +func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor { + return &knnPreSearchResultProcessor{} // equivalent to nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/util/extract.go b/vendor/github.com/blevesearch/bleve/v2/util/extract.go new file mode 100644 index 0000000000..e963d0c3ad --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/util/extract.go @@ -0,0 +1,62 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package util + +import ( + "math" + "reflect" +) + +// extract numeric value (if possible) and returns a float64 +func ExtractNumericValFloat64(v interface{}) (float64, bool) { + val := reflect.ValueOf(v) + if !val.IsValid() { + return 0, false + } + + switch { + case val.CanFloat(): + return val.Float(), true + case val.CanInt(): + return float64(val.Int()), true + case val.CanUint(): + return float64(val.Uint()), true + } + + return 0, false +} + +// extract numeric value (if possible) and returns a float32 +func ExtractNumericValFloat32(v interface{}) (float32, bool) { + val := reflect.ValueOf(v) + if !val.IsValid() { + return 0, false + } + + switch { + case val.CanFloat(): + floatVal := val.Float() + if floatVal > math.MaxFloat32 { + return 0, false + } + return float32(floatVal), true + case val.CanInt(): + return float32(val.Int()), true + case val.CanUint(): + return float32(val.Uint()), true + } + + return 0, false +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector.go b/vendor/github.com/blevesearch/bleve_index_api/vector.go new file mode 100644 index 0000000000..12c5433467 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve_index_api/vector.go @@ -0,0 +1,69 @@ +// Copyright (c) 2023 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package index + +type VectorField interface { + Vector() []float32 + // Dimensionality of the vector + Dims() int + // Similarity metric to be used for scoring the vectors + Similarity() string + // nlist/nprobe config (recall/latency) the index is optimized for + IndexOptimizedFor() string +} + +// ----------------------------------------------------------------------------- + +const ( + EuclideanDistance = "l2_norm" + + // dotProduct(vecA, vecB) = vecA . vecB = |vecA| * |vecB| * cos(theta); + // where, theta is the angle between vecA and vecB + // If vecA and vecB are normalized (unit magnitude), then + // vecA . vecB = cos(theta), which is the cosine similarity. + // Thus, we don't need a separate similarity type for cosine similarity + CosineSimilarity = "dot_product" +) + +const DefaultSimilarityMetric = EuclideanDistance + +// Supported similarity metrics for vector fields +var SupportedSimilarityMetrics = map[string]struct{}{ + EuclideanDistance: {}, + CosineSimilarity: {}, +} + +// ----------------------------------------------------------------------------- + +const ( + IndexOptimizedForRecall = "recall" + IndexOptimizedForLatency = "latency" +) + +const DefaultIndexOptimization = IndexOptimizedForRecall + +var SupportedVectorIndexOptimizations = map[string]int{ + IndexOptimizedForRecall: 0, + IndexOptimizedForLatency: 1, +} + +// Reverse maps vector index optimizations': int -> string +var VectorIndexOptimizationsReverseLookup = map[int]string{ + 0: IndexOptimizedForRecall, + 1: IndexOptimizedForLatency, +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector_index.go b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go new file mode 100644 index 0000000000..fa736b9691 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go @@ -0,0 +1,74 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package index + +import ( + "context" + "reflect" +) + +var reflectStaticSizeVectorDoc int + +func init() { + var vd VectorDoc + reflectStaticSizeVectorDoc = int(reflect.TypeOf(vd).Size()) +} + +type VectorReader interface { + // Next returns the next document similar to the vector, in this field, or nil + // when it reaches the end of the enumeration. The preAlloced VectorDoc + // is optional, and when non-nil, will be used instead of allocating memory. 
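A stand-alone illustration of the `dot_product` comment above (not vendored code): once two vectors are unit-normalized, their dot product equals the cosine of the angle between them, which is why no separate cosine metric is defined.

```go
package main

import (
	"fmt"
	"math"
)

// normalize scales v to unit length.
func normalize(v []float32) []float32 {
	var sum float64
	for _, x := range v {
		sum += float64(x) * float64(x)
	}
	n := float32(math.Sqrt(sum))
	out := make([]float32, len(v))
	for i, x := range v {
		out[i] = x / n
	}
	return out
}

func dot(a, b []float32) float32 {
	var s float32
	for i := range a {
		s += a[i] * b[i]
	}
	return s
}

func main() {
	a := normalize([]float32{3, 4})
	b := normalize([]float32{4, 3})
	fmt.Println(dot(a, b)) // cos(theta) = 24/25 = 0.96
}
```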
+	Next(preAlloced *VectorDoc) (*VectorDoc, error)
+
+	// Advance resets the enumeration at the specified document or its
+	// immediate follower.
+	Advance(ID IndexInternalID, preAlloced *VectorDoc) (*VectorDoc, error)
+
+	// Count returns the number of documents similar to the vector, in this field.
+	Count() uint64
+	Close() error
+
+	Size() int
+}
+
+type VectorIndexReader interface {
+	VectorReader(ctx context.Context, vector []float32, field string, k int64) (
+		VectorReader, error)
+}
+
+type VectorDoc struct {
+	Vector []float32
+	ID     IndexInternalID
+	Score  float64
+}
+
+func (vd *VectorDoc) Size() int {
+	return reflectStaticSizeVectorDoc + sizeOfPtr + len(vd.Vector) +
+		len(vd.ID)
+}
+
+// Reset allows an already allocated VectorDoc to be reused
+func (vd *VectorDoc) Reset() *VectorDoc {
+	// remember the []byte used for the ID
+	id := vd.ID
+	// idiom to copy over from empty VectorDoc (0 allocations)
+	*vd = VectorDoc{}
+	// reuse the []byte already allocated (and reset len to 0)
+	vd.ID = id[:0]
+	return vd
+}
diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector_optimize.go b/vendor/github.com/blevesearch/bleve_index_api/vector_optimize.go
new file mode 100644
index 0000000000..e8f005539f
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve_index_api/vector_optimize.go
@@ -0,0 +1,38 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package index
+
+// VectorOptimizable represents an optional interface that can be implemented
+// by optimizable resources (e.g., VectorReaders, Searchers). These
+// optimizable resources are provided the same OptimizableContext
+// instance, so that they can coordinate via dynamic interface
+// casting. This keeps KNNSearchers' OptimizableContext from being cast to
+// the ones used for TFRs, term searchers, etc.
+
+import "context"

+type VectorOptimizable interface {
+	VectorOptimize(ctx context.Context, octx VectorOptimizableContext) (VectorOptimizableContext, error)
+}
+
+type VectorOptimizableContext interface {
+	// Once all the optimizable resources have been provided the same
+	// OptimizableContext instance, the optimization preparations are
+	// finished or completed via the Finish() method.
+	Finish() error
+}
diff --git a/vendor/github.com/blevesearch/go-faiss/LICENSE b/vendor/github.com/blevesearch/go-faiss/LICENSE
new file mode 100644
index 0000000000..16c42bc8ec
--- /dev/null
+++ b/vendor/github.com/blevesearch/go-faiss/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Paul Ouellette
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/vendor/github.com/blevesearch/go-faiss/README.md b/vendor/github.com/blevesearch/go-faiss/README.md
new file mode 100644
index 0000000000..dd3aa66ef4
--- /dev/null
+++ b/vendor/github.com/blevesearch/go-faiss/README.md
@@ -0,0 +1,43 @@
+# go-faiss
+
+[![Go Reference](https://pkg.go.dev/badge/github.com/DataIntelligenceCrew/go-faiss.svg)](https://pkg.go.dev/github.com/DataIntelligenceCrew/go-faiss)
+
+Go bindings for [Faiss](https://github.com/facebookresearch/faiss), a library for vector similarity search.
+
+## Install
+
+First you will need to build and install Faiss:
+
+```
+git clone https://github.com/blevesearch/faiss.git
+cd faiss
+cmake -B build -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_C_API=ON -DBUILD_SHARED_LIBS=ON .
+make -C build
+sudo make -C build install
+```
+
+On macOS ARM64, the instructions need to be adjusted slightly, based on https://github.com/facebookresearch/faiss/issues/2111:
+
+```
+LDFLAGS="-L/opt/homebrew/opt/llvm/lib" CPPFLAGS="-I/opt/homebrew/opt/llvm/include" CXX=/opt/homebrew/opt/llvm/bin/clang++ CC=/opt/homebrew/opt/llvm/bin/clang cmake -B build -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_C_API=ON -DBUILD_SHARED_LIBS=ON .
+// set FAISS_ENABLE_PYTHON to OFF in CMakeLists.txt to ignore libpython dylib
+make -C build
+sudo make -C build install
+```
+
+Building will produce the dynamic library `faiss_c`.
+You will need to install it in a place where your system will find it (e.g. `/usr/local/lib` on mac or `/usr/lib` on Linux).
+You can do this with:
+
+    sudo cp build/c_api/libfaiss_c.so /usr/local/lib
+
+Now you can install the Go module:
+
+    go get github.com/blevesearch/go-faiss
+
+## Usage
+
+API documentation is available at https://pkg.go.dev/github.com/DataIntelligenceCrew/go-faiss.
+See the [Faiss wiki](https://github.com/facebookresearch/faiss/wiki) for more information.
+
+Examples can be found in the [_example](_example) directory.
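Building on the README, here is a minimal end-to-end sketch of these bindings, assuming Faiss and `libfaiss_c` are installed as described above; the dimensionality and vectors are illustrative.

```go
package main

import (
	"fmt"

	faiss "github.com/blevesearch/go-faiss"
)

func main() {
	const d = 4 // illustrative dimensionality
	idx, err := faiss.NewIndexFlatL2(d)
	if err != nil {
		panic(err)
	}
	defer idx.Close()

	// two 4-dim vectors, flattened row-major as the API expects
	err = idx.Add([]float32{
		0.1, 0.2, 0.3, 0.4,
		0.9, 0.8, 0.7, 0.6,
	})
	if err != nil {
		panic(err)
	}

	dists, labels, err := idx.Search([]float32{0.1, 0.2, 0.3, 0.4}, 1)
	if err != nil {
		panic(err)
	}
	fmt.Println(dists, labels) // nearest neighbour of the query is id 0
}
```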
diff --git a/vendor/github.com/blevesearch/go-faiss/autotune.go b/vendor/github.com/blevesearch/go-faiss/autotune.go new file mode 100644 index 0000000000..0c06c4cc18 --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/autotune.go @@ -0,0 +1,50 @@ +package faiss + +/* +#include +#include +*/ +import "C" +import ( + "runtime" + "unsafe" +) + +type ParameterSpace struct { + ps *C.FaissParameterSpace +} + +// NewParameterSpace creates a new ParameterSpace. +func NewParameterSpace() (*ParameterSpace, error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var ps *C.FaissParameterSpace + if c := C.faiss_ParameterSpace_new(&ps); c != 0 { + return nil, getLastError() + } + return &ParameterSpace{ps}, nil +} + +// SetIndexParameter sets one of the parameters. +func (p *ParameterSpace) SetIndexParameter(idx Index, name string, val float64) error { + runtime.LockOSThread() + cname := C.CString(name) + + defer func() { + C.free(unsafe.Pointer(cname)) + runtime.UnlockOSThread() + }() + + c := C.faiss_ParameterSpace_set_index_parameter( + p.ps, idx.cPtr(), cname, C.double(val)) + if c != 0 { + return getLastError() + } + return nil +} + +// Delete frees the memory associated with p. +func (p *ParameterSpace) Delete() { + C.faiss_ParameterSpace_free(p.ps) +} diff --git a/vendor/github.com/blevesearch/go-faiss/faiss.go b/vendor/github.com/blevesearch/go-faiss/faiss.go new file mode 100644 index 0000000000..4a73f760fe --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/faiss.go @@ -0,0 +1,30 @@ +// Package faiss provides bindings to Faiss, a library for vector similarity +// search. +// More detailed documentation can be found at the Faiss wiki: +// https://github.com/facebookresearch/faiss/wiki. +package faiss + +/* +#cgo LDFLAGS: -lfaiss_c + +#include +#include +*/ +import "C" +import "errors" + +func getLastError() error { + return errors.New(C.GoString(C.faiss_get_last_error())) +} + +// Metric type +const ( + MetricInnerProduct = C.METRIC_INNER_PRODUCT + MetricL2 = C.METRIC_L2 + MetricL1 = C.METRIC_L1 + MetricLinf = C.METRIC_Linf + MetricLp = C.METRIC_Lp + MetricCanberra = C.METRIC_Canberra + MetricBrayCurtis = C.METRIC_BrayCurtis + MetricJensenShannon = C.METRIC_JensenShannon +) diff --git a/vendor/github.com/blevesearch/go-faiss/index.go b/vendor/github.com/blevesearch/go-faiss/index.go new file mode 100644 index 0000000000..76bc1758bf --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/index.go @@ -0,0 +1,378 @@ +package faiss + +/* +#include +#include +#include +#include +#include +#include +#include +#include +*/ +import "C" +import ( + "fmt" + "runtime" + "unsafe" +) + +// Index is a Faiss index. +// +// Note that some index implementations do not support all methods. +// Check the Faiss wiki to see what operations an index supports. +type Index interface { + // D returns the dimension of the indexed vectors. + D() int + + // IsTrained returns true if the index has been trained or does not require + // training. + IsTrained() bool + + // Ntotal returns the number of indexed vectors. + Ntotal() int64 + + // MetricType returns the metric type of the index. + MetricType() int + + // Train trains the index on a representative set of vectors. + Train(x []float32) error + + // Add adds vectors to the index. + Add(x []float32) error + + // AddWithIDs is like Add, but stores xids instead of sequential IDs. + AddWithIDs(x []float32, xids []int64) error + + // Search queries the index with the vectors in x. 
+ // Returns the IDs of the k nearest neighbors for each query vector and the + // corresponding distances. + Search(x []float32, k int64) (distances []float32, labels []int64, err error) + + SearchWithoutIDs(x []float32, k int64, exclude []int64) (distances []float32, + labels []int64, err error) + + Reconstruct(key int64) ([]float32, error) + + ReconstructBatch(keys []int64, recons []float32) ([]float32, error) + + MergeFrom(other Index, add_id int64) error + + // RangeSearch queries the index with the vectors in x. + // Returns all vectors with distance < radius. + RangeSearch(x []float32, radius float32) (*RangeSearchResult, error) + + // Reset removes all vectors from the index. + Reset() error + + // RemoveIDs removes the vectors specified by sel from the index. + // Returns the number of elements removed and error. + RemoveIDs(sel *IDSelector) (int, error) + + // Close frees the memory used by the index. + Close() + + // consults the C++ side to get the size of the index + Size() uint64 + + cPtr() *C.FaissIndex +} + +type faissIndex struct { + idx *C.FaissIndex +} + +func (idx *faissIndex) cPtr() *C.FaissIndex { + return idx.idx +} + +func (idx *faissIndex) Size() uint64 { + size := C.faiss_Index_size(idx.idx) + return uint64(size) +} + +func (idx *faissIndex) D() int { + return int(C.faiss_Index_d(idx.idx)) +} + +func (idx *faissIndex) IsTrained() bool { + return C.faiss_Index_is_trained(idx.idx) != 0 +} + +func (idx *faissIndex) Ntotal() int64 { + return int64(C.faiss_Index_ntotal(idx.idx)) +} + +func (idx *faissIndex) MetricType() int { + return int(C.faiss_Index_metric_type(idx.idx)) +} + +func (idx *faissIndex) Train(x []float32) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + n := len(x) / idx.D() + if c := C.faiss_Index_train(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 { + return getLastError() + } + return nil +} + +func (idx *faissIndex) Add(x []float32) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + n := len(x) / idx.D() + if c := C.faiss_Index_add(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 { + return getLastError() + } + return nil +} + +func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + n := len(x) / idx.D() + if c := C.faiss_Index_add_with_ids( + idx.idx, + C.idx_t(n), + (*C.float)(&x[0]), + (*C.idx_t)(&xids[0]), + ); c != 0 { + return getLastError() + } + return nil +} + +func (idx *faissIndex) Search(x []float32, k int64) ( + distances []float32, labels []int64, err error, +) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + n := len(x) / idx.D() + distances = make([]float32, int64(n)*k) + labels = make([]int64, int64(n)*k) + if c := C.faiss_Index_search( + idx.idx, + C.idx_t(n), + (*C.float)(&x[0]), + C.idx_t(k), + (*C.float)(&distances[0]), + (*C.idx_t)(&labels[0]), + ); c != 0 { + err = getLastError() + } + + return +} + +func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64) ( + distances []float32, labels []int64, err error, +) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if len(exclude) <= 0 { + return idx.Search(x, k) + } + + excludeSelector, err := NewIDSelectorNot(exclude) + if err != nil { + return nil, nil, err + } + + var sp *C.FaissSearchParameters + C.faiss_SearchParameters_new(&sp, (*C.FaissIDSelector)(excludeSelector.sel)) + ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) + if ivfPtr != nil { + sp = C.faiss_SearchParametersIVF_cast(sp) + 
C.faiss_SearchParametersIVF_new_with_sel(&sp, (*C.FaissIDSelector)(excludeSelector.sel)) + } + + n := len(x) / idx.D() + distances = make([]float32, int64(n)*k) + labels = make([]int64, int64(n)*k) + + if c := C.faiss_Index_search_with_params( + idx.idx, + C.idx_t(n), + (*C.float)(&x[0]), + C.idx_t(k), sp, + (*C.float)(&distances[0]), + (*C.idx_t)(&labels[0]), + ); c != 0 { + err = getLastError() + } + excludeSelector.Delete() + C.faiss_SearchParameters_free(sp) + return +} + +func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + rv := make([]float32, idx.D()) + if c := C.faiss_Index_reconstruct( + idx.idx, + C.idx_t(key), + (*C.float)(&rv[0]), + ); c != 0 { + err = getLastError() + } + + return rv, err +} + +func (idx *faissIndex) ReconstructBatch(keys []int64, recons []float32) ([]float32, error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var err error + n := int64(len(keys)) + if c := C.faiss_Index_reconstruct_batch( + idx.idx, + C.idx_t(n), + (*C.idx_t)(&keys[0]), + (*C.float)(&recons[0]), + ); c != 0 { + err = getLastError() + } + + return recons, err +} + +func (i *IndexImpl) MergeFrom(other Index, add_id int64) error { + if impl, ok := other.(*IndexImpl); ok { + return i.Index.MergeFrom(impl.Index, add_id) + } + return fmt.Errorf("merge not support") +} + +func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + otherIdx, ok := other.(*faissIndex) + if !ok { + return fmt.Errorf("merge api not supported") + } + + if c := C.faiss_Index_merge_from( + idx.idx, + otherIdx.idx, + (C.idx_t)(add_id), + ); c != 0 { + err = getLastError() + } + + return err +} + +func (idx *faissIndex) RangeSearch(x []float32, radius float32) ( + *RangeSearchResult, error, +) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + n := len(x) / idx.D() + var rsr *C.FaissRangeSearchResult + if c := C.faiss_RangeSearchResult_new(&rsr, C.idx_t(n)); c != 0 { + return nil, getLastError() + } + if c := C.faiss_Index_range_search( + idx.idx, + C.idx_t(n), + (*C.float)(&x[0]), + C.float(radius), + rsr, + ); c != 0 { + return nil, getLastError() + } + return &RangeSearchResult{rsr}, nil +} + +func (idx *faissIndex) Reset() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if c := C.faiss_Index_reset(idx.idx); c != 0 { + return getLastError() + } + return nil +} + +func (idx *faissIndex) RemoveIDs(sel *IDSelector) (int, error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var nRemoved C.size_t + if c := C.faiss_Index_remove_ids(idx.idx, sel.sel, &nRemoved); c != 0 { + return 0, getLastError() + } + return int(nRemoved), nil +} + +func (idx *faissIndex) Close() { + C.faiss_Index_free(idx.idx) +} + +// RangeSearchResult is the result of a range search. +type RangeSearchResult struct { + rsr *C.FaissRangeSearchResult +} + +// Nq returns the number of queries. +func (r *RangeSearchResult) Nq() int { + return int(C.faiss_RangeSearchResult_nq(r.rsr)) +} + +// Lims returns a slice containing start and end indices for queries in the +// distances and labels slices returned by Labels. +func (r *RangeSearchResult) Lims() []int { + var lims *C.size_t + C.faiss_RangeSearchResult_lims(r.rsr, &lims) + length := r.Nq() + 1 + return (*[1 << 30]int)(unsafe.Pointer(lims))[:length:length] +} + +// Labels returns the unsorted IDs and respective distances for each query. 
+// The result for query i is labels[lims[i]:lims[i+1]]. +func (r *RangeSearchResult) Labels() (labels []int64, distances []float32) { + lims := r.Lims() + length := lims[len(lims)-1] + var clabels *C.idx_t + var cdist *C.float + C.faiss_RangeSearchResult_labels(r.rsr, &clabels, &cdist) + labels = (*[1 << 30]int64)(unsafe.Pointer(clabels))[:length:length] + distances = (*[1 << 30]float32)(unsafe.Pointer(cdist))[:length:length] + return +} + +// Delete frees the memory associated with r. +func (r *RangeSearchResult) Delete() { + C.faiss_RangeSearchResult_free(r.rsr) +} + +// IndexImpl is an abstract structure for an index. +type IndexImpl struct { + Index +} + +// IndexFactory builds a composite index. +// description is a comma-separated list of components. +func IndexFactory(d int, description string, metric int) (*IndexImpl, error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + cdesc := C.CString(description) + defer C.free(unsafe.Pointer(cdesc)) + var idx faissIndex + c := C.faiss_index_factory(&idx.idx, C.int(d), cdesc, C.FaissMetricType(metric)) + if c != 0 { + return nil, getLastError() + } + return &IndexImpl{&idx}, nil +} diff --git a/vendor/github.com/blevesearch/go-faiss/index_flat.go b/vendor/github.com/blevesearch/go-faiss/index_flat.go new file mode 100644 index 0000000000..b8a3c03880 --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/index_flat.go @@ -0,0 +1,56 @@ +package faiss + +/* +#include +#include +*/ +import "C" +import "unsafe" + +// IndexFlat is an index that stores the full vectors and performs exhaustive +// search. +type IndexFlat struct { + Index +} + +// NewIndexFlat creates a new flat index. +func NewIndexFlat(d int, metric int) (*IndexFlat, error) { + var idx faissIndex + if c := C.faiss_IndexFlat_new_with( + &idx.idx, + C.idx_t(d), + C.FaissMetricType(metric), + ); c != 0 { + return nil, getLastError() + } + return &IndexFlat{&idx}, nil +} + +// NewIndexFlatIP creates a new flat index with the inner product metric type. +func NewIndexFlatIP(d int) (*IndexFlat, error) { + return NewIndexFlat(d, MetricInnerProduct) +} + +// NewIndexFlatL2 creates a new flat index with the L2 metric type. +func NewIndexFlatL2(d int) (*IndexFlat, error) { + return NewIndexFlat(d, MetricL2) +} + +// Xb returns the index's vectors. +// The returned slice becomes invalid after any add or remove operation. +func (idx *IndexFlat) Xb() []float32 { + var size C.size_t + var ptr *C.float + C.faiss_IndexFlat_xb(idx.cPtr(), &ptr, &size) + return (*[1 << 30]float32)(unsafe.Pointer(ptr))[:size:size] +} + +// AsFlat casts idx to a flat index. +// AsFlat panics if idx is not a flat index. +func (idx *IndexImpl) AsFlat() *IndexFlat { + ptr := C.faiss_IndexFlat_cast(idx.cPtr()) + if ptr == nil { + panic("index is not a flat index") + } + return &IndexFlat{&faissIndex{ptr}} +} diff --git a/vendor/github.com/blevesearch/go-faiss/index_io.go b/vendor/github.com/blevesearch/go-faiss/index_io.go new file mode 100644 index 0000000000..ba8eaf7e7e --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/index_io.go @@ -0,0 +1,123 @@ +package faiss + +/* +#include +#include +#include +#include +*/ +import "C" +import ( + "runtime" + "unsafe" +) + +// WriteIndex writes an index to a file. 
+func WriteIndex(idx Index, filename string) error {
+	cfname := C.CString(filename)
+	defer C.free(unsafe.Pointer(cfname))
+	if c := C.faiss_write_index_fname(idx.cPtr(), cfname); c != 0 {
+		return getLastError()
+	}
+	return nil
+}
+
+func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	// the values to be returned by the faiss APIs
+	tempBuf := (*C.uchar)(C.malloc(C.size_t(0)))
+	bufSize := C.size_t(0)
+
+	if c := C.faiss_write_index_buf(
+		idx.cPtr(),
+		&bufSize,
+		&tempBuf,
+	); c != 0 {
+		C.free(unsafe.Pointer(tempBuf))
+		return nil, getLastError()
+	}
+
+	// at this point, idx has a valid ref count. furthermore, the index lives
+	// in C memory space, so it is not visible to Go's GC and needs to be
+	// freed explicitly once it is of no more use.
+
+	// todo: add checksum.
+	// the content populated in tempBuf is converted from *C.uchar to
+	// unsafe.Pointer, and the pointer is then cast into a large byte slice,
+	// which is sliced to a length and capacity equal to the bufSize returned
+	// across the cgo interface.
+	// NOTE: it still points to the C memory though.
+	// bufSize is of type size_t, which is equivalent to a uint in Go, so the
+	// conversion is safe.
+	val := unsafe.Slice((*byte)(unsafe.Pointer(tempBuf)), uint(bufSize))
+
+	// NOTE: This method is compatible with 64-bit systems but may encounter
+	// issues on 32-bit systems, so vector indexing is supported only on 64-bit
+	// systems. This limitation arises because the maximum allowed length of a
+	// slice on 32-bit systems is math.MaxInt32 (2^31-1), whereas the maximum
+	// value of a size_t in C++ is math.MaxUint32 (2^32-1), exceeding the
+	// maximum allowed size of a slice in Go. Consequently, the bufSize returned
+	// by faiss_write_index_buf might exceed the maximum allowed size of a slice
+	// in Go, leading to a panic when attempting to create the following slice rv.
+	rv := make([]byte, uint(bufSize))
+	// an explicit copy is necessary to free the memory on the C heap and then
+	// return rv to the caller, which lives in Go runtime space (and will be
+	// GC'd later on).
+	//
+	// an optimization over here - create a buffer pool which can be used to
+	// make the memory allocations cheaper. specifically, two separate pools can
+	// be utilized, one for C pointers and another for the Go runtime. within
+	// faiss_write_index_buf, a cheaper calloc rather than malloc can be used to
+	// make any extra allocations cheaper.
+	copy(rv, val)
+
+	// safe to free the C memory allocated while serializing the index;
+	// rv is from the Go runtime - so a different address space altogether
+	C.free(unsafe.Pointer(tempBuf))
+	// p.s: no need to free "val" since its underlying memory is the same as
+	// tempBuf (freed above)
+	val = nil
+
+	return rv, nil
+}
+
+func ReadIndexFromBuffer(buf []byte, ioflags int) (*IndexImpl, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	ptr := (*C.uchar)(unsafe.Pointer(&buf[0]))
+	size := C.size_t(len(buf))
+
+	// the idx var has C.FaissIndex within the struct, which is nil as of now.
+	var idx faissIndex
+	if c := C.faiss_read_index_buf(ptr,
+		size,
+		C.int(ioflags),
+		&idx.idx); c != 0 {
+		return nil, getLastError()
+	}
+
+	ptr = nil
+
+	// after exiting faiss_read_index_buf, the ref count of the memory allocated
+	// for the freshly created faiss::index becomes 1 (held by idx.idx of type
+	// C.FaissIndex). this is allocated on the C heap, so it is not visible to
+	// Go's GC, and hence needs to be cleaned up once the index is no longer
+	// being used - to be done at the zap layer.
+	return &IndexImpl{&idx}, nil
+}
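Tying the serialization and selector APIs together, a hedged sketch of a round trip; the `"IDMap2,Flat"` factory description is an assumption modeled on how the zap segment layer typically builds these indexes, not something this diff pins down.

```go
package main

import (
	"fmt"

	faiss "github.com/blevesearch/go-faiss"
)

func main() {
	// ID-mapped flat index: lets us store external doc numbers directly.
	// ("IDMap2,Flat" is an assumed description string, see lead-in above.)
	idx, err := faiss.IndexFactory(2, "IDMap2,Flat", faiss.MetricL2)
	if err != nil {
		panic(err)
	}
	if err := idx.AddWithIDs([]float32{0, 1, 1, 0}, []int64{42, 99}); err != nil {
		panic(err)
	}

	// Serialize into a Go-owned []byte, then free the original C-side index.
	buf, err := faiss.WriteIndexIntoBuffer(idx)
	if err != nil {
		panic(err)
	}
	idx.Close()

	idx2, err := faiss.ReadIndexFromBuffer(buf, faiss.IOFlagReadOnly)
	if err != nil {
		panic(err)
	}
	defer idx2.Close()

	// Search while excluding doc 42, leaving 99 as the only candidate.
	dists, labels, err := idx2.SearchWithoutIDs([]float32{0, 1}, 1, []int64{42})
	if err != nil {
		panic(err)
	}
	fmt.Println(dists, labels) // expect labels == [99]
}
```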
+    // to be cleaned up after the index is no longer being used - to be done at the zap layer.
+    return &IndexImpl{&idx}, nil
+}
+
+const (
+    IOFlagMmap     = C.FAISS_IO_FLAG_MMAP
+    IOFlagReadOnly = C.FAISS_IO_FLAG_READ_ONLY
+)
+
+// ReadIndex reads an index from a file.
+func ReadIndex(filename string, ioflags int) (*IndexImpl, error) {
+    cfname := C.CString(filename)
+    defer C.free(unsafe.Pointer(cfname))
+    var idx faissIndex
+    if c := C.faiss_read_index_fname(cfname, C.int(ioflags), &idx.idx); c != 0 {
+        return nil, getLastError()
+    }
+    return &IndexImpl{&idx}, nil
+}
diff --git a/vendor/github.com/blevesearch/go-faiss/index_ivf.go b/vendor/github.com/blevesearch/go-faiss/index_ivf.go
new file mode 100644
index 0000000000..88266f1155
--- /dev/null
+++ b/vendor/github.com/blevesearch/go-faiss/index_ivf.go
@@ -0,0 +1,58 @@
+package faiss
+
+/*
+#include <faiss/c_api/Index_c.h>
+#include <faiss/c_api/IndexIVF_c.h>
+#include <faiss/c_api/IndexIVF_c_ex.h>
+#include <faiss/c_api/MetaIndexes_c.h>
+#include <faiss/c_api/MetaIndexes_c_ex.h>
+*/
+import "C"
+import (
+    "fmt"
+    "runtime"
+)
+
+func (idx *IndexImpl) SetDirectMap(mapType int) (err error) {
+    runtime.LockOSThread()
+    defer runtime.UnlockOSThread()
+
+    ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
+    if ivfPtr == nil {
+        return fmt.Errorf("index is not of ivf type")
+    }
+    if c := C.faiss_IndexIVF_set_direct_map(
+        ivfPtr,
+        C.int(mapType),
+    ); c != 0 {
+        err = getLastError()
+    }
+    return err
+}
+
+func (idx *IndexImpl) GetSubIndex() (*IndexImpl, error) {
+    runtime.LockOSThread()
+    defer runtime.UnlockOSThread()
+
+    ptr := C.faiss_IndexIDMap2_cast(idx.cPtr())
+    if ptr == nil {
+        return nil, fmt.Errorf("index is not an id map")
+    }
+
+    subIdx := C.faiss_IndexIDMap2_sub_index(ptr)
+    if subIdx == nil {
+        return nil, fmt.Errorf("couldn't retrieve the sub index")
+    }
+
+    return &IndexImpl{&faissIndex{subIdx}}, nil
+}
+
+// SetNProbe sets nprobe on the index; it applies to IVF indexes only.
+// Varying nprobe trades recall against latency: higher values improve
+// recall but increase search time.
+func (idx *IndexImpl) SetNProbe(nprobe int32) {
+    ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
+    if ivfPtr == nil {
+        return
+    }
+    C.faiss_IndexIVF_set_nprobe(ivfPtr, C.ulong(nprobe))
+}
diff --git a/vendor/github.com/blevesearch/go-faiss/selector.go b/vendor/github.com/blevesearch/go-faiss/selector.go
new file mode 100644
index 0000000000..84161a5073
--- /dev/null
+++ b/vendor/github.com/blevesearch/go-faiss/selector.go
@@ -0,0 +1,67 @@
+package faiss
+
+/*
+#include <faiss/c_api/impl/AuxIndexStructures_c.h>
+*/
+import "C"
+import "runtime"
+
+// IDSelector represents a set of IDs to remove.
+type IDSelector struct {
+    sel *C.FaissIDSelector
+}
+
+// NewIDSelectorRange creates a selector that removes IDs in [imin, imax).
+func NewIDSelectorRange(imin, imax int64) (*IDSelector, error) {
+    runtime.LockOSThread()
+    defer runtime.UnlockOSThread()
+
+    var sel *C.FaissIDSelectorRange
+    c := C.faiss_IDSelectorRange_new(&sel, C.idx_t(imin), C.idx_t(imax))
+    if c != 0 {
+        return nil, getLastError()
+    }
+    return &IDSelector{(*C.FaissIDSelector)(sel)}, nil
+}
+
+// NewIDSelectorBatch creates a new batch selector.
+func NewIDSelectorBatch(indices []int64) (*IDSelector, error) {
+    runtime.LockOSThread()
+    defer runtime.UnlockOSThread()
+
+    var sel *C.FaissIDSelectorBatch
+    if c := C.faiss_IDSelectorBatch_new(
+        &sel,
+        C.size_t(len(indices)),
+        (*C.idx_t)(&indices[0]),
+    ); c != 0 {
+        return nil, getLastError()
+    }
+    return &IDSelector{(*C.FaissIDSelector)(sel)}, nil
+}
+
+// NewIDSelectorNot creates a new Not selector, wrapped around a
+// batch selector, with the IDs in 'exclude'.
+func NewIDSelectorNot(exclude []int64) (*IDSelector, error) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + batchSelector, err := NewIDSelectorBatch(exclude) + if err != nil { + return nil, err + } + + var sel *C.FaissIDSelectorNot + if c := C.faiss_IDSelectorNot_new( + &sel, + batchSelector.sel, + ); c != 0 { + return nil, getLastError() + } + return &IDSelector{(*C.FaissIDSelector)(sel)}, nil +} + +// Delete frees the memory associated with s. +func (s *IDSelector) Delete() { + C.faiss_IDSelector_free(s.sel) +} diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go index 37d04c47ef..8e4a3d99cd 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go @@ -168,3 +168,13 @@ type DocVisitState interface { type StatsReporter interface { ReportBytesWritten(bytesWritten uint64) } + +type FieldStatsReporter interface { + UpdateFieldStats(FieldStats) +} + +type FieldStats interface { + Store(statName, fieldName string, value uint64) + Aggregate(stats FieldStats) + Fetch() map[string]map[string]uint64 +} diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go new file mode 100644 index 0000000000..bc00796cca --- /dev/null +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go @@ -0,0 +1,74 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package segment + +import ( + "github.com/RoaringBitmap/roaring" +) + +type VecPostingsList interface { + DiskStatsReporter + + Iterator(prealloc VecPostingsIterator) VecPostingsIterator + + Size() int + + Count() uint64 + + // NOTE deferred for future work + + // And(other PostingsList) PostingsList + // Or(other PostingsList) PostingsList +} + +type VecPostingsIterator interface { + DiskStatsReporter + + // The caller is responsible for copying whatever it needs from + // the returned Posting instance before calling Next(), as some + // implementations may return a shared instance to reduce memory + // allocations. + Next() (VecPosting, error) + + // Advance will return the posting with the specified doc number + // or if there is no such posting, the next posting. + // Callers MUST NOT attempt to pass a docNum that is less than or + // equal to the currently visited posting doc Num. 
+ Advance(docNum uint64) (VecPosting, error) + + Size() int +} + +type VectorIndex interface { + Search(qVector []float32, k int64, except *roaring.Bitmap) (VecPostingsList, error) + Close() + Size() uint64 +} + +type VectorSegment interface { + Segment + InterpretVectorIndex(field string) (VectorIndex, error) +} + +type VecPosting interface { + Number() uint64 + + Score() float32 + + Size() int +} diff --git a/vendor/github.com/blevesearch/zapx/v16/.gitignore b/vendor/github.com/blevesearch/zapx/v16/.gitignore new file mode 100644 index 0000000000..46d1cfad54 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/.gitignore @@ -0,0 +1,12 @@ +#* +*.sublime-* +*~ +.#* +.project +.settings +**/.idea/ +**/*.iml +.DS_Store +/cmd/zap/zap +*.test +tags diff --git a/vendor/github.com/blevesearch/zapx/v16/.golangci.yml b/vendor/github.com/blevesearch/zapx/v16/.golangci.yml new file mode 100644 index 0000000000..1d55bfc00d --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/.golangci.yml @@ -0,0 +1,28 @@ +linters: + # please, do not use `enable-all`: it's deprecated and will be removed soon. + # inverted configuration with `enable-all` and `disable` is not scalable during updates of golangci-lint + disable-all: true + enable: + - bodyclose + - deadcode + - depguard + - dupl + - errcheck + - gofmt + - goimports + - goprintffuncname + - gosec + - gosimple + - govet + - ineffassign + - misspell + - nakedret + - nolintlint + - rowserrcheck + - staticcheck + - structcheck + - typecheck + - unused + - varcheck + - whitespace + diff --git a/vendor/github.com/blevesearch/zapx/v16/LICENSE b/vendor/github.com/blevesearch/zapx/v16/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/vendor/github.com/blevesearch/zapx/v16/README.md b/vendor/github.com/blevesearch/zapx/v16/README.md
new file mode 100644
index 0000000000..4cbf1a145b
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/README.md
@@ -0,0 +1,163 @@
+# zapx file format
+
+The zapx module is a fork of the [zap](https://github.com/blevesearch/zap) module which maintains file format compatibility, but removes the dependency on bleve, and instead depends only on the independent interface modules:
+
+- [bleve_index_api](https://github.com/blevesearch/bleve_index_api)
+- [scorch_segment_api](https://github.com/blevesearch/scorch_segment_api)
+
+Advanced ZAP File Format Documentation is [here](zap.md).
+
+The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written.
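+
+As a rough illustration of this single-pass pattern - a hedged sketch, not the actual zap writer; all names and section payloads below are hypothetical:
+
+```go
+package main
+
+import (
+    "bytes"
+    "encoding/binary"
+    "fmt"
+)
+
+// writeSegmentSketch writes two fake data sections first, then a footer
+// that records where each section started, mirroring how everything can
+// be written in a single forward pass.
+func writeSegmentSketch() []byte {
+    var buf bytes.Buffer
+
+    storedOffset := uint64(buf.Len()) // offset of the "stored data" section
+    buf.WriteString("...stored field data...")
+
+    fieldsOffset := uint64(buf.Len()) // offset of the "fields index" section
+    buf.WriteString("...fields index...")
+
+    // the footer comes last, once all earlier offsets are known
+    var footer [16]byte
+    binary.BigEndian.PutUint64(footer[0:], storedOffset)
+    binary.BigEndian.PutUint64(footer[8:], fieldsOffset)
+    buf.Write(footer[:])
+    return buf.Bytes()
+}
+
+func main() {
+    fmt.Println(len(writeSegmentSketch()))
+}
+```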
+
+Current usage:
+
+- mmap the entire file
+- crc-32 bytes and version are in fixed position at end of the file
+- reading remainder of footer could be version specific
+- remainder of footer gives us:
+  - 3 important offsets (docValue, fields index and stored data index)
+  - 2 important values (number of docs and chunk factor)
+- field data is processed once and memoized onto the heap so that we never have to go back to disk for it
+- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. The first bytes of that section tell us the size of data so that we know where it ends.
+- access to all other indexed data follows the following pattern:
+  - first know the field name -> convert to id
+  - next navigate to term dictionary for that field
+  - some operations stop here and do dictionary ops
+  - next use dictionary to navigate to posting list for a specific term
+  - walk posting list
+  - if necessary, walk posting details as we go
+  - if location info is desired, consult location bitmap to see if it is there
+
+## stored fields section
+
+- for each document
+  - preparation phase:
+    - produce a slice of metadata bytes and data bytes
+    - produce these slices in field id order
+    - field value is appended to the data slice
+    - metadata slice is varint encoded with the following values for each field value
+      - field id (uint16)
+      - field type (byte)
+      - field value start offset in uncompressed data slice (uint64)
+      - field value length (uint64)
+      - field number of array positions (uint64)
+      - one additional value for each array position (uint64)
+    - compress the data slice using snappy
+  - file writing phase:
+    - remember the start offset for this document
+    - write out metadata length (varint uint64)
+    - write out compressed data length (varint uint64)
+    - write out the metadata bytes
+    - write out the compressed data bytes
+
+## stored fields idx
+
+- for each document
+  - write start offset (remembered from previous section) of stored data (big endian uint64)
+
+With this index and a known document number, we have direct access to all the stored field data.
+
+## posting details (freq/norm) section
+
+- for each posting list
+  - produce a slice containing multiple consecutive chunks (each chunk is a varint stream)
+  - produce a slice remembering offsets of where each chunk starts
+  - preparation phase:
+    - for each hit in the posting list
+    - if this hit is in next chunk close out encoding of last chunk and record offset start of next
+    - encode term frequency (uint64)
+    - encode norm factor (float32)
+  - file writing phase:
+    - remember start position for this posting list details
+    - write out number of chunks that follow (varint uint64)
+    - write out length of each chunk (each a varint uint64)
+    - write out the byte slice containing all the chunk data
+
+If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
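+
+For example, the chunk jump described above is plain integer division (a sketch; `chunkForDoc` is an illustrative name of ours, not zap's API):
+
+```go
+package main
+
+import "fmt"
+
+// chunkForDoc returns the chunk that holds docNum, given the chunk factor.
+func chunkForDoc(docNum, chunkFactor uint64) uint64 {
+    return docNum / chunkFactor
+}
+
+func main() {
+    fmt.Println(chunkForDoc(5000, 1024)) // doc 5000 with factor 1024 -> chunk 4
+}
+```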
+
+## posting details (location) section
+
+- for each posting list
+  - produce a slice containing multiple consecutive chunks (each chunk is a varint stream)
+  - produce a slice remembering offsets of where each chunk starts
+  - preparation phase:
+    - for each hit in the posting list
+    - if this hit is in next chunk close out encoding of last chunk and record offset start of next
+    - encode field (uint16)
+    - encode field pos (uint64)
+    - encode field start (uint64)
+    - encode field end (uint64)
+    - encode number of array positions to follow (uint64)
+    - encode each array position (each uint64)
+  - file writing phase:
+    - remember start position for this posting list details
+    - write out number of chunks that follow (varint uint64)
+    - write out length of each chunk (each a varint uint64)
+    - write out the byte slice containing all the chunk data
+
+If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
+
+## postings list section
+
+- for each posting list
+  - preparation phase:
+    - encode roaring bitmap posting list to bytes (so we know the length)
+  - file writing phase:
+    - remember the start position for this posting list
+    - write freq/norm details offset (remembered from previous, as varint uint64)
+    - write location details offset (remembered from previous, as varint uint64)
+    - write length of encoded roaring bitmap
+    - write the serialized roaring bitmap data
+
+## dictionary
+
+- for each field
+  - preparation phase:
+    - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous)
+  - file writing phase:
+    - remember the start position of this persistDictionary
+    - write length of vellum data (varint uint64)
+    - write out vellum data
+
+## fields section
+
+- for each field
+  - file writing phase:
+    - remember start offset for each field
+    - write dictionary address (remembered from previous) (varint uint64)
+    - write length of field name (varint uint64)
+    - write field name bytes
+
+## fields idx
+
+- for each field
+  - file writing phase:
+    - write big endian uint64 of start offset for each field
+
+NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size.
+
+## fields DocValue
+
+- for each field
+  - preparation phase:
+    - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data
+    - produce a slice remembering the length of each chunk
+  - file writing phase:
+    - remember the start position of this first field DocValue offset in the footer
+    - write out number of chunks that follow (varint uint64)
+    - write out length of each chunk (each a varint uint64)
+    - write out the byte slice containing all the chunk data
+
+NOTE: currently the meta header inside each chunk gives a clue to the location offsets and sizes of the data pertaining to a given docID, and any read operation leverages that meta information to extract the document-specific data from the file.
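+
+Several sections above share the same "number of chunks (varint uint64), then one varint length per chunk" header. A minimal decoding sketch (an assumed helper for illustration, not the actual zap reader) that turns those lengths into absolute end offsets:
+
+```go
+package main
+
+import (
+    "encoding/binary"
+    "fmt"
+)
+
+// readChunkEndOffsets decodes a varint chunk count followed by varint
+// chunk lengths, returning cumulative end offsets and the bytes consumed.
+func readChunkEndOffsets(data []byte) ([]uint64, int) {
+    numChunks, pos := binary.Uvarint(data)
+    offsets := make([]uint64, 0, numChunks)
+    var end uint64
+    for i := uint64(0); i < numChunks; i++ {
+        length, n := binary.Uvarint(data[pos:])
+        pos += n
+        end += length
+        offsets = append(offsets, end)
+    }
+    return offsets, pos
+}
+
+func main() {
+    // header describing 3 chunks of lengths 10, 20 and 30
+    hdr := []byte{3, 10, 20, 30}
+    offsets, n := readChunkEndOffsets(hdr)
+    fmt.Println(offsets, n) // [10 30 60] 4
+}
+```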
+ +## footer + +- file writing phase + - write number of docs (big endian uint64) + - write stored field index location (big endian uint64) + - write field index location (big endian uint64) + - write field docValue location (big endian uint64) + - write out chunk factor (big endian uint32) + - write out version (big endian uint32) + - write out file CRC of everything preceding this (big endian uint32) diff --git a/vendor/github.com/blevesearch/zapx/v16/build.go b/vendor/github.com/blevesearch/zapx/v16/build.go new file mode 100644 index 0000000000..a545b072ba --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/build.go @@ -0,0 +1,194 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "fmt" + "io" + "math" + "os" + + "github.com/blevesearch/vellum" +) + +const Version uint32 = 16 +const IndexSectionsVersion uint32 = 16 +const Type string = "zap" + +const fieldNotUninverted = math.MaxUint64 + +func (sb *SegmentBase) Persist(path string) error { + return PersistSegmentBase(sb, path) +} + +// WriteTo is an implementation of io.WriterTo interface. +func (sb *SegmentBase) WriteTo(w io.Writer) (int64, error) { + if w == nil { + return 0, fmt.Errorf("invalid writer found") + } + + n, err := persistSegmentBaseToWriter(sb, w) + return int64(n), err +} + +// PersistSegmentBase persists SegmentBase in the zap file format. 
+func PersistSegmentBase(sb *SegmentBase, path string) error { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + _, err = persistSegmentBaseToWriter(sb, f) + if err != nil { + cleanup() + return err + } + + err = f.Sync() + if err != nil { + cleanup() + return err + } + + err = f.Close() + if err != nil { + cleanup() + return err + } + + return err +} + +type bufWriter struct { + w *bufio.Writer + n int +} + +func (br *bufWriter) Write(in []byte) (int, error) { + n, err := br.w.Write(in) + br.n += n + return n, err +} + +func persistSegmentBaseToWriter(sb *SegmentBase, w io.Writer) (int, error) { + br := &bufWriter{w: bufio.NewWriter(w)} + + _, err := br.Write(sb.mem) + if err != nil { + return 0, err + } + + err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.sectionsIndexOffset, + sb.docValueOffset, sb.chunkMode, sb.memCRC, br) + if err != nil { + return 0, err + } + + err = br.w.Flush() + if err != nil { + return 0, err + } + + return br.n, nil +} + +func persistStoredFieldValues(fieldID int, + storedFieldValues [][]byte, stf []byte, spf [][]uint64, + curr int, metaEncode varintEncoder, data []byte) ( + int, []byte, error) { + for i := 0; i < len(storedFieldValues); i++ { + // encode field + _, err := metaEncode(uint64(fieldID)) + if err != nil { + return 0, nil, err + } + // encode type + _, err = metaEncode(uint64(stf[i])) + if err != nil { + return 0, nil, err + } + // encode start offset + _, err = metaEncode(uint64(curr)) + if err != nil { + return 0, nil, err + } + // end len + _, err = metaEncode(uint64(len(storedFieldValues[i]))) + if err != nil { + return 0, nil, err + } + // encode number of array pos + _, err = metaEncode(uint64(len(spf[i]))) + if err != nil { + return 0, nil, err + } + // encode all array positions + for _, pos := range spf[i] { + _, err = metaEncode(pos) + if err != nil { + return 0, nil, err + } + } + + data = append(data, storedFieldValues[i]...) + curr += len(storedFieldValues[i]) + } + + return curr, data, nil +} + +func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, + fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, + storedIndexOffset uint64, dictLocs []uint64, + sectionsIndexOffset uint64) (*SegmentBase, error) { + sb := &SegmentBase{ + mem: mem, + memCRC: memCRC, + chunkMode: chunkMode, + fieldsMap: fieldsMap, + fieldsInv: fieldsInv, + numDocs: numDocs, + storedIndexOffset: storedIndexOffset, + fieldsIndexOffset: sectionsIndexOffset, + sectionsIndexOffset: sectionsIndexOffset, + fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)), + docValueOffset: 0, // docValueOffsets identified automatically by the section + dictLocs: dictLocs, + fieldFSTs: make(map[uint16]*vellum.FST), + } + sb.updateSize() + + // load the data/section starting offsets for each field + // by via the sectionsIndexOffset as starting point. + err := sb.loadFieldsNew() + if err != nil { + return nil, err + } + + err = sb.loadDvReaders() + if err != nil { + return nil, err + } + + return sb, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/chunk.go b/vendor/github.com/blevesearch/zapx/v16/chunk.go new file mode 100644 index 0000000000..4307d0ed29 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/chunk.go @@ -0,0 +1,67 @@ +// Copyright (c) 2019 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "fmt" +) + +// LegacyChunkMode was the original chunk mode (always chunk size 1024) +// this mode is still used for chunking doc values. +var LegacyChunkMode uint32 = 1024 + +// DefaultChunkMode is the most recent improvement to chunking and should +// be used by default. +var DefaultChunkMode uint32 = 1026 + +func getChunkSize(chunkMode uint32, cardinality uint64, maxDocs uint64) (uint64, error) { + switch { + // any chunkMode <= 1024 will always chunk with chunkSize=chunkMode + case chunkMode <= 1024: + // legacy chunk size + return uint64(chunkMode), nil + + case chunkMode == 1025: + // attempt at simple improvement + // theory - the point of chunking is to put a bound on the maximum number of + // calls to Next() needed to find a random document. ie, you should be able + // to do one jump to the correct chunk, and then walk through at most + // chunk-size items + // previously 1024 was chosen as the chunk size, but this is particularly + // wasteful for low cardinality terms. the observation is that if there + // are less than 1024 items, why not put them all in one chunk, + // this way you'll still achieve the same goal of visiting at most + // chunk-size items. + // no attempt is made to tweak any other case + if cardinality <= 1024 { + return maxDocs, nil + } + return 1024, nil + + case chunkMode == 1026: + // improve upon the ideas tested in chunkMode 1025 + // the observation that the fewest number of dense chunks is the most + // desirable layout, given the built-in assumptions of chunking + // (that we want to put an upper-bound on the number of items you must + // walk over without skipping, currently tuned to 1024) + // + // 1. compute the number of chunks needed (max 1024/chunk) + // 2. convert to chunkSize, dividing into maxDocs + numChunks := (cardinality / 1024) + 1 + chunkSize := maxDocs / numChunks + return chunkSize, nil + } + return 0, fmt.Errorf("unknown chunk mode %d", chunkMode) +} diff --git a/vendor/github.com/blevesearch/zapx/v16/contentcoder.go b/vendor/github.com/blevesearch/zapx/v16/contentcoder.go new file mode 100644 index 0000000000..cd8b3fc86f --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/contentcoder.go @@ -0,0 +1,257 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "io" + "reflect" + "sync/atomic" + + "github.com/golang/snappy" +) + +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + +var ( + termSeparator byte = 0xff + termSeparatorSplitSlice = []byte{termSeparator} +) + +type chunkedContentCoder struct { + bytesWritten uint64 // atomic access to this variable, moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS. + + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + compressed []byte // temp buf for snappy compression + + w io.Writer + progressiveWrite bool + + chunkMeta []MetaData + chunkMetaBuf bytes.Buffer + chunkBuf bytes.Buffer +} + +// MetaData represents the data information inside a +// chunk. +type MetaData struct { + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid +} + +// newChunkedContentCoder returns a new chunk content coder which +// packs data into chunks based on the provided chunkSize +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool, +) *chunkedContentCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedContentCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, + } + + return rv +} + +// Reset lets you reuse this chunked content coder. Buffers are reset +// and re used. You cannot change the chunk size. +func (c *chunkedContentCoder) Reset() { + c.currChunk = 0 + c.bytesWritten = 0 + c.final = c.final[:0] + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } + c.chunkMeta = c.chunkMeta[:0] +} + +func (c *chunkedContentCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } + if cap(c.chunkMeta) < total { + c.chunkMeta = make([]MetaData, 0, total) + } +} + +// Close indicates you are done calling Add() this allows +// the final chunk to be encoded. +func (c *chunkedContentCoder) Close() error { + return c.flushContents() +} + +func (c *chunkedContentCoder) incrementBytesWritten(val uint64) { + atomic.AddUint64(&c.bytesWritten, val) +} + +func (c *chunkedContentCoder) getBytesWritten() uint64 { + return atomic.LoadUint64(&c.bytesWritten) +} + +func (c *chunkedContentCoder) flushContents() error { + // flush the contents, with meta information at first + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) + _, err := c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + // write out the metaData slice + for _, meta := range c.chunkMeta { + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) + if err != nil { + return err + } + } + + // write the metadata to final data + metaData := c.chunkMetaBuf.Bytes() + c.final = append(c.final, c.chunkMetaBuf.Bytes()...) + // write the compressed data to the final data + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + c.incrementBytesWritten(uint64(len(c.compressed))) + c.final = append(c.final, c.compressed...) 
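+    // c.final has now been extended with this chunk's metadata followed by its snappy-compressed data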
+ + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } + + return nil +} + +// Add encodes the provided byte slice into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // flush out the previous chunk details + err := c.flushContents() + if err != nil { + return err + } + // clearing the chunk specific meta for next chunk + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + c.chunkMeta = c.chunkMeta[:0] + c.currChunk = chunk + } + + // get the starting offset for this doc + dvOffset := c.chunkBuf.Len() + dvSize, err := c.chunkBuf.Write(vals) + if err != nil { + return err + } + + c.chunkMeta = append(c.chunkMeta, MetaData{ + DocNum: docNum, + DocDvOffset: uint64(dvOffset + dvSize), + }) + return nil +} + +// Write commits all the encoded chunked contents to the provided writer. +// +// | ..... data ..... | chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +func (c *chunkedContentCoder) Write() (int, error) { + var tw int + + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] + } + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(c.final, chunkOffset) + nw, err := c.w.Write(c.final[:n]) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + c.final = c.final[:0] + + return tw, nil +} + +// ReadDocValueBoundary elicits the start, end offsets from a +// metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + return start, metaHeaders[chunk].DocDvOffset +} diff --git a/vendor/github.com/blevesearch/zapx/v16/count.go b/vendor/github.com/blevesearch/zapx/v16/count.go new file mode 100644 index 0000000000..b6135359fb --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/count.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "hash/crc32" + "io" + + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +// CountHashWriter is a wrapper around a Writer which counts the number of +// bytes which have been written and computes a crc32 hash +type CountHashWriter struct { + w io.Writer + crc uint32 + n int + s segment.StatsReporter +} + +// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer +func NewCountHashWriter(w io.Writer) *CountHashWriter { + return &CountHashWriter{w: w} +} + +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + +// Write writes the provided bytes to the wrapped writer and counts the bytes +func (c *CountHashWriter) Write(b []byte) (int, error) { + n, err := c.w.Write(b) + c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) + c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } + return n, err +} + +// Count returns the number of bytes written +func (c *CountHashWriter) Count() int { + return c.n +} + +// Sum32 returns the CRC-32 hash of the content written to this writer +func (c *CountHashWriter) Sum32() uint32 { + return c.crc +} diff --git a/vendor/github.com/blevesearch/zapx/v16/dict.go b/vendor/github.com/blevesearch/zapx/v16/dict.go new file mode 100644 index 0000000000..d06278fd5e --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/dict.go @@ -0,0 +1,178 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package zap
+
+import (
+    "fmt"
+
+    "github.com/RoaringBitmap/roaring"
+    index "github.com/blevesearch/bleve_index_api"
+    segment "github.com/blevesearch/scorch_segment_api/v2"
+    "github.com/blevesearch/vellum"
+)
+
+// Dictionary is the zap representation of the term dictionary
+type Dictionary struct {
+    sb      *SegmentBase
+    field   string
+    fieldID uint16
+    fst     *vellum.FST
+
+    fstReader *vellum.Reader
+
+    bytesRead uint64
+}
+
+// represents an immutable, empty dictionary
+var emptyDictionary = &Dictionary{}
+
+// PostingsList returns the postings list for the specified term
+func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
+    prealloc segment.PostingsList) (segment.PostingsList, error) {
+    var preallocPL *PostingsList
+    pl, ok := prealloc.(*PostingsList)
+    if ok && pl != nil {
+        preallocPL = pl
+    }
+    return d.postingsList(term, except, preallocPL)
+}
+
+func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
+    if d.fstReader == nil {
+        if rv == nil || rv == emptyPostingsList {
+            return emptyPostingsList, nil
+        }
+        return d.postingsListInit(rv, except), nil
+    }
+
+    postingsOffset, exists, err := d.fstReader.Get(term)
+
+    if err != nil {
+        return nil, fmt.Errorf("vellum err: %v", err)
+    }
+    if !exists {
+        if rv == nil || rv == emptyPostingsList {
+            return emptyPostingsList, nil
+        }
+        return d.postingsListInit(rv, except), nil
+    }
+
+    return d.postingsListFromOffset(postingsOffset, except, rv)
+}
+
+func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
+    rv = d.postingsListInit(rv, except)
+
+    err := rv.read(postingsOffset, d)
+    if err != nil {
+        return nil, err
+    }
+
+    return rv, nil
+}
+
+func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
+    if rv == nil || rv == emptyPostingsList {
+        rv = &PostingsList{}
+    } else {
+        postings := rv.postings
+        if postings != nil {
+            postings.Clear()
+        }
+
+        *rv = PostingsList{} // clear the struct
+
+        rv.postings = postings
+    }
+    rv.sb = d.sb
+    rv.except = except
+    return rv
+}
+
+func (d *Dictionary) Contains(key []byte) (bool, error) {
+    if d.fst != nil {
+        return d.fst.Contains(key)
+    }
+    return false, nil
+}
+
+// AutomatonIterator returns an iterator which only visits terms
+// matched by the vellum automaton and within the start/end key range
+func (d *Dictionary) AutomatonIterator(a segment.Automaton,
+    startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
+    if d.fst != nil {
+        rv := &DictionaryIterator{
+            d: d,
+        }
+
+        itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
+        if err == nil {
+            rv.itr = itr
+        } else if err != vellum.ErrIteratorDone {
+            rv.err = err
+        }
+
+        return rv
+    }
+    return emptyDictionaryIterator
+}
+
+func (d *Dictionary) incrementBytesRead(val uint64) {
+    d.bytesRead += val
+}
+
+func (d *Dictionary) BytesRead() uint64 {
+    return d.bytesRead
+}
+
+func (d *Dictionary) ResetBytesRead(val uint64) {
+    d.bytesRead = val
+}
+
+func (d *Dictionary) BytesWritten() uint64 {
+    return 0
+}
+
+// DictionaryIterator is an iterator for term dictionary
+type DictionaryIterator struct {
+    d         *Dictionary
+    itr       vellum.Iterator
+    err       error
+    tmp       PostingsList
+    entry     index.DictEntry
+    omitCount bool
+}
+
+var emptyDictionaryIterator = &DictionaryIterator{}
+
+// Next returns the next entry in the dictionary
+func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
+    if i.err != nil && i.err != vellum.ErrIteratorDone {
return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } + term, postingsOffset := i.itr.Current() + i.entry.Term = string(term) + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() + } + i.err = i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/docvalues.go b/vendor/github.com/blevesearch/zapx/v16/docvalues.go new file mode 100644 index 0000000000..6fb7a9a20b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/docvalues.go @@ -0,0 +1,355 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "reflect" + "sort" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var reflectStaticSizedocValueReader int + +func init() { + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) +} + +type docNumTermsVisitor func(docNum uint64, terms []byte) error + +type docVisitState struct { + dvrs map[uint16]*docValueReader + segment *SegmentBase + + bytesRead uint64 +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the disk (pertaining to the +// docvalues) while querying. +// the loadDvChunk retrieves the next chunk of docvalues +// and the bytes retrieved off the disk pertaining to that +// is accounted as well. 
+func (d *docVisitState) incrementBytesRead(val uint64) { + d.bytesRead += val +} + +func (d *docVisitState) BytesRead() uint64 { + return d.bytesRead +} + +func (d *docVisitState) BytesWritten() uint64 { + return 0 +} + +func (d *docVisitState) ResetBytesRead(val uint64) { + d.bytesRead = val +} + +type docValueReader struct { + field string + curChunkNum uint64 + chunkOffsets []uint64 + dvDataLoc uint64 + curChunkHeader []MetaData + curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression + + // atomic access to this variable + bytesRead uint64 +} + +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + SizeOfPtr + + len(di.field) + + len(di.chunkOffsets)*SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) +} + +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } + + rv.field = di.field + rv.curChunkNum = math.MaxUint64 + rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = rv.curChunkHeader[:0] + rv.curChunkData = nil + rv.uncompressed = rv.uncompressed[:0] + + return rv +} + +func (di *docValueReader) curChunkNumber() uint64 { + return di.curChunkNum +} + +func (s *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { + // get the docValue offset for the given fields + if fieldDvLocStart == fieldNotUninverted { + // no docValues found, nothing to do + return nil, nil + } + + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen + + // 16 bytes since it corresponds to the length + // of chunk offsets and the position of the offsets + s.incrementBytesRead(16) + } else { + return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) + } + + fdvIter := &docValueReader{ + curChunkNum: math.MaxUint64, + field: field, + chunkOffsets: make([]uint64, int(numChunks)), + } + + // read the chunk offsets + var offset uint64 + for i := 0; i < int(numChunks); i++ { + loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("corrupted chunk offset during segment load") + } + fdvIter.chunkOffsets[i] = loc + offset += uint64(read) + } + s.incrementBytesRead(offset) + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + return fdvIter, nil +} + +func (d *docValueReader) getBytesRead() uint64 { + return d.bytesRead +} + +func (d *docValueReader) incrementBytesRead(val uint64) { + d.bytesRead += val +} + +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { + // advance to the chunk where the docValues + // reside for the given docNum + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + 
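+        // the chunk is empty: cached state has been cleared, so there is nothing further to load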
return nil + } + + destChunkDataLoc += start + curChunkEnd += end + + // read the number of docs reside in the chunk + numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read the chunk") + } + chunkMetaLoc := destChunkDataLoc + uint64(read) + di.incrementBytesRead(uint64(read)) + offset := uint64(0) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } + for i := 0; i < int(numDocs); i++ { + di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := curChunkEnd - compressedDataLoc + di.incrementBytesRead(uint64(dataLength + offset)) + di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil +} + +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + if di.curChunkData == nil || len(di.curChunkHeader) == 0 { + continue + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + +func (di *docValueReader) visitDocValues(docNum uint64, + visitor index.DocValueVisitor) error { + // binary search the term locations for the docNum + start, end := di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { + return nil + } + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + } + + // pick the terms for the given docNum + uncompressed = uncompressed[start:end] + for { + i := bytes.Index(uncompressed, termSeparatorSplitSlice) + if i < 0 { + break + } + + visitor(di.field, uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + + return nil +} + +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { + i := sort.Search(len(di.curChunkHeader), func(i int) bool { + return di.curChunkHeader[i].DocNum >= docNum + }) + if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { + return ReadDocValueBoundary(i, di.curChunkHeader) + } + return math.MaxUint64, math.MaxUint64 +} + +// VisitDocValues is an implementation of the +// DocValueVisitable interface +func (s *SegmentBase) VisitDocValues(localDocNum uint64, fields []string, + visitor index.DocValueVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == 
nil { + dvs = &docVisitState{} + } else { + if dvs.segment != s { + dvs.segment = s + dvs.dvrs = nil + dvs.bytesRead = 0 + } + } + + var fieldIDPlus1 uint16 + if dvs.dvrs == nil { + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvIter, exists := s.fieldDvReaders[SectionInvertedTextIndex][fieldID]; exists && + dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + } + + // find the chunkNumber where the docValues are stored + // NOTE: doc values continue to use legacy chunk mode + chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + docInChunk := localDocNum / chunkFactor + var dvr *docValueReader + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { + // check if the chunk is already loaded + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, s) + if err != nil { + return dvs, err + } + dvs.ResetBytesRead(dvr.getBytesRead()) + } else { + dvs.ResetBytesRead(0) + } + + _ = dvr.visitDocValues(localDocNum, visitor) + } + } + return dvs, nil +} + +// VisitableDocValueFields returns the list of fields with +// persisted doc value terms ready to be visitable using the +// VisitDocumentFieldTerms method. +func (s *SegmentBase) VisitableDocValueFields() ([]string, error) { + return s.fieldDvNames, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/enumerator.go b/vendor/github.com/blevesearch/zapx/v16/enumerator.go new file mode 100644 index 0000000000..972a224165 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/enumerator.go @@ -0,0 +1,138 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + + "github.com/blevesearch/vellum" +) + +// enumerator provides an ordered traversal of multiple vellum +// iterators. Like JOIN of iterators, the enumerator produces a +// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, +// then iteratorIndex ASC, where the same key might be seen or +// repeated across multiple child iterators. 
+type enumerator struct { + itrs []vellum.Iterator + currKs [][]byte + currVs []uint64 + + lowK []byte + lowIdxs []int + lowCurr int +} + +// newEnumerator returns a new enumerator over the vellum Iterators +func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { + rv := &enumerator{ + itrs: itrs, + currKs: make([][]byte, len(itrs)), + currVs: make([]uint64, len(itrs)), + lowIdxs: make([]int, 0, len(itrs)), + } + for i, itr := range rv.itrs { + rv.currKs[i], rv.currVs[i] = itr.Current() + } + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { + return rv, vellum.ErrIteratorDone + } + return rv, nil +} + +// updateMatches maintains the low key matches based on the currKs +func (m *enumerator) updateMatches(skipEmptyKey bool) { + m.lowK = nil + m.lowIdxs = m.lowIdxs[:0] + m.lowCurr = 0 + + for i, key := range m.currKs { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys + continue + } + + cmp := bytes.Compare(key, m.lowK) + if cmp < 0 || len(m.lowIdxs) == 0 { + // reached a new low + m.lowK = key + m.lowIdxs = m.lowIdxs[:0] + m.lowIdxs = append(m.lowIdxs, i) + } else if cmp == 0 { + m.lowIdxs = append(m.lowIdxs, i) + } + } +} + +// Current returns the enumerator's current key, iterator-index, and +// value. If the enumerator is not pointing at a valid value (because +// Next returned an error previously), Current will return nil,0,0. +func (m *enumerator) Current() ([]byte, int, uint64) { + var i int + var v uint64 + if m.lowCurr < len(m.lowIdxs) { + i = m.lowIdxs[m.lowCurr] + v = m.currVs[i] + } + return m.lowK, i, v +} + +// GetLowIdxsAndValues will return all of the iterator indices +// which point to the current key, and their corresponding +// values. This can be used by advanced caller which may need +// to peek into these other sets of data before processing. +func (m *enumerator) GetLowIdxsAndValues() ([]int, []uint64) { + values := make([]uint64, 0, len(m.lowIdxs)) + for _, idx := range m.lowIdxs { + values = append(values, m.currVs[idx]) + } + return m.lowIdxs, values +} + +// Next advances the enumerator to the next key/iterator/value result, +// else vellum.ErrIteratorDone is returned. +func (m *enumerator) Next() error { + m.lowCurr += 1 + if m.lowCurr >= len(m.lowIdxs) { + // move all the current low iterators forwards + for _, vi := range m.lowIdxs { + err := m.itrs[vi].Next() + if err != nil && err != vellum.ErrIteratorDone { + return err + } + m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() + } + // can skip any empty keys encountered at this point + m.updateMatches(true) + } + if m.lowK == nil && len(m.lowIdxs) == 0 { + return vellum.ErrIteratorDone + } + return nil +} + +// Close all the underlying Iterators. The first error, if any, will +// be returned. +func (m *enumerator) Close() error { + var rv error + for _, itr := range m.itrs { + err := itr.Close() + if rv == nil { + rv = err + } + } + return rv +} diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go new file mode 100644 index 0000000000..2104d53f94 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go @@ -0,0 +1,422 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package zap
+
+import (
+ "encoding/binary"
+ "math"
+ "reflect"
+
+ "github.com/RoaringBitmap/roaring"
+ "github.com/RoaringBitmap/roaring/roaring64"
+ faiss "github.com/blevesearch/go-faiss"
+ segment "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+var reflectStaticSizeVecPostingsList int
+var reflectStaticSizeVecPostingsIterator int
+var reflectStaticSizeVecPosting int
+
+func init() {
+ var pl VecPostingsList
+ reflectStaticSizeVecPostingsList = int(reflect.TypeOf(pl).Size())
+ var pi VecPostingsIterator
+ reflectStaticSizeVecPostingsIterator = int(reflect.TypeOf(pi).Size())
+ var p VecPosting
+ reflectStaticSizeVecPosting = int(reflect.TypeOf(p).Size())
+}
+
+type VecPosting struct {
+ docNum uint64
+ score float32
+}
+
+func (vp *VecPosting) Number() uint64 {
+ return vp.docNum
+}
+
+func (vp *VecPosting) Score() float32 {
+ return vp.score
+}
+
+func (vp *VecPosting) Size() int {
+ sizeInBytes := reflectStaticSizePosting
+
+ return sizeInBytes
+}
+
+// =============================================================================
+
+// the vector postings list stores the docNum and its similarity score as a
+// vector postings entry in it.
+// It is stored using a roaring64 bitmap: the docNum goes in the high 32 bits
+// and the lower 32 bits contain the score.
+// the score is actually a float32 value and in order to store it as a uint32 in
+// the bitmap, we use the IEEE 754 floating point format.
+//
+// each entry in the roaring64 bitmap of the vector postings list is a 64 bit
+// number which looks like this:
+// MSB                          LSB
+// |64 63 62 ... 32| 31 30 ... 0|
+// |     docNum    |   score    |
+type VecPostingsList struct {
+ // todo: perhaps we don't even need to store a bitmap if there is only
+ // one similar vector to the query, but rather store it as a field value
+ // in the struct
+ except *roaring64.Bitmap
+ postings *roaring64.Bitmap
+}
+
+var emptyVecPostingsIterator = &VecPostingsIterator{}
+var emptyVecPostingsList = &VecPostingsList{}
+
+func (vpl *VecPostingsList) Iterator(prealloc segment.VecPostingsIterator) segment.VecPostingsIterator {
+
+ // tbd: do we check the cardinality of postings and scores?
+ var preallocPI *VecPostingsIterator
+ pi, ok := prealloc.(*VecPostingsIterator)
+ if ok && pi != nil {
+ preallocPI = pi
+ }
+ if preallocPI == emptyVecPostingsIterator {
+ preallocPI = nil
+ }
+
+ return vpl.iterator(preallocPI)
+}
+
+func (p *VecPostingsList) iterator(rv *VecPostingsIterator) *VecPostingsIterator {
+ if rv == nil {
+ rv = &VecPostingsIterator{}
+ } else {
+ *rv = VecPostingsIterator{} // clear the struct
+ }
+ // think on some of the edge cases over here.
+ if p.postings == nil {
+ return rv
+ }
+ rv.postings = p
+ rv.all = p.postings.Iterator()
+ if p.except != nil {
+ rv.ActualBM = roaring64.AndNot(p.postings, p.except)
+ rv.Actual = rv.ActualBM.Iterator()
+ } else {
+ rv.ActualBM = p.postings
+ rv.Actual = rv.all // Optimize to use same iterator for all & Actual.
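+ // (illustrative note, not in the upstream file) the uint64 values this
+ // iterator yields are packed codes, mirroring getVectorCode below:
+ //   code := uint64(docNum)<<32 | uint64(math.Float32bits(score))
+ //   docNum, score := uint32(code>>32), math.Float32frombits(uint32(code))
+ // since the docNum sits in the high 32 bits, bitmap iteration order is
+ // docNum order.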
+ } + return rv +} + +func (p *VecPostingsList) Size() int { + sizeInBytes := reflectStaticSizeVecPostingsList + SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *VecPostingsList) Count() uint64 { + n := p.postings.GetCardinality() + var e uint64 + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } + return n - e +} + +func (vpl *VecPostingsList) ResetBytesRead(val uint64) { + +} + +func (vpl *VecPostingsList) BytesRead() uint64 { + return 0 +} + +func (vpl *VecPostingsList) BytesWritten() uint64 { + return 0 +} + +// ============================================================================= + +type VecPostingsIterator struct { + postings *VecPostingsList + all roaring64.IntPeekable64 + Actual roaring64.IntPeekable64 + ActualBM *roaring64.Bitmap + + next VecPosting // reused across Next() calls +} + +func (i *VecPostingsIterator) nextCodeAtOrAfterClean(atOrAfter uint64) (uint64, bool, error) { + i.Actual.AdvanceIfNeeded(atOrAfter) + + if !i.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return i.Actual.Next(), true, nil +} + +func (i *VecPostingsIterator) nextCodeAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings == emptyVecPostingsList { + // couldn't find anything + return 0, false, nil + } + + if i.postings.postings == i.ActualBM { + return i.nextCodeAtOrAfterClean(atOrAfter) + } + + i.Actual.AdvanceIfNeeded(atOrAfter) + + if !i.Actual.HasNext() || !i.all.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := i.Actual.Next() + allN := i.all.Next() + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do. + for allN != n { + if !i.all.HasNext() { + return 0, false, nil + } + allN = i.all.Next() + } + + return uint64(n), true, nil +} + +// a transformation function which stores both the score and the docNum as a single +// entry which is a uint64 number. +func getVectorCode(docNum uint32, score float32) uint64 { + return uint64(docNum)<<32 | uint64(math.Float32bits(score)) +} + +// Next returns the next posting on the vector postings list, or nil at the end +func (i *VecPostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.VecPosting, error) { + // transform the docNum provided to the vector code format and use that to + // get the next entry. 
the comparison still happens docNum wise since after + // the transformation, the docNum occupies the upper 32 bits just an entry in + // the postings list + atOrAfter = getVectorCode(uint32(atOrAfter), 0) + code, exists, err := i.nextCodeAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err + } + + i.next = VecPosting{} // clear the struct + rv := &i.next + rv.score = math.Float32frombits(uint32(code)) + rv.docNum = code >> 32 + + return rv, nil +} + +func (itr *VecPostingsIterator) Next() (segment.VecPosting, error) { + return itr.nextAtOrAfter(0) +} + +func (itr *VecPostingsIterator) Advance(docNum uint64) (segment.VecPosting, error) { + return itr.nextAtOrAfter(docNum) +} + +func (i *VecPostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + SizeOfPtr + + i.next.Size() + + return sizeInBytes +} + +func (vpl *VecPostingsIterator) ResetBytesRead(val uint64) { + +} + +func (vpl *VecPostingsIterator) BytesRead() uint64 { + return 0 +} + +func (vpl *VecPostingsIterator) BytesWritten() uint64 { + return 0 +} + +// vectorIndexWrapper conforms to scorch_segment_api's VectorIndex interface +type vectorIndexWrapper struct { + search func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error) + close func() + size func() uint64 +} + +func (i *vectorIndexWrapper) Search(qVector []float32, k int64, except *roaring.Bitmap) ( + segment.VecPostingsList, error) { + return i.search(qVector, k, except) +} + +func (i *vectorIndexWrapper) Close() { + i.close() +} + +func (i *vectorIndexWrapper) Size() uint64 { + return i.size() +} + +// InterpretVectorIndex returns closures that will allow the caller to - +// (1) SearchVectorIndex - search within an attached vector index +// (2) CloseVectorIndex - close attached vector index +// +// These function pointers may be nil, when InterpretVectorIndex return a non-nil err. +// It is on the caller to ensure CloseVectorIndex is invoked (sync or async) after +// their business with the attached vector index concludes. +func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex, error) { + // Params needed for the closures + var vecIndex *faiss.IndexImpl + vecDocIDMap := make(map[int64]uint32) + + var ( + wrapVecIndex = &vectorIndexWrapper{ + search: func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error) { + // 1. returned postings list (of type PostingsList) has two types of information - docNum and its score. + // 2. both the values can be represented using roaring bitmaps. + // 3. the Iterator (of type PostingsIterator) returned would operate in terms of VecPostings. + // 4. VecPostings would just have the docNum and the score. Every call of Next() + // and Advance just returns the next VecPostings. The caller would do a vp.Number() + // and the Score() to get the corresponding values + rv := &VecPostingsList{ + except: nil, // todo: handle the except bitmap within postings iterator. 
+ postings: roaring64.New(),
+ }
+
+ if vecIndex == nil || vecIndex.D() != len(qVector) {
+ // vector index not found, or query dimensionality mismatch
+ return rv, nil
+ }
+
+ var vectorIDsToExclude []int64
+ // iterate through the vector doc ID map and if the doc ID is one to be
+ // deleted, add it to the list
+ for vecID, docID := range vecDocIDMap {
+ if except != nil && except.Contains(docID) {
+ vectorIDsToExclude = append(vectorIDsToExclude, vecID)
+ }
+ }
+
+ scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, vectorIDsToExclude)
+
+ if err != nil {
+ return nil, err
+ }
+ // for every similar vector returned by the Search() API, add the corresponding
+ // docID and the score to the newly created vecPostingsList
+ for i := 0; i < len(ids); i++ {
+ vecID := ids[i]
+ // Check whether it's present in the vecDocIDMap.
+ // If -1 is returned as an ID (insufficient vectors), this ensures
+ // it isn't added to the final postings list.
+ if docID, ok := vecDocIDMap[vecID]; ok {
+ code := getVectorCode(docID, scores[i])
+ rv.postings.Add(uint64(code))
+ }
+ }
+
+ return rv, nil
+ },
+ close: func() {
+ if vecIndex != nil {
+ vecIndex.Close()
+ }
+ },
+ size: func() uint64 {
+ if vecIndex != nil {
+ return vecIndex.Size()
+ }
+ return 0
+ },
+ }
+
+ err error
+ )
+
+ fieldIDPlus1 := sb.fieldsMap[field]
+ if fieldIDPlus1 <= 0 {
+ return wrapVecIndex, nil
+ }
+
+ vectorSection := sb.fieldsSectionsMap[fieldIDPlus1-1][SectionFaissVectorIndex]
+ // check if the field has a vector section in the segment.
+ if vectorSection <= 0 {
+ return wrapVecIndex, nil
+ }
+
+ pos := int(vectorSection)
+
+ // the below loop loads the following:
+ // 1. doc values (first 2 iterations) - adhering to the sections format; never
+ // valid values for the vector section
+ // 2. index optimization type.
+ for i := 0; i < 3; i++ {
+ _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ }
+
+ // read the number of vectors indexed for this field and load the vector to docID mapping.
+ // todo: cache the vecID to docIDs mapping for a fieldID
+ numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ for i := 0; i < int(numVecs); i++ {
+ vecID, n := binary.Varint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ vecDocIDMap[vecID] = uint32(docID)
+ }
+
+ // todo: not a good idea to cache the vector index perhaps, since it could be quite huge.
+ indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ indexBytes := sb.mem[pos : pos+int(indexSize)]
+ pos += int(indexSize)
+
+ vecIndex, err = faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
+ return wrapVecIndex, err
+}
+
+func (sb *SegmentBase) UpdateFieldStats(stats segment.FieldStats) {
+ for _, fieldName := range sb.fieldsInv {
+ pos := int(sb.fieldsSectionsMap[sb.fieldsMap[fieldName]-1][SectionFaissVectorIndex])
+ if pos == 0 {
+ continue
+ }
+
+ for i := 0; i < 3; i++ {
+ _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ }
+ numVecs, _ := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+
+ stats.Store("num_vectors", fieldName, numVecs)
+ }
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/intDecoder.go b/vendor/github.com/blevesearch/zapx/v16/intDecoder.go
new file mode 100644
index 0000000000..e50c471722
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/intDecoder.go
@@ -0,0 +1,139 @@
+// Copyright (c) 2019 Couchbase, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" +) + +type chunkedIntDecoder struct { + startOffset uint64 + dataStartOffset uint64 + chunkOffsets []uint64 + curChunkBytes []byte + data []byte + r *memUvarintReader + + // atomic access to this variable + bytesRead uint64 +} + +// newChunkedIntDecoder expects an optional or reset chunkedIntDecoder for better reuse. +func newChunkedIntDecoder(buf []byte, offset uint64, rv *chunkedIntDecoder) *chunkedIntDecoder { + if rv == nil { + rv = &chunkedIntDecoder{startOffset: offset, data: buf} + } else { + rv.startOffset = offset + rv.data = buf + } + + var n, numChunks uint64 + var read int + if offset == termNotEncoded { + numChunks = 0 + } else { + numChunks, read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + } + + n += uint64(read) + if cap(rv.chunkOffsets) >= int(numChunks) { + rv.chunkOffsets = rv.chunkOffsets[:int(numChunks)] + } else { + rv.chunkOffsets = make([]uint64, int(numChunks)) + } + for i := 0; i < int(numChunks); i++ { + rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.bytesRead += n + rv.dataStartOffset = offset + n + return rv +} + +// A util function which fetches the query time +// specific bytes encoded by intcoder (for eg the +// freqNorm and location details of a term in document) +// the loadChunk retrieves the next chunk and the +// number of bytes retrieve in that operation is accounted +func (d *chunkedIntDecoder) getBytesRead() uint64 { + return d.bytesRead +} + +func (d *chunkedIntDecoder) loadChunk(chunk int) error { + if d.startOffset == termNotEncoded { + d.r = newMemUvarintReader([]byte(nil)) + return nil + } + + if chunk >= len(d.chunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(d.chunkOffsets)) + } + + end, start := d.dataStartOffset, d.dataStartOffset + s, e := readChunkBoundary(chunk, d.chunkOffsets) + start += s + end += e + d.curChunkBytes = d.data[start:end] + d.bytesRead += uint64(len(d.curChunkBytes)) + if d.r == nil { + d.r = newMemUvarintReader(d.curChunkBytes) + } else { + d.r.Reset(d.curChunkBytes) + } + + return nil +} + +func (d *chunkedIntDecoder) reset() { + d.startOffset = 0 + d.dataStartOffset = 0 + d.chunkOffsets = d.chunkOffsets[:0] + d.curChunkBytes = d.curChunkBytes[:0] + d.bytesRead = 0 + d.data = d.data[:0] + if d.r != nil { + d.r.Reset([]byte(nil)) + } +} + +func (d *chunkedIntDecoder) isNil() bool { + return d.curChunkBytes == nil || len(d.curChunkBytes) == 0 +} + +func (d *chunkedIntDecoder) readUvarint() (uint64, error) { + return d.r.ReadUvarint() +} + +func (d *chunkedIntDecoder) readBytes(start, end int) []byte { + return d.curChunkBytes[start:end] +} + +func (d *chunkedIntDecoder) SkipUvarint() { + d.r.SkipUvarint() +} + +func (d *chunkedIntDecoder) SkipBytes(count int) { + d.r.SkipBytes(count) +} + +func (d *chunkedIntDecoder) Len() int { + return d.r.Len() +} + +func (d 
*chunkedIntDecoder) remainingLen() int { + return len(d.curChunkBytes) - d.r.Len() +} diff --git a/vendor/github.com/blevesearch/zapx/v16/intcoder.go b/vendor/github.com/blevesearch/zapx/v16/intcoder.go new file mode 100644 index 0000000000..2957fbd098 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/intcoder.go @@ -0,0 +1,220 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" + "sync/atomic" +) + +// We can safely use 0 to represent termNotEncoded since 0 +// could never be a valid address for term location information. +// (stored field index is always non-empty and earlier in the +// file) +const termNotEncoded = 0 + +type chunkedIntCoder struct { + final []byte + chunkSize uint64 + chunkBuf bytes.Buffer + chunkLens []uint64 + currChunk uint64 + + buf []byte + + // atomic access to this variable + bytesWritten uint64 +} + +// newChunkedIntCoder returns a new chunk int coder which packs data into +// chunks based on the provided chunkSize and supports up to the specified +// maxDocNum +func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedIntCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + final: make([]byte, 0, 64), + } + + return rv +} + +// Reset lets you reuse this chunked int coder. buffers are reset and reused +// from previous use. you cannot change the chunk size or max doc num. +func (c *chunkedIntCoder) Reset() { + c.final = c.final[:0] + c.bytesWritten = 0 + c.chunkBuf.Reset() + c.currChunk = 0 + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } +} + +// SetChunkSize changes the chunk size. It is only valid to do so +// with a new chunkedIntCoder, or immediately after calling Reset() +func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } +} + +func (c *chunkedIntCoder) incrementBytesWritten(val uint64) { + atomic.AddUint64(&c.bytesWritten, val) +} + +func (c *chunkedIntCoder) getBytesWritten() uint64 { + return atomic.LoadUint64(&c.bytesWritten) +} + +// Add encodes the provided integers into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. 
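A short lifecycle sketch before the implementation (editorial, not part of the vendored file; the values are invented and only the newChunkedIntCoder/Add/Close/Write API defined in this file is assumed):

func exampleChunkedIntCoder() ([]byte, error) {
	coder := newChunkedIntCoder(2, 5) // chunk size 2, max docNum 5
	for docNum := uint64(0); docNum <= 5; docNum++ {
		// docNums must be strictly increasing across Add calls
		if err := coder.Add(docNum, docNum*10, docNum*10+1); err != nil {
			return nil, err
		}
	}
	coder.Close() // flush the final, still-open chunk

	var out bytes.Buffer
	if _, err := coder.Write(&out); err != nil { // chunk count, end offsets, then data
		return nil, err
	}
	return out.Bytes(), nil
}

Close must be called once after the last Add so the final chunk's length is recorded before Write lays out the offsets.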
+func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + + for _, val := range vals { + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) + if err != nil { + return err + } + } + + return nil +} + +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + +// Close indicates you are done calling Add() this allows the final chunk +// to be encoded. +func (c *chunkedIntCoder) Close() { + encodingBytes := c.chunkBuf.Bytes() + c.incrementBytesWritten(uint64(len(encodingBytes))) + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close +} + +// Write commits all the encoded chunked integers to the provided writer. +func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { + bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) + if len(c.buf) < bufNeeded { + c.buf = make([]byte, bufNeeded) + } + buf := c.buf + + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) + } + + tw, err := w.Write(buf[:n]) + if err != nil { + return tw, err + } + + // write out the data + nw, err := w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} + +// writeAt commits all the encoded chunked integers to the provided writer +// and returns the starting offset, total bytes written and an error +func (c *chunkedIntCoder) writeAt(w io.Writer) (uint64, int, error) { + startOffset := uint64(termNotEncoded) + if len(c.final) <= 0 { + return startOffset, 0, nil + } + + if chw := w.(*CountHashWriter); chw != nil { + startOffset = uint64(chw.Count()) + } + + tw, err := c.Write(w) + return startOffset, tw, err +} + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} + +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. 
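+// (illustrative note, not in the upstream file: after this call,
+// offsets[i] is the end of chunk i and offsets[i-1] its start,
+// exactly the pair that readChunkBoundary below hands back.)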
+// eg:
+// Lens -> 5 5 5 5 => 5 10 15 20
+// Lens -> 0 5 0 5 => 0 5 5 10
+// Lens -> 0 0 0 5 => 0 0 0 5
+// Lens -> 5 0 0 0 => 5 5 5 5
+// Lens -> 0 5 0 0 => 0 5 5 5
+// Lens -> 0 0 5 0 => 0 0 5 5
+func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
+ var runningOffset uint64
+ var index, i int
+ for i = 1; i <= len(lengths); i++ {
+ runningOffset += lengths[i-1]
+ lengths[index] = runningOffset
+ index++
+ }
+ return lengths
+}
+
+func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
+ var start uint64
+ if chunk > 0 {
+ start = offsets[chunk-1]
+ }
+ return start, offsets[chunk]
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/memuvarint.go b/vendor/github.com/blevesearch/zapx/v16/memuvarint.go
new file mode 100644
index 0000000000..48a57f9c85
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/memuvarint.go
@@ -0,0 +1,103 @@
+// Copyright (c) 2020 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zap
+
+import (
+ "fmt"
+)
+
+type memUvarintReader struct {
+ C int // index of next byte to read from S
+ S []byte
+}
+
+func newMemUvarintReader(s []byte) *memUvarintReader {
+ return &memUvarintReader{S: s}
+}
+
+// Len returns the number of unread bytes.
+func (r *memUvarintReader) Len() int {
+ n := len(r.S) - r.C
+ if n < 0 {
+ return 0
+ }
+ return n
+}
+
+// ReadUvarint reads an encoded uint64. The original code this was
+// based on is at encoding/binary/ReadUvarint().
+func (r *memUvarintReader) ReadUvarint() (uint64, error) {
+ if r.C >= len(r.S) {
+ // nothing else to read
+ return 0, nil
+ }
+
+ var x uint64
+ var s uint
+ var C = r.C
+ var S = r.S
+
+ for {
+ b := S[C]
+ C++
+
+ if b < 0x80 {
+ r.C = C
+
+ // why 63? The original code had an 'i += 1' loop var and
+ // checked for i > 9 || i == 9 ...; but, we no longer
+ // check for the i var, but instead check here for s,
+ // which is incremented by 7. So, 7*9 == 63.
+ //
+ // why the "extra" >= check? The normal case is that s <
+ // 63, so we check this single >= guard first so that we
+ // hit the normal, nil-error return pathway sooner.
+ if s >= 63 && (s > 63 || b > 1) {
+ return 0, fmt.Errorf("memUvarintReader overflow")
+ }
+
+ return x | uint64(b)<<s, nil
+ }
+
+ x |= uint64(b&0x7f) << s
+ s += 7
+ }
+}
+
+// SkipUvarint skips ahead one encoded uint64.
+func (r *memUvarintReader) SkipUvarint() {
+ for {
+ if r.C >= len(r.S) {
+ return
+ }
+
+ b := r.S[r.C]
+ r.C++
+
+ if b < 0x80 {
+ return
+ }
+ }
+}
+
+// SkipBytes skips a count number of bytes.
+func (r *memUvarintReader) SkipBytes(count int) {
+ r.C = r.C + count
+}
+
+func (r *memUvarintReader) Reset(s []byte) {
+ r.C = 0
+ r.S = s
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/merge.go b/vendor/github.com/blevesearch/zapx/v16/merge.go
new file mode 100644
index 0000000000..490e9da016
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/merge.go
@@ -0,0 +1,614 @@
+// Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "sort" + + "github.com/RoaringBitmap/roaring" + seg "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var DefaultFileMergerBufferSize = 1024 * 1024 + +const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc + +// Merge takes a slice of segments and bit masks describing which +// documents may be dropped, and creates a new segment containing the +// remaining data. This new segment is built at the specified path. +func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + switch segmentx := segment.(type) { + case *Segment: + segmentBases[segmenti] = &segmentx.SegmentBase + case *SegmentBase: + segmentBases[segmenti] = segmentx + default: + panic(fmt.Sprintf("oops, unexpected segment type: %T", segment)) + } + } + return mergeSegmentBases(segmentBases, drops, path, DefaultChunkMode, closeCh, s) +} + +func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, + chunkMode uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return nil, 0, err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + // buffer the output + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriterWithStatsReporter(br, s) + + newDocNums, numDocs, storedIndexOffset, _, _, _, sectionsIndexOffset, err := + MergeToWriter(segmentBases, drops, chunkMode, cr, closeCh) + if err != nil { + cleanup() + return nil, 0, err + } + + // passing the sectionsIndexOffset as fieldsIndexOffset and the docValueOffset as 0 for the footer + err = persistFooter(numDocs, storedIndexOffset, sectionsIndexOffset, sectionsIndexOffset, + 0, chunkMode, cr.Sum32(), cr) + if err != nil { + cleanup() + return nil, 0, err + } + + err = br.Flush() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Sync() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Close() + if err != nil { + cleanup() + return nil, 0, err + } + + return newDocNums, uint64(cr.Count()), nil +} + +func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, + chunkMode uint32, cr *CountHashWriter, closeCh chan struct{}) ( + newDocNums [][]uint64, numDocs, storedIndexOffset uint64, dictLocs []uint64, + fieldsInv []string, fieldsMap map[string]uint16, sectionsIndexOffset uint64, + err error) { + + var fieldsSame bool + fieldsSame, fieldsInv = mergeFields(segments) + fieldsMap = mapFields(fieldsInv) + + numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, nil, nil, nil, 0, seg.ErrClosed + } + + // the merge opaque is especially important when it comes to tracking the file + // offset a 
field of a particular section is at. This will be used to write the + // offsets in the fields section index of the file (the final merged file). + mergeOpaque := map[int]resetable{} + args := map[string]interface{}{ + "chunkMode": chunkMode, + "fieldsSame": fieldsSame, + "fieldsMap": fieldsMap, + "numDocs": numDocs, + } + + if numDocs > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) + if err != nil { + return nil, 0, 0, nil, nil, nil, 0, err + } + + // at this point, ask each section implementation to merge itself + for i, x := range segmentSections { + mergeOpaque[int(i)] = x.InitOpaque(args) + + err = x.Merge(mergeOpaque, segments, drops, fieldsInv, newDocNums, cr, closeCh) + if err != nil { + return nil, 0, 0, nil, nil, nil, 0, err + } + } + } else { + dictLocs = make([]uint64, len(fieldsInv)) + } + + // we can persist the fields section index now, this will point + // to the various indexes (each in different section) available for a field. + sectionsIndexOffset, err = persistFieldsSection(fieldsInv, cr, dictLocs, mergeOpaque) + if err != nil { + return nil, 0, 0, nil, nil, nil, 0, err + } + + return newDocNums, numDocs, storedIndexOffset, dictLocs, fieldsInv, fieldsMap, sectionsIndexOffset, nil +} + +// mapFields takes the fieldsInv list and returns a map of fieldName +// to fieldID+1 +func mapFields(fields []string) map[string]uint16 { + rv := make(map[string]uint16, len(fields)) + for i, fieldName := range fields { + rv[fieldName] = uint16(i) + 1 + } + return rv +} + +// computeNewDocCount determines how many documents will be in the newly +// merged segment when obsoleted docs are dropped +func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { + var newDocCount uint64 + for segI, segment := range segments { + newDocCount += segment.numDocs + if drops[segI] != nil { + newDocCount -= drops[segI].GetCardinality() + } + } + return newDocCount +} + +func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := + postItr.nextBytes() + for err == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNums[nextDocNum] + if hitNewDocNum == docDropped { + return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if err != nil { + return 0, 0, 0, err + } + + if len(nextLocBytes) > 0 { + err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err != nil { + return 0, 0, 0, err + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = + postItr.nextBytes() + } + + return lastDocNum, lastFreq, lastNorm, err +} + +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit 
with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + var nextNorm uint64 + if pi, ok := next.(*Posting); ok { + nextNorm = pi.NormUint64() + } else { + return 0, 0, 0, nil, fmt.Errorf("unexpected posting type %T", next) + } + + locs := next.Locations() + + if nextFreq > 0 { + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + } else { + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0)) + } + if err != nil { + return 0, 0, 0, nil, err + } + + if len(locs) > 0 { + numBytesLocs := 0 + for _, loc := range locs { + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(ap)) + args = append(args, ap...) + err = locEncoder.Add(hitNewDocNum, args...) + if err != nil { + return 0, 0, 0, nil, err + } + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *CountHashWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + var tfOffset uint64 + tfOffset, _, err = tfEncoder.writeAt(w) + if err != nil { + return 0, err + } + + var locOffset uint64 + locOffset, _, err = locEncoder.writeAt(w) + if err != nil { + return 0, err + } + + postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +type varintEncoder func(uint64) (int, error) + +func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, + fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, + w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { + var rv [][]uint64 // The remapped or newDocNums for each segment. 
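+ // (illustrative note, not in the upstream file) rv[segI][oldDocNum]
+ // holds the docNum in the merged segment, with docDropped
+ // (math.MaxUint64) marking deleted docs. E.g. merging two 3-doc
+ // segments where doc 1 of segment 0 is dropped yields:
+ //   rv[0] = [0, docDropped, 1]
+ //   rv[1] = [2, 3, 4]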
+ + var newDocNum uint64 + + var curr int + var data, compressed []byte + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } + + vals := make([][][]byte, len(fieldsInv)) + typs := make([][]byte, len(fieldsInv)) + poss := make([][][]uint64, len(fieldsInv)) + + var posBuf []uint64 + + docNumOffsets := make([]uint64, newSegDocCount) + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + + // for each segment + for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + + segNewDocNums := make([]uint64, segment.numDocs) + + dropsI := drops[segI] + + // optimize when the field mapping is the same across all + // segments and there are no deletions, via byte-copying + // of stored docs bytes directly to the writer + if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) { + err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < segment.numDocs; i++ { + segNewDocNums[i] = newDocNum + newDocNum++ + } + rv = append(rv, segNewDocNums) + + continue + } + + // for each doc num + for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + // TODO: roaring's API limits docNums to 32-bits? + if dropsI != nil && dropsI.Contains(uint32(docNum)) { + segNewDocNums[docNum] = docDropped + continue + } + + segNewDocNums[docNum] = newDocNum + + curr = 0 + metaBuf.Reset() + data = data[:0] + + posTemp := posBuf + + // collect all the data + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = typs[i][:0] + poss[i] = poss[i][:0] + } + err := segment.visitStoredFields(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := int(fieldsMap[field]) - 1 + if fieldID < 0 { + // no entry for field in fieldsMap + return false + } + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + + // copy array positions to preserve them beyond the scope of this callback + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) + + return true + }) + if err != nil { + return 0, nil, err + } + + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + storedFieldValues := vals[fieldID] + + stf := typs[fieldID] + spf := poss[fieldID] + + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncode, data) + if err2 != nil { + return 0, nil, err2 + } + } + + metaBytes := metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) + + // write out the meta len and compressed data len + _, err = writeUvarints(w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, nil, err + } + // now write the meta + _, err = w.Write(metaBytes) + 
if err != nil { + return 0, nil, err + } + // now write the _id field val (counted as part of the 'compressed' data) + _, err = w.Write(idFieldVal) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ + } + + rv = append(rv, segNewDocNums) + } + + // return value is the start of the stored index + storedIndexOffset := uint64(w.Count()) + + // now write out the stored doc index + for _, docNumOffset := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffset) + if err != nil { + return 0, nil, err + } + } + + return storedIndexOffset, rv, nil +} + +// copyStoredDocs writes out a segment's stored doc info, optimized by +// using a single Write() call for the entire set of bytes. The +// newDocNumOffsets is filled with the new offsets for each doc. +func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, + w *CountHashWriter) error { + if s.numDocs <= 0 { + return nil + } + + indexOffset0, storedOffset0, _, _, _ := + s.getDocStoredOffsets(0) // the segment's first doc + + indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := + s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc + + storedOffset0New := uint64(w.Count()) + + storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] + _, err := w.Write(storedBytes) + if err != nil { + return err + } + + // remap the storedOffset's for the docs into new offsets relative + // to storedOffset0New, filling the given docNumOffsetsOut array + for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New + newDocNumOffsets[newDocNum] = storedOffsetNew + newDocNum += 1 + } + + return nil +} + +// mergeFields builds a unified list of fields used across all the +// input segments, and computes whether the fields are the same across +// segments (which depends on fields to be sorted in the same way +// across segments) +func mergeFields(segments []*SegmentBase) (bool, []string) { + fieldsSame := true + + var segment0Fields []string + if len(segments) > 0 { + segment0Fields = segments[0].Fields() + } + + fieldsExist := map[string]struct{}{} + for _, segment := range segments { + fields := segment.Fields() + for fieldi, field := range fields { + fieldsExist[field] = struct{}{} + if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { + fieldsSame = false + } + } + } + + rv := make([]string, 0, len(fieldsExist)) + // ensure _id stays first + rv = append(rv, "_id") + for k := range fieldsExist { + if k != "_id" { + rv = append(rv, k) + } + } + + sort.Strings(rv[1:]) // leave _id as first + + return fieldsSame, rv +} + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/zapx/v16/new.go b/vendor/github.com/blevesearch/zapx/v16/new.go new file mode 100644 index 0000000000..94079eaf4c --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/new.go @@ -0,0 +1,440 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync" + "sync/atomic" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + +// ValidateDocFields can be set by applications to perform additional checks +// on fields in a document being added to a new segment, by default it does +// nothing. +// This API is experimental and may be removed at any time. +var ValidateDocFields = func(field index.Field) error { + return nil +} + +// New creates an in-memory zap-encoded SegmentBase from a set of Documents +func (z *ZapPlugin) New(results []index.Document) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode) +} + +func (*ZapPlugin) newWithChunkMode(results []index.Document, + chunkMode uint32) (segment.Segment, uint64, error) { + s := interimPool.Get().(*interim) + + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) + } + + s.results = results + s.chunkMode = chunkMode + s.w = NewCountHashWriter(&br) + + storedIndexOffset, dictOffsets, sectionsIndexOffset, err := s.convert() + if err != nil { + return nil, uint64(0), err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, + s.FieldsMap, s.FieldsInv, uint64(len(results)), + storedIndexOffset, dictOffsets, sectionsIndexOffset) + + // get the bytes written before the interim's reset() call + // write it to the newly formed segment base. 
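+ // (illustrative note, not in the upstream file: the count must be
+ // captured first because reset() below zeroes it via setBytesWritten(0).)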
+ totalBytesWritten := s.getBytesWritten() + if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) + sb.setBytesWritten(totalBytesWritten) + interimPool.Put(s) + } + + return sb, uint64(len(br.Bytes())), err +} + +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + +// interim holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []index.Document + + chunkMode uint32 + + w *CountHashWriter + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + metaBuf bytes.Buffer + + tmp0 []byte + tmp1 []byte + + lastNumDocs int + lastOutSize int + + // atomic access to this variable + bytesWritten uint64 + + opaque map[int]resetable +} + +func (s *interim) reset() (err error) { + s.results = nil + s.chunkMode = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = nil + s.metaBuf.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + s.lastOutSize = 0 + + // reset the bytes written stat count + // to avoid leaking of bytesWritten across reuse cycles. + s.setBytesWritten(0) + + if s.opaque != nil { + for _, v := range s.opaque { + err = v.Reset() + } + } else { + s.opaque = map[int]resetable{} + } + + return err +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 + numLocs int +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, []uint64, uint64, error) { + s.FieldsMap = map[string]uint16{} + + args := map[string]interface{}{ + "results": s.results, + "chunkMode": s.chunkMode, + } + if s.opaque == nil { + s.opaque = map[int]resetable{} + for i, x := range segmentSections { + s.opaque[int(i)] = x.InitOpaque(args) + } + } else { + for k, v := range args { + for _, op := range s.opaque { + op.Set(k, v) + } + } + } + + s.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range s.results { + result.VisitComposite(func(field index.CompositeField) { + s.getOrDefineField(field.Name()) + }) + result.VisitFields(func(field index.Field) { + s.getOrDefineField(field.Name()) + }) + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, nil, 0, err + } + + var dictOffsets []uint64 + + // we can persist the various sections at this point. + // the rule of thumb here is that each section must persist field wise. 
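+ // (illustrative note, not in the upstream file: each section writes its
+ // per-field data here; persistFieldsSection below then records, for each
+ // field, where every section's data begins.)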
+ for _, x := range segmentSections {
+ _, err = x.Persist(s.opaque, s.w)
+ if err != nil {
+ return 0, nil, 0, err
+ }
+ }
+
+ // after persisting the sections to the writer, account for the
+ // corresponding bytes written
+ for _, opaque := range s.opaque {
+ opaqueIO, ok := opaque.(segment.DiskStatsReporter)
+ if ok {
+ s.incrementBytesWritten(opaqueIO.BytesWritten())
+ }
+ }
+
+ if len(s.results) == 0 {
+ dictOffsets = make([]uint64, len(s.FieldsInv))
+ }
+
+ // we can persist a new fields section here
+ // this new fields section will point to the various indexes available
+ sectionsIndexOffset, err := persistFieldsSection(s.FieldsInv, s.w, dictOffsets, s.opaque)
+ if err != nil {
+ return 0, nil, 0, err
+ }
+
+ return storedIndexOffset, dictOffsets, sectionsIndexOffset, nil
+}
+
+func (s *interim) getOrDefineField(fieldName string) int {
+ fieldIDPlus1, exists := s.FieldsMap[fieldName]
+ if !exists {
+ fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
+ s.FieldsMap[fieldName] = fieldIDPlus1
+ s.FieldsInv = append(s.FieldsInv, fieldName)
+ }
+
+ return int(fieldIDPlus1 - 1)
+}
+
+func (s *interim) processDocuments() {
+ for docNum, result := range s.results {
+ s.processDocument(uint32(docNum), result)
+ }
+}
+
+func (s *interim) processDocument(docNum uint32,
+ result index.Document) {
+ // this callback is essentially going to be invoked on each field,
+ // as part of which preprocessing, accumulation etc. of the doc's data
+ // will take place.
+ visitField := func(field index.Field) {
+ fieldID := uint16(s.getOrDefineField(field.Name()))
+
+ // section specific processing of the field
+ for _, section := range segmentSections {
+ section.Process(s.opaque, docNum, field, fieldID)
+ }
+ }
+
+ // walk each composite field
+ result.VisitComposite(func(field index.CompositeField) {
+ visitField(field)
+ })
+
+ // walk each field
+ result.VisitFields(visitField)
+
+ // given that as part of visiting each field, there may be some kind of totalling
+ // or accumulation that can be updated, it becomes necessary to commit or
+ // put that totalling/accumulation into effect. However, for certain section
+ // types this particular step need not be valid, in which case it would be a
+ // no-op in the implementation of the section's process API.
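+ // (illustrative note, not in the upstream file: the nil field together
+ // with fieldID math.MaxUint16 is that per-document flush signal.)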
+ for _, section := range segmentSections { + section.Process(s.opaque, docNum, nil, math.MaxUint16) + } + +} + +func (s *interim) getBytesWritten() uint64 { + return atomic.LoadUint64(&s.bytesWritten) +} + +func (s *interim) incrementBytesWritten(val uint64) { + atomic.AddUint64(&s.bytesWritten, val) +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } + + data, compressed := s.tmp0[:0], s.tmp1[:0] + defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + var validationErr error + result.VisitFields(func(field index.Field) { + fieldID := uint16(s.getOrDefineField(field.Name())) + + if field.Options().IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, field.EncodedFieldType()) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + err := ValidateDocFields(field) + if err != nil && validationErr == nil { + validationErr = err + } + }) + if validationErr != nil { + return 0, validationErr + } + + var curr int + + s.metaBuf.Reset() + data = data[:0] + + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { + isf, exists := docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncode, data) + if err != nil { + return 0, err + } + } + } + + metaBytes := s.metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + s.incrementBytesWritten(uint64(len(compressed))) + docStoredOffsets[docNum] = uint64(s.w.Count()) + + _, err := writeUvarints(s.w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(metaBytes) + if err != nil { + return 0, err + } + + _, err = s.w.Write(idFieldVal) + if err != nil { + return 0, err + } + + _, err = s.w.Write(compressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) setBytesWritten(val uint64) { + atomic.StoreUint64(&s.bytesWritten, val) +} + +// returns the total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 
0x80 { + x >>= 7 + n++ + } + return n + 1 +} diff --git a/vendor/github.com/blevesearch/zapx/v16/plugin.go b/vendor/github.com/blevesearch/zapx/v16/plugin.go new file mode 100644 index 0000000000..f67297ec2f --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/plugin.go @@ -0,0 +1,27 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +// ZapPlugin implements the Plugin interface of +// the blevesearch/scorch_segment_api pkg +type ZapPlugin struct{} + +func (*ZapPlugin) Type() string { + return Type +} + +func (*ZapPlugin) Version() uint32 { + return Version +} diff --git a/vendor/github.com/blevesearch/zapx/v16/posting.go b/vendor/github.com/blevesearch/zapx/v16/posting.go new file mode 100644 index 0000000000..ad47df0dd6 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/posting.go @@ -0,0 +1,939 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + "math" + "reflect" + + "github.com/RoaringBitmap/roaring" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... 
+// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. +// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. + +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(1) + +// PostingsList is an in-memory representation of a postings list +type PostingsList struct { + sb *SegmentBase + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 + normBits1Hit uint64 + + chunkSize uint64 + + bytesRead uint64 +} + +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc segment.PostingsIterator) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) +} + +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, + rv *PostingsIterator) *PostingsIterator { + if rv == nil { + rv = &PostingsIterator{} + } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.reset() + } + + locReader := rv.locReader + if locReader != nil { + locReader.reset() + } + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf + } + + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm || includeLocs + rv.includeLocs = includeLocs + + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + 
rv.docNum1Hit = DocNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty + if p.postings == nil { + return rv + } + + // initialize freq chunk reader + if rv.includeFreqNorm { + rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset, rv.freqNormReader) + rv.incrementBytesRead(rv.freqNormReader.getBytesRead()) + } + + // initialize the loc chunk reader + if rv.includeLocs { + rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset, rv.locReader) + rv.incrementBytesRead(rv.locReader.getBytesRead()) + } + + rv.all = p.postings.Iterator() + if p.except != nil { + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = p.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. + } + + return rv +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var n, e uint64 + if p.normBits1Hit != 0 { + n = 1 + if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { + e = 1 + } + } else if p.postings != nil { + n = p.postings.GetCardinality() + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } + } + return n - e +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the postings lists stored +// on disk, while querying +func (p *PostingsList) ResetBytesRead(val uint64) { + p.bytesRead = val +} + +func (p *PostingsList) BytesRead() uint64 { + return p.bytesRead +} + +func (p *PostingsList) incrementBytesRead(val uint64) { + p.bytesRead += val +} + +func (p *PostingsList) BytesWritten() uint64 { + return 0 +} + +func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { + rv.postingsOffset = postingsOffset + + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] + + rv.incrementBytesRead(n + postingsLen) + + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } + _, err := rv.postings.FromBuffer(roaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + chunkSize, err := getChunkSize(d.sb.chunkMode, + rv.postings.GetCardinality(), d.sb.numDocs) + if err != nil { + return err + } else if chunkSize == 0 { + return fmt.Errorf("chunk size is zero, chunkMode: %v, numDocs: %v", + d.sb.chunkMode, d.sb.numDocs) + } + + rv.chunkSize = chunkSize + + return nil +} + +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.docNum1Hit = docNum + rv.normBits1Hit = normBits + + return nil +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntPeekable + Actual roaring.IntPeekable + ActualBM *roaring.Bitmap + + currChunk uint32 + freqNormReader *chunkedIntDecoder + locReader 
*chunkedIntDecoder + + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls + + docNum1Hit uint64 + normBits1Hit uint64 + + buf []byte + + includeFreqNorm bool + includeLocs bool + + bytesRead uint64 +} + +var emptyPostingsIterator = &PostingsIterator{} + +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + SizeOfPtr + + i.next.Size() + // account for freqNormReader, locReader if we start using this. + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the disk which includes +// the freqNorm and location specific information +// of a hit +func (i *PostingsIterator) ResetBytesRead(val uint64) { + i.bytesRead = val +} + +func (i *PostingsIterator) BytesRead() uint64 { + return i.bytesRead +} + +func (i *PostingsIterator) incrementBytesRead(val uint64) { + i.bytesRead += val +} + +func (i *PostingsIterator) BytesWritten() uint64 { + return 0 +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + if i.includeFreqNorm { + err := i.freqNormReader.loadChunk(chunk) + if err != nil { + return err + } + + // assign the bytes read at this point, since + // the postingsIterator is tracking only the chunk loaded + // and the cumulation is tracked correctly in the downstream + // intDecoder + i.ResetBytesRead(i.freqNormReader.getBytesRead()) + } + + if i.includeLocs { + err := i.locReader.loadChunk(chunk) + if err != nil { + return err + } + i.ResetBytesRead(i.locReader.getBytesRead()) + } + + i.currChunk = uint32(chunk) + return nil +} + +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + if freq == 0 { + return freq, 0, hasLocs, nil + } + + normBits, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + } + + return freq, normBits, hasLocs, nil +} + +func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) { + if i.normBits1Hit != 0 { + return false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return false, fmt.Errorf("error reading freqHasLocs: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + if freq == 0 { + return hasLocs, nil + } + + i.freqNormReader.SkipUvarint() // Skip normBits. + + return hasLocs, nil // See decodeFreqHasLocs() / hasLocs. +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs +} + +// readLocation processes all the integers on the stream representing a single +// location. 
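+// The stream layout, in order, with every integer uvarint-encoded: fieldID,
+// pos, start, end, numArrayPos, followed by numArrayPos array positions.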
+func (i *PostingsIterator) readLocation(l *Location) error {
+	// read off field
+	fieldID, err := i.locReader.readUvarint()
+	if err != nil {
+		return fmt.Errorf("error reading location field: %v", err)
+	}
+	// read off pos
+	pos, err := i.locReader.readUvarint()
+	if err != nil {
+		return fmt.Errorf("error reading location pos: %v", err)
+	}
+	// read off start
+	start, err := i.locReader.readUvarint()
+	if err != nil {
+		return fmt.Errorf("error reading location start: %v", err)
+	}
+	// read off end
+	end, err := i.locReader.readUvarint()
+	if err != nil {
+		return fmt.Errorf("error reading location end: %v", err)
+	}
+	// read off num array pos
+	numArrayPos, err := i.locReader.readUvarint()
+	if err != nil {
+		return fmt.Errorf("error reading location num array pos: %v", err)
+	}
+
+	l.field = i.postings.sb.fieldsInv[fieldID]
+	l.pos = pos
+	l.start = start
+	l.end = end
+
+	if cap(l.ap) < int(numArrayPos) {
+		l.ap = make([]uint64, int(numArrayPos))
+	} else {
+		l.ap = l.ap[:int(numArrayPos)]
+	}
+
+	// read off array positions
+	for k := 0; k < int(numArrayPos); k++ {
+		ap, err := i.locReader.readUvarint()
+		if err != nil {
+			return fmt.Errorf("error reading array position: %v", err)
+		}
+
+		l.ap[k] = ap
+	}
+
+	return nil
+}
+
+// Next returns the next posting on the postings list, or nil at the end
+func (i *PostingsIterator) Next() (segment.Posting, error) {
+	return i.nextAtOrAfter(0)
+}
+
+// Advance returns the posting at the specified docNum, or if it is not
+// present, the next posting; nil is returned when the end is reached
+func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) {
+	return i.nextAtOrAfter(docNum)
+}
+
+// nextAtOrAfter is the shared implementation behind Next and Advance: it
+// returns the next posting at or after atOrAfter, or nil at the end
+func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) {
+	docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter)
+	if err != nil || !exists {
+		return nil, err
+	}
+
+	i.next = Posting{} // clear the struct
+	rv := &i.next
+	rv.docNum = docNum
+
+	if !i.includeFreqNorm {
+		return rv, nil
+	}
+
+	var normBits uint64
+	var hasLocs bool
+
+	rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
+	if err != nil {
+		return nil, err
+	}
+
+	rv.norm = math.Float32frombits(uint32(normBits))
+
+	if i.includeLocs && hasLocs {
+		// prepare locations into reused slices, where we assume
+		// rv.freq >= "number of locs", since in a composite field,
+		// some component fields might have their IncludeTermVector
+		// flags disabled while other component fields are enabled
+		if rv.freq > 0 {
+			if cap(i.nextLocs) >= int(rv.freq) {
+				i.nextLocs = i.nextLocs[0:rv.freq]
+			} else {
+				i.nextLocs = make([]Location, rv.freq, rv.freq*2)
+			}
+			if cap(i.nextSegmentLocs) < int(rv.freq) {
+				i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2)
+			}
+			rv.locs = i.nextSegmentLocs[:0]
+		}
+
+		numLocsBytes, err := i.locReader.readUvarint()
+		if err != nil {
+			return nil, fmt.Errorf("error reading location numLocsBytes: %v", err)
+		}
+
+		j := 0
+		var nextLoc *Location
+		startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader
+		for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) {
+			if len(i.nextLocs) > j {
+				nextLoc = &i.nextLocs[j]
+			} else {
+				nextLoc = &Location{}
+			}
+
+			err := i.readLocation(nextLoc)
+			if err != nil {
+				return nil, err
+			}
+
+			rv.locs = append(rv.locs, nextLoc)
+			j++
+		}
+	}
+
+	return rv, nil
+}
+
+// nextDocNumAtOrAfter returns the next docNum on the postings list, and also
+// sets up the currChunk / loc related fields of
the iterator. +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == DocNum1HitFinished { + return 0, false, nil + } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings == emptyPostingsList { + // couldn't find anything + return 0, false, nil + } + + if i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() || !i.all.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := i.Actual.Next() + allN := i.all.Next() + nChunk := n / uint32(i.postings.chunkSize) + + // when allN becomes >= to here, then allN is in the same chunk as nChunk. + allNReachesNChunk := nChunk * uint32(i.postings.chunkSize) + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do + for allN != n { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, err + } + } + + if !i.all.HasNext() { + return 0, false, nil + } + + allN = i.all.Next() + } + + if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + +// nextBytes returns the docNum and the encoded freq & loc bytes for +// the next posting +func (i *PostingsIterator) nextBytes() ( + docNumOut uint64, freq uint64, normBits uint64, + bytesFreqNorm []byte, bytesLoc []byte, err error) { + docNum, exists, err := i.nextDocNumAtOrAfter(0) + if err != nil || !exists { + return 0, 0, 0, nil, nil, err + } + + if i.normBits1Hit != 0 { + if i.buf == nil { + i.buf = make([]byte, binary.MaxVarintLen64*2) + } + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) + n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) + return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil + } + + startFreqNorm := i.freqNormReader.remainingLen() + + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return 0, 0, 0, nil, nil, err + } + + endFreqNorm := i.freqNormReader.remainingLen() + bytesFreqNorm = i.freqNormReader.readBytes(startFreqNorm, endFreqNorm) + + if hasLocs { + startLoc := i.locReader.remainingLen() + + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return 0, 0, 0, nil, nil, + fmt.Errorf("error reading location nextBytes numLocs: %v", err) + } + + // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + + endLoc := i.locReader.remainingLen() + bytesLoc = i.locReader.readBytes(startLoc, endLoc) + } + + return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil +} + +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, 
bool, error) { + if !i.includeFreqNorm { + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return uint64(i.Actual.Next()), true, nil + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // # of times we called Next() in the same chunk + n := i.Actual.Next() + nChunk := n / uint32(i.postings.chunkSize) + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / uint32(i.postings.chunkSize) + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + hasLocs, err := i.skipFreqNormReadHasLocs() + if err != nil { + return err + } + + if i.includeLocs && hasLocs { + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + } + + return nil +} + +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// ActualBitmap returns the underlying actual bitmap +// which can be used up the stack for optimizations +func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap { + return p.ActualBM +} + +// ReplaceActual replaces the ActualBM with the provided +// bitmap +func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) { + p.ActualBM = abm + p.Actual = abm.Iterator() +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. +func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. 
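+// The returned iterator has normBits1Hit set to the non-zero NormBits1Hit
+// sentinel, which routes all subsequent iteration through the 1-hit fast path.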
+func PostingsIteratorFrom1Hit(docNum1Hit uint64, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: NormBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// Posting is a single entry in a postings list +type Posting struct { + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location +} + +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequencies of occurrence of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.freq +} + +// Norm returns the normalization factor for this posting +func (p *Posting) Norm() float64 { + return float64(float32(1.0 / math.Sqrt(float64(math.Float32bits(p.norm))))) +} + +// Locations returns the location information for each occurrence +func (p *Posting) Locations() []segment.Location { + return p.locs +} + +// NormUint64 returns the norm value as uint64 +func (p *Posting) NormUint64() uint64 { + return uint64(math.Float32bits(p.norm)) +} + +// Location represents the location of a single occurrence +type Location struct { + field string + pos uint64 + start uint64 + end uint64 + ap []uint64 +} + +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*SizeOfUint64 +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.field +} + +// Start returns the start byte offset of this occurrence +func (l *Location) Start() uint64 { + return l.start +} + +// End returns the end byte offset of this occurrence +func (l *Location) End() uint64 { + return l.end +} + +// Pos returns the 1-based phrase position of this occurrence +func (l *Location) Pos() uint64 { + return l.pos +} + +// ArrayPositions returns the array position vector associated with this occurrence +func (l *Location) ArrayPositions() []uint64 { + return l.ap +} diff --git a/vendor/github.com/blevesearch/zapx/v16/read.go b/vendor/github.com/blevesearch/zapx/v16/read.go new file mode 100644 index 0000000000..e47d4c6abd --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/read.go @@ -0,0 +1,43 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package zap
+
+import "encoding/binary"
+
+func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
+	_, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum)
+
+	meta := s.mem[storedOffset+n : storedOffset+n+metaLen]
+	data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen]
+
+	return meta, data
+}
+
+func (s *SegmentBase) getDocStoredOffsets(docNum uint64) (
+	uint64, uint64, uint64, uint64, uint64) {
+	indexOffset := s.storedIndexOffset + (8 * docNum)
+
+	storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
+
+	var n uint64
+
+	metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64])
+	n += uint64(read)
+
+	dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64])
+	n += uint64(read)
+
+	return indexOffset, storedOffset, n, metaLen, dataLen
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/section.go b/vendor/github.com/blevesearch/zapx/v16/section.go
new file mode 100644
index 0000000000..1ace25e3bf
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/section.go
@@ -0,0 +1,77 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zap
+
+import (
+	"sync"
+
+	"github.com/RoaringBitmap/roaring"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+type section interface {
+	// Process parses a specific field's content in a specific document. any
+	// tracking of processed data *specific to this section* should be done in
+	// the opaque, which will be passed to the Persist() API.
+	Process(opaque map[int]resetable, docNum uint32, f index.Field, fieldID uint16)
+
+	// Persist flushes the processed data in the opaque to the writer.
+	Persist(opaque map[int]resetable, w *CountHashWriter) (n int64, err error)
+
+	// AddrForField fetches the file offset of the given field's data within
+	// this section. it is used during search time to parse the section and
+	// fetch results from the specific "index" that's part of the section.
+	AddrForField(opaque map[int]resetable, fieldID int) int
+
+	// Merge, for every field in fieldsInv (relevant to this section), merges
+	// the section contents from all the segments into a single section data
+	// for the field. as part of the merge, it writes the merged data to the
+	// writer and also tracks the starting offset of this newly merged section
+	// data.
+	Merge(opaque map[int]resetable, segments []*SegmentBase, drops []*roaring.Bitmap, fieldsInv []string,
+		newDocNumsIn [][]uint64, w *CountHashWriter, closeCh chan struct{}) error
+
+	// the opaque is used to track data specific to this section. it is not
+	// visible to the other sections and is only visible to, and freely
+	// modifiable by, this specific section.
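+	// args carries optional construction parameters, which implementations
+	// hand to the opaque via its Set() API.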
+	InitOpaque(args map[string]interface{}) resetable
+}
+
+type resetable interface {
+	Reset() error
+	Set(key string, value interface{})
+}
+
+// -----------------------------------------------------------------------------
+
+const (
+	SectionInvertedTextIndex = iota
+	SectionFaissVectorIndex
+)
+
+// -----------------------------------------------------------------------------
+
+var (
+	segmentSectionsMutex sync.Mutex
+	// writes to segmentSections happen within init()s ONLY, under the lock;
+	// reads do not require the lock
+	segmentSections = make(map[uint16]section)
+)
+
+// Method to be invoked within init()s ONLY.
+func registerSegmentSection(key uint16, val section) {
+	segmentSectionsMutex.Lock()
+	segmentSections[key] = val
+	segmentSectionsMutex.Unlock()
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go
new file mode 100644
index 0000000000..2102fb5a9c
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go
@@ -0,0 +1,765 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package zap
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math"
+	"math/rand"
+	"sync/atomic"
+	"time"
+
+	"github.com/RoaringBitmap/roaring"
+	index "github.com/blevesearch/bleve_index_api"
+	faiss "github.com/blevesearch/go-faiss"
+	seg "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+func init() {
+	rand.Seed(time.Now().UTC().UnixNano())
+	registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{})
+	isFieldNotApplicableToInvertedTextSection = func(field index.Field) bool {
+		_, ok := field.(index.VectorField)
+		return ok
+	}
+}
+
+type faissVectorIndexSection struct {
+}
+
+func (v *faissVectorIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) {
+	if fieldID == math.MaxUint16 {
+		return
+	}
+
+	if vf, ok := field.(index.VectorField); ok {
+		vo := v.getvectorIndexOpaque(opaque)
+		vo.process(vf, fieldID, docNum)
+	}
+}
+
+func (v *faissVectorIndexSection) Persist(opaque map[int]resetable, w *CountHashWriter) (n int64, err error) {
+	vo := v.getvectorIndexOpaque(opaque)
+	// propagate any error from writing out the vector indexes
+	_, err = vo.writeVectorIndexes(w)
+	return 0, err
+}
+
+func (v *faissVectorIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int {
+	vo := v.getvectorIndexOpaque(opaque)
+	return vo.fieldAddrs[uint16(fieldID)]
+}
+
+// metadata corresponding to a serialized vector index
+type vecIndexMeta struct {
+	startOffset       int
+	indexSize         uint64
+	vecIds            []int64
+	indexOptimizedFor string
+}
+
+// Merge has to keep update and delete operations on vectors in mind: the new
+// doc number mappings and drops decide which vectors survive the merge.
+func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase,
+	drops []*roaring.Bitmap, fieldsInv []string,
+	newDocNumsIn [][]uint64, w *CountHashWriter, closeCh chan struct{}) error {
+	vo := v.getvectorIndexOpaque(opaque)
+
+	// the segments with valid vector sections in them;
+	// preallocating the space here helps avoid repeated allocation
+	// calls when the segment has many fields.
+	vecSegs := make([]*SegmentBase, 0, len(segments))
+	indexes := make([]*vecIndexMeta, 0, len(segments))
+
+	for fieldID, fieldName := range fieldsInv {
+		indexes = indexes[:0] // truncate the slices for reuse
+		vecSegs = vecSegs[:0]
+		vecToDocID := make(map[int64]uint64)
+
+		// todo: would fetching the following data from the segments in
+		// parallel be beneficial in terms of perf?
+		for segI, sb := range segments {
+			if isClosed(closeCh) {
+				return seg.ErrClosed
+			}
+			if _, ok := sb.fieldsMap[fieldName]; !ok {
+				continue
+			}
+
+			// check if the section address is a valid one for "fieldName" in the
+			// segment sb. the local fieldID (fetched by the fieldsMap of the sb)
+			// is to be used while consulting the fieldsSectionsMap
+			pos := int(sb.fieldsSectionsMap[sb.fieldsMap[fieldName]-1][SectionFaissVectorIndex])
+			if pos == 0 {
+				continue
+			}
+
+			// skip the two doc value offsets required by the sections format;
+			// they are never valid for the vector section
+			_, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+			pos += n
+
+			_, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+			pos += n
+
+			// the index optimization type represented as an int
+			indexOptimizationTypeInt, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+			pos += n
+
+			numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+			pos += n
+
+			vecSegs = append(vecSegs, sb)
+			indexes = append(indexes, &vecIndexMeta{
+				vecIds:            make([]int64, 0, numVecs),
+				indexOptimizedFor: index.VectorIndexOptimizationsReverseLookup[int(indexOptimizationTypeInt)],
+			})
+
+			curIdx := len(indexes) - 1
+			for i := 0; i < int(numVecs); i++ {
+				vecID, n := binary.Varint(sb.mem[pos : pos+binary.MaxVarintLen64])
+				pos += n
+
+				docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
+				pos += n
+
+				// remap the docID from the old segment to the new doc
+				// numbering provided; doc nums that were dropped in that
+				// segment are discarded
+				var vecIDNotDeleted bool // indicates if the vector ID was not deleted.
+				var newDocID uint64      // new docID in the new segment
+				if newDocNumsIn[segI][uint32(docID)] != docDropped {
+					newDocID = newDocNumsIn[segI][uint32(docID)]
+					vecIDNotDeleted = true
+				}
+				// if the remapped doc ID is valid, track it as part of the
+				// vecs to be reconstructed (for larger indexes); this accounts
+				// for only the valid vector IDs, so deleted ones won't be
+				// reconstructed into the final index.
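+				// e.g. (hypothetical numbers): if this segment maps old doc 7
+				// to new doc 3 and old doc 9 to docDropped, only the vecIDs
+				// stored against doc 7 enter vecToDocID and
+				// indexes[curIdx].vecIds.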
+ if vecIDNotDeleted { + vecToDocID[vecID] = newDocID + indexes[curIdx].vecIds = append(indexes[curIdx].vecIds, vecID) + } + } + + indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + indexes[curIdx].startOffset = pos + indexes[curIdx].indexSize = indexSize + pos += int(indexSize) + } + + err := vo.flushSectionMetadata(fieldID, w, vecToDocID, indexes) + if err != nil { + return err + } + err = vo.mergeAndWriteVectorIndexes(vecSegs, indexes, w, closeCh) + if err != nil { + return err + } + } + + return nil +} + +func (v *vectorIndexOpaque) flushSectionMetadata(fieldID int, w *CountHashWriter, + vecToDocID map[int64]uint64, indexes []*vecIndexMeta) error { + tempBuf := v.grabBuf(binary.MaxVarintLen64) + + // early exit if there are absolutely no valid vectors present in the segment + // and crucially don't store the section start offset in it + if len(indexes) == 0 || len(vecToDocID) == 0 { + return nil + } + fieldStart := w.Count() + // marking the fact that for vector index, doc values isn't valid by + // storing fieldNotUniverted values. + n := binary.PutUvarint(tempBuf, uint64(fieldNotUninverted)) + _, err := w.Write(tempBuf[:n]) + if err != nil { + return err + } + n = binary.PutUvarint(tempBuf, uint64(fieldNotUninverted)) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + n = binary.PutUvarint(tempBuf, uint64(index.SupportedVectorIndexOptimizations[indexes[0].indexOptimizedFor])) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // write the number of unique vectors + n = binary.PutUvarint(tempBuf, uint64(len(vecToDocID))) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + for vecID, docID := range vecToDocID { + // write the vecID + n = binary.PutVarint(tempBuf, vecID) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // write the docID + n = binary.PutUvarint(tempBuf, docID) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + } + + v.fieldAddrs[uint16(fieldID)] = fieldStart + return nil +} + +func (v *vectorIndexOpaque) flushVectorIndex(indexBytes []byte, w *CountHashWriter) error { + tempBuf := v.grabBuf(binary.MaxVarintLen64) + + n := binary.PutUvarint(tempBuf, uint64(len(indexBytes))) + _, err := w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // write the vector index data + _, err = w.Write(indexBytes) + if err != nil { + return err + } + + return nil +} + +// Divide the estimated nprobe with this value to optimize +// for latency. +const nprobeLatencyOptimization = 2 + +// Calculates the nprobe count, given nlist(number of centroids) based on +// the metric the index is optimized for. +func calculateNprobe(nlist int, indexOptimizedFor string) int32 { + nprobe := int32(math.Sqrt(float64(nlist))) + if indexOptimizedFor == index.IndexOptimizedForLatency { + nprobe /= nprobeLatencyOptimization + if nprobe < 1 { + nprobe = 1 + } + } + return nprobe +} + +// todo: naive implementation. need to keep in mind the perf implications and improve on this. +// perhaps, parallelized merging can help speed things up over here. 
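+// The overall flow: read each segment's serialized faiss index, reconstruct
+// only the still-live vectors from it, concatenate the reconstructions, and
+// train/build a single fresh index over the combined data.
+//
+// A worked example with hypothetical sizes: 100000 live vectors after the
+// merge gives nlist = 1000 (nvecs/100) from determineCentroids, an
+// "IVF1000,SQ8" description from determineIndexToUse, and, from
+// calculateNprobe, nprobe = sqrt(1000) ~= 31, halved to 15 when the index is
+// optimized for latency.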
+func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase, + indexes []*vecIndexMeta, w *CountHashWriter, closeCh chan struct{}) error { + + vecIndexes := make([]*faiss.IndexImpl, 0, len(sbs)) + reconsCap := 0 + for segI, segBase := range sbs { + // Considering merge operations on vector indexes are expensive, it is + // worth including an early exit if the merge is aborted, saving us + // the resource spikes, even if temporary. + if isClosed(closeCh) { + freeReconstructedIndexes(vecIndexes) + return seg.ErrClosed + } + // read the index bytes. todo: parallelize this + indexBytes := segBase.mem[indexes[segI].startOffset : indexes[segI].startOffset+int(indexes[segI].indexSize)] + index, err := faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + indexReconsLen := len(indexes[segI].vecIds) * index.D() + if indexReconsLen > reconsCap { + reconsCap = indexReconsLen + } + vecIndexes = append(vecIndexes, index) + } + + // no vector indexes to merge + if len(vecIndexes) == 0 { + return nil + } + + var mergedIndexBytes []byte + + // capacities for the finalVecIDs and indexData slices + // to avoid multiple allocations, via append. + finalVecIDCap := len(indexes[0].vecIds) * len(vecIndexes) + indexDataCap := finalVecIDCap * vecIndexes[0].D() + + finalVecIDs := make([]int64, 0, finalVecIDCap) + // merging of indexes with reconstruction method. + // the indexes[i].vecIds has only the valid vecs of this vector + // index present in it, so we'd be reconstructing only those. + indexData := make([]float32, 0, indexDataCap) + // reusable buffer for reconstruction + recons := make([]float32, 0, reconsCap) + var err error + for i := 0; i < len(vecIndexes); i++ { + if isClosed(closeCh) { + freeReconstructedIndexes(vecIndexes) + return seg.ErrClosed + } + + // reconstruct the vectors only if present, it could be that + // some of the indexes had all of their vectors updated/deleted. + if len(indexes[i].vecIds) > 0 { + neededReconsLen := len(indexes[i].vecIds) * vecIndexes[i].D() + recons = recons[:neededReconsLen] + // todo: parallelize reconstruction + recons, err = vecIndexes[i].ReconstructBatch(indexes[i].vecIds, recons) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + indexData = append(indexData, recons...) + // Adding vector IDs in the same order as the vectors + finalVecIDs = append(finalVecIDs, indexes[i].vecIds...) + } + } + + if len(indexData) == 0 { + // no valid vectors for this index, so we don't even have to + // record it in the section + freeReconstructedIndexes(vecIndexes) + return nil + } + + nvecs := len(finalVecIDs) + + // index type to be created after merge based on the number of vectors in + // indexData added into the index. + nlist := determineCentroids(nvecs) + indexDescription, indexClass := determineIndexToUse(nvecs, nlist) + + // safe to assume that all the indexes are of the same config values, given + // that they are extracted from the field mapping info. + dims := vecIndexes[0].D() + metric := vecIndexes[0].MetricType() + indexOptimizedFor := indexes[0].indexOptimizedFor + + // freeing the reconstructed indexes immediately - waiting till the end + // to do the same is not needed because the following operations don't need + // the reconstructed ones anymore and doing so will hold up memory which can + // be detrimental while creating indexes during introduction. 
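+	// (note: go-faiss wraps the C faiss library, so these index objects hold
+	// memory outside the Go heap and must be released explicitly via Close()
+	// rather than by the garbage collector.)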
+ freeReconstructedIndexes(vecIndexes) + + faissIndex, err := faiss.IndexFactory(dims, indexDescription, metric) + if err != nil { + return err + } + defer faissIndex.Close() + + if indexClass == IndexTypeIVF { + // the direct map maintained in the IVF index is essential for the + // reconstruction of vectors based on vector IDs in the future merges. + // the AddWithIDs API also needs a direct map to be set before using. + err = faissIndex.SetDirectMap(2) + if err != nil { + return err + } + + nprobe := calculateNprobe(nlist, indexOptimizedFor) + faissIndex.SetNProbe(nprobe) + + // train the vector index, essentially performs k-means clustering to partition + // the data space of indexData such that during the search time, we probe + // only a subset of vectors -> non-exhaustive search. could be a time + // consuming step when the indexData is large. + err = faissIndex.Train(indexData) + if err != nil { + return err + } + } + + err = faissIndex.AddWithIDs(indexData, finalVecIDs) + if err != nil { + return err + } + + mergedIndexBytes, err = faiss.WriteIndexIntoBuffer(faissIndex) + if err != nil { + return err + } + + err = v.flushVectorIndex(mergedIndexBytes, w) + if err != nil { + return err + } + + return nil +} + +// todo: can be parallelized. +func freeReconstructedIndexes(indexes []*faiss.IndexImpl) { + for _, index := range indexes { + index.Close() + } +} + +// todo: is it possible to merge this resuable stuff with the interim's tmp0? +func (v *vectorIndexOpaque) grabBuf(size int) []byte { + buf := v.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + v.tmp0 = buf + } + return buf[0:size] +} + +// Determines the number of centroids to use for an IVF index. +func determineCentroids(nvecs int) int { + var nlist int + + switch { + // At 1M vectors, nlist = 4k gave a reasonably high recall with the right nprobe, + // whereas 1M/100 = 10000 centroids would increase training time without + // corresponding increase in recall + case nvecs >= 1000000: + nlist = int(4 * math.Sqrt(float64(nvecs))) + case nvecs >= 1000: + // 100 points per cluster is a reasonable default, considering the default + // minimum and maximum points per cluster is 39 and 256 respectively. + // Since it's a recommendation to have a minimum of 10 clusters, 1000(100 * 10) + // was chosen as the lower threshold. + nlist = nvecs / 100 + } + return nlist +} + +const ( + IndexTypeFlat = iota + IndexTypeIVF +) + +// Returns a description string for the index and quantizer type +// and an index type. +func determineIndexToUse(nvecs, nlist int) (string, int) { + switch { + case nvecs >= 10000: + return fmt.Sprintf("IVF%d,SQ8", nlist), IndexTypeIVF + case nvecs >= 1000: + return fmt.Sprintf("IVF%d,Flat", nlist), IndexTypeIVF + default: + return "IDMap2,Flat", IndexTypeFlat + } +} + +func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint64, err error) { + // for every fieldID, contents to store over here are: + // 1. the serialized representation of the dense vector index. + // 2. its constituent vectorID -> {docID} mapping. + tempBuf := vo.grabBuf(binary.MaxVarintLen64) + for fieldID, content := range vo.vecFieldMap { + // calculate the capacity of the vecs and ids slices + // to avoid multiple allocations. + vecs := make([]float32, 0, uint16(len(content.vecs))*content.dim) + ids := make([]int64, 0, len(content.vecs)) + for hash, vecInfo := range content.vecs { + vecs = append(vecs, vecInfo.vec...) 
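+			// the map key is the vector's hash, which doubles as its faiss
+			// vector ID (see hashCode() below)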
+			ids = append(ids, int64(hash))
+		}
+
+		var metric = faiss.MetricL2
+		if content.metric == index.CosineSimilarity {
+			metric = faiss.MetricInnerProduct
+		}
+
+		nvecs := len(ids)
+		nlist := determineCentroids(nvecs)
+		indexDescription, indexClass := determineIndexToUse(nvecs, nlist)
+		faissIndex, err := faiss.IndexFactory(int(content.dim), indexDescription, metric)
+		if err != nil {
+			return 0, err
+		}
+
+		defer faissIndex.Close()
+
+		if indexClass == IndexTypeIVF {
+			err = faissIndex.SetDirectMap(2)
+			if err != nil {
+				return 0, err
+			}
+
+			nprobe := calculateNprobe(nlist, content.indexOptimizedFor)
+			faissIndex.SetNProbe(nprobe)
+
+			err = faissIndex.Train(vecs)
+			if err != nil {
+				return 0, err
+			}
+		}
+
+		err = faissIndex.AddWithIDs(vecs, ids)
+		if err != nil {
+			return 0, err
+		}
+
+		// serialize the built index into a byte slice
+		buf, err := faiss.WriteIndexIntoBuffer(faissIndex)
+		if err != nil {
+			return 0, err
+		}
+
+		fieldStart := w.Count()
+		// writing out two offset values to indicate that the current field's
+		// vector section doesn't have valid doc value content within it.
+		n := binary.PutUvarint(tempBuf, uint64(fieldNotUninverted))
+		_, err = w.Write(tempBuf[:n])
+		if err != nil {
+			return 0, err
+		}
+		n = binary.PutUvarint(tempBuf, uint64(fieldNotUninverted))
+		_, err = w.Write(tempBuf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		n = binary.PutUvarint(tempBuf, uint64(index.SupportedVectorIndexOptimizations[content.indexOptimizedFor]))
+		_, err = w.Write(tempBuf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		// write the number of unique vectors
+		n = binary.PutUvarint(tempBuf, uint64(faissIndex.Ntotal()))
+		_, err = w.Write(tempBuf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		// fixme: this can cause write amplification; needs improvement.
+		// todo: the layout may need reformatting to better suit mmap access.
+		// idea: storing all the ID mappings towards the end of the section
+		// would avoid paging in this data alongside the non-cacheable index
+		// bytes, reducing paging costs.
+		for vecID := range content.vecs {
+			docID := vo.vecIDMap[vecID].docID
+			// write the vecID
+			n = binary.PutVarint(tempBuf, vecID)
+			_, err = w.Write(tempBuf[:n])
+			if err != nil {
+				return 0, err
+			}
+
+			n = binary.PutUvarint(tempBuf, uint64(docID))
+			_, err = w.Write(tempBuf[:n])
+			if err != nil {
+				return 0, err
+			}
+		}
+
+		// write the length of the index bytes, followed by the index bytes
+		// themselves
+		n = binary.PutUvarint(tempBuf, uint64(len(buf)))
+		_, err = w.Write(tempBuf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		// write the vector index data
+		_, err = w.Write(buf)
+		if err != nil {
+			return 0, err
+		}
+
+		// account for all the data written out to the writer for this field,
+		// and record the fieldStart value for this section.
+		vo.incrementBytesWritten(uint64(w.Count() - fieldStart))
+		vo.fieldAddrs[fieldID] = fieldStart
+	}
+	return 0, nil
+}
+
+func (vo *vectorIndexOpaque) process(field index.VectorField, fieldID uint16, docNum uint32) {
+	if !vo.init {
+		vo.realloc()
+		vo.init = true
+	}
+	if fieldID == math.MaxUint16 {
+		// doc processing checkpoint. currently nothing to do
+		return
+	}
+
+	// process the field
+	vec := field.Vector()
+	dim := field.Dims()
+	metric := field.Similarity()
+	indexOptimizedFor := field.IndexOptimizedFor()
+
+	// the caller is supposed to make sure len(vec) is a multiple of dim;
+	// not double-checking it here to avoid the overhead.
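+	// a multi-vector field arrives flattened: with dim = 3 (hypothetical
+	// numbers), vec = [a1 a2 a3 b1 b2 b3] carries two sub-vectors, a and b,
+	// and numSubVecs below comes out to 2.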
+ numSubVecs := len(vec) / dim + for i := 0; i < numSubVecs; i++ { + subVec := vec[i*dim : (i+1)*dim] + + // NOTE: currently, indexing only unique vectors. + subVecHash := hashCode(subVec) + if _, ok := vo.vecIDMap[subVecHash]; !ok { + vo.vecIDMap[subVecHash] = &vecInfo{ + docID: docNum, + } + } + + // tracking the unique vectors for every field which will be used later + // to construct the vector index. + if _, ok := vo.vecFieldMap[fieldID]; !ok { + vo.vecFieldMap[fieldID] = &indexContent{ + vecs: map[int64]*vecInfo{ + subVecHash: &vecInfo{ + vec: subVec, + }, + }, + dim: uint16(dim), + metric: metric, + indexOptimizedFor: indexOptimizedFor, + } + } else { + vo.vecFieldMap[fieldID].vecs[subVecHash] = &vecInfo{ + vec: subVec, + } + } + } +} + +// todo: better hash function? +// keep the perf aspects in mind with respect to the hash function. +// Uses a time based seed to prevent 2 identical vectors in different +// segments from having the same hash (which otherwise could cause an +// issue when merging those segments) +func hashCode(a []float32) int64 { + var rv, sum int64 + for _, v := range a { + // Weighing each element of the vector differently to minimise chance + // of collisions between non identical vectors. + sum = int64(math.Float32bits(v)) + sum*31 + } + + // Similar to getVectorCode(), this uses the first 32 bits for the vector sum + // and the last 32 for a random 32-bit int to ensure identical vectors have + // unique hashes. + rv = sum<<32 | int64(rand.Int31()) + return rv +} + +func (v *faissVectorIndexSection) getvectorIndexOpaque(opaque map[int]resetable) *vectorIndexOpaque { + if _, ok := opaque[SectionFaissVectorIndex]; !ok { + opaque[SectionFaissVectorIndex] = v.InitOpaque(nil) + } + return opaque[SectionFaissVectorIndex].(*vectorIndexOpaque) +} + +func (v *faissVectorIndexSection) InitOpaque(args map[string]interface{}) resetable { + rv := &vectorIndexOpaque{ + fieldAddrs: make(map[uint16]int), + vecIDMap: make(map[int64]*vecInfo), + vecFieldMap: make(map[uint16]*indexContent), + } + for k, v := range args { + rv.Set(k, v) + } + + return rv +} + +type indexContent struct { + vecs map[int64]*vecInfo + dim uint16 + metric string + indexOptimizedFor string +} + +type vecInfo struct { + vec []float32 + docID uint32 +} + +type vectorIndexOpaque struct { + init bool + + bytesWritten uint64 + + lastNumVecs int + lastNumFields int + + // maps the field to the address of its vector section + fieldAddrs map[uint16]int + + // maps the vecID to basic info involved around it such as + // the docID its present in and the vector itself + vecIDMap map[int64]*vecInfo + // maps the field to information necessary for its vector + // index to be build. + vecFieldMap map[uint16]*indexContent + + tmp0 []byte +} + +func (v *vectorIndexOpaque) realloc() { + // when an opaque instance is reused, the two maps are pre-allocated + // with space before they were reset. this can be useful in continuous + // mutation scenarios, where the batch sizes are more or less same. 
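+	// lastNumVecs and lastNumFields are captured by Reset() just before the
+	// maps are released, so the sizes used here reflect the previous batch.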
+ v.vecFieldMap = make(map[uint16]*indexContent, v.lastNumFields) + v.vecIDMap = make(map[int64]*vecInfo, v.lastNumVecs) + v.fieldAddrs = make(map[uint16]int, v.lastNumFields) +} + +func (v *vectorIndexOpaque) incrementBytesWritten(val uint64) { + atomic.AddUint64(&v.bytesWritten, val) +} + +func (v *vectorIndexOpaque) BytesWritten() uint64 { + return atomic.LoadUint64(&v.bytesWritten) +} + +func (v *vectorIndexOpaque) BytesRead() uint64 { + return 0 +} + +func (v *vectorIndexOpaque) ResetBytesRead(uint64) { +} + +// cleanup stuff over here for reusability +func (v *vectorIndexOpaque) Reset() (err error) { + // tracking the number of vecs and fields processed and tracked in this + // opaque, for better allocations of the maps + v.lastNumVecs = len(v.vecIDMap) + v.lastNumFields = len(v.vecFieldMap) + + v.init = false + v.fieldAddrs = nil + v.vecFieldMap = nil + v.vecIDMap = nil + v.tmp0 = v.tmp0[:0] + + atomic.StoreUint64(&v.bytesWritten, 0) + + return nil +} + +func (v *vectorIndexOpaque) Set(key string, val interface{}) { +} diff --git a/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go b/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go new file mode 100644 index 0000000000..d19dc9453d --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go @@ -0,0 +1,1016 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync/atomic" + + "github.com/RoaringBitmap/roaring" + index "github.com/blevesearch/bleve_index_api" + seg "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" +) + +func init() { + registerSegmentSection(SectionInvertedTextIndex, &invertedTextIndexSection{}) +} + +type invertedTextIndexSection struct { +} + +// this function is something that tells the inverted index section whether to +// process a particular field or not - since it might be processed by another +// section this function helps in avoiding unnecessary work. 
+// (only used by faiss vector section currently, will need a separate API for every +// section we introduce in the future or a better way forward - TODO) +var isFieldNotApplicableToInvertedTextSection func(field index.Field) bool + +func (i *invertedTextIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) { + if isFieldNotApplicableToInvertedTextSection == nil || + !isFieldNotApplicableToInvertedTextSection(field) { + invIndexOpaque := i.getInvertedIndexOpaque(opaque) + invIndexOpaque.process(field, fieldID, docNum) + } +} + +func (i *invertedTextIndexSection) Persist(opaque map[int]resetable, w *CountHashWriter) (n int64, err error) { + invIndexOpaque := i.getInvertedIndexOpaque(opaque) + _, err = invIndexOpaque.writeDicts(w) + return 0, err +} + +func (i *invertedTextIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int { + invIndexOpaque := i.getInvertedIndexOpaque(opaque) + return invIndexOpaque.fieldAddrs[fieldID] +} + +func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32, + w *CountHashWriter, closeCh chan struct{}) (map[int]int, uint64, error) { + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 + + var postings *PostingsList + var postItr *PostingsIterator + + fieldAddrs := make(map[int]int) + dictOffsets := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + + var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, 0, err + } + + newRoaring := roaring.NewBitmap() + newDocNums := make([][]uint64, 0, len(segments)) + drops := make([]*roaring.Bitmap, 0, len(segments)) + dicts := make([]*Dictionary, 0, len(segments)) + itrs := make([]vellum.Iterator, 0, len(segments)) + segmentsInFocus := make([]*SegmentBase, 0, len(segments)) + // for each field + for fieldID, fieldName := range fieldsInv { + // collect FST iterators from all active segments for this field + newDocNums = newDocNums[:0] + drops = drops[:0] + dicts = dicts[:0] + itrs = itrs[:0] + segmentsInFocus = segmentsInFocus[:0] + for segmentI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + dict, err2 := segment.dictionary(fieldName) + if err2 != nil { + return nil, 0, err2 + } + if dict != nil && dict.fst != nil { + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, 0, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } + dicts = append(dicts, dict) + itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) + } + } + } + + var prevTerm []byte + + newRoaring.Clear() + + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in 
only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + return true, docNum, lastNorm + } + } + return false, 0, 0 + } + + finishTerm := func(term []byte) error { + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } + + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + + newRoaring.Clear() + + tfEncoder.Reset() + locEncoder.Reset() + + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + } + if !bytes.Equal(prevTerm, term) || prevTerm == nil { + // compute cardinality of field-term in new seg + var newCard uint64 + lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues() + for i, idx := range lowItrIdxs { + pl, err := dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil) + if err != nil { + return nil, 0, err + } + newCard += pl.Count() + } + // compute correct chunk size with this + chunkSize, err := getChunkSize(chunkMode, newCard, newSegDocCount) + if err != nil { + return nil, 0, err + } + // update encoders chunk + tfEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + locEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + } + + postings, err = dicts[itrI].postingsListFromOffset( + postingsOffset, drops[itrI], postings) + if err != nil { + return nil, 0, err + } + + postItr = postings.iterator(true, true, true, postItr) + + if fieldsSame { + // can optimize by copying freq/norm/loc bytes directly + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder) + } else { + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder, bufLoc) + } + if err != nil { + return nil, 0, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) 
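+			// advance the merged view to the next term/segment pairing;
+			// segments tied on the same term are visited one by one before
+			// the enumerator moves past that term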
+ + err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, 0, err + } + // close the enumerator to free the underlying iterators + err = enumerator.Close() + if err != nil { + return nil, 0, err + } + + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + + dictOffset := uint64(w.Count()) + + err = newVellum.Close() + if err != nil { + return nil, 0, err + } + vellumData := vellumBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, 0, err + } + + dictOffsets[fieldID] = dictOffset + + fieldDvLocsStart[fieldID] = uint64(w.Count()) + + // update the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, 0, err + } + fdvEncoder := newChunkedContentCoder(chunkSize, newSegDocCount-1, w, true) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[SectionInvertedTextIndex][fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) + if err != nil { + return nil, 0, err + } + } + } + + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } + + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } + + fieldStart := w.Count() + + n = binary.PutUvarint(bufMaxVarintLen64, fieldDvLocsStart[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, fieldDvLocsEnd[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, dictOffsets[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, 0, err + } + + fieldAddrs[fieldID] = fieldStart + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, 0, err + } + } + + fieldDvLocsOffset := uint64(w.Count()) + + return fieldAddrs, fieldDvLocsOffset, nil +} + +func (i *invertedTextIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase, + drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64, + w *CountHashWriter, closeCh chan struct{}) error { + io := i.getInvertedIndexOpaque(opaque) + fieldAddrs, _, err := mergeAndPersistInvertedSection(segments, drops, fieldsInv, + io.FieldsMap, io.fieldsSame, newDocNumsIn, io.numDocs, io.chunkMode, w, closeCh) + if err != nil { + return err + } + + 
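+	// record each field's section start offset - AddrForField serves these
+	// back to persistFieldsSection when the fields/sections index is written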
io.fieldAddrs = fieldAddrs + return nil +} + +func (i *invertedIndexOpaque) grabBuf(size int) []byte { + buf := i.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + i.tmp0 = buf + } + return buf[:size] +} + +func (i *invertedIndexOpaque) incrementBytesWritten(bytes uint64) { + atomic.AddUint64(&i.bytesWritten, bytes) +} + +func (i *invertedIndexOpaque) BytesWritten() uint64 { + return atomic.LoadUint64(&i.bytesWritten) +} + +func (i *invertedIndexOpaque) BytesRead() uint64 { + return 0 +} + +func (i *invertedIndexOpaque) ResetBytesRead(uint64) {} + +func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uint64, err error) { + + if io.results == nil || len(io.results) == 0 { + return nil, nil + } + + dictOffsets = make([]uint64, len(io.FieldsInv)) + + fdvOffsetsStart := make([]uint64, len(io.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(io.FieldsInv)) + + buf := io.grabBuf(binary.MaxVarintLen64) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, uint64(len(io.results)-1)) + locEncoder := newChunkedIntCoder(1024, uint64(len(io.results)-1)) + + var docTermMap [][]byte + + if io.builder == nil { + io.builder, err = vellum.New(&io.builderBuf, nil) + if err != nil { + return nil, err + } + } + + for fieldID, terms := range io.DictKeys { + if cap(docTermMap) < len(io.results) { + docTermMap = make([][]byte, len(io.results)) + } else { + docTermMap = docTermMap[:len(io.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := io.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := io.Postings[pid] + + freqNorms := io.FreqNorms[pid] + freqNormOffset := 0 + + locs := io.Locs[pid] + locOffset := 0 + + chunkSize, err := getChunkSize(io.chunkMode, postingsBS.GetCardinality(), uint64(len(io.results))) + if err != nil { + return nil, err + } + tfEncoder.SetChunkSize(chunkSize, uint64(len(io.results)-1)) + locEncoder.SetChunkSize(chunkSize, uint64(len(io.results)-1)) + + postingsItr := postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + // check if freq/norm is enabled + if freqNorm.freq > 0 { + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), + uint64(math.Float32bits(freqNorm.norm))) + } else { + // if disabled, then skip the norm part + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0)) + } + if err != nil { + return nil, err + } + + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return nil, err + } + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return nil, err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) 
+ if err != nil { + return nil, err + } + } + locOffset += freqNorm.numLocs + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + termSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + io.incrementBytesWritten(locEncoder.getBytesWritten()) + io.incrementBytesWritten(tfEncoder.getBytesWritten()) + + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, w, buf) + if err != nil { + return nil, err + } + + if postingsOffset > uint64(0) { + err = io.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return nil, err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = io.builder.Close() + if err != nil { + return nil, err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(w.Count()) + + vellumData := io.builderBuf.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + io.incrementBytesWritten(uint64(len(vellumData))) + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, err + } + + // reset vellum for reuse + io.builderBuf.Reset() + + err = io.builder.Reset(&io.builderBuf) + if err != nil { + return nil, err + } + + // write the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + + fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false) + if io.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return nil, err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return nil, err + } + + io.incrementBytesWritten(fdvEncoder.getBytesWritten()) + + fdvOffsetsStart[fieldID] = uint64(w.Count()) + + _, err = fdvEncoder.Write() + if err != nil { + return nil, err + } + + fdvOffsetsEnd[fieldID] = uint64(w.Count()) + fdvEncoder.Reset() + } else { + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted + } + + fieldStart := w.Count() + + n = binary.PutUvarint(buf, fdvOffsetsStart[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + n = binary.PutUvarint(buf, fdvOffsetsEnd[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + n = binary.PutUvarint(buf, dictOffsets[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + io.fieldAddrs[fieldID] = fieldStart + } + + return dictOffsets, nil +} + +func (io *invertedIndexOpaque) process(field index.Field, fieldID uint16, docNum uint32) { + if !io.init && io.results != nil { + io.realloc() + io.init = true + } + + // if the fieldID is MaxUint16, it's mainly indicated that the caller has + // finished invoking the process() for every field on that doc. 
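+// (illustrative only: the segment writer would make a final call along the
+// lines of io.process(nil, math.MaxUint16, docNum) once every real field of
+// the doc has been processed; the branch below then flushes the accumulated
+// term frequencies for the doc into the postings structures)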
+ if fieldID == math.MaxUint16 { + for fid, tfs := range io.reusableFieldTFs { + dict := io.Dicts[fid] + norm := math.Float32frombits(uint32(io.reusableFieldLens[fid])) + + for term, tf := range tfs { + pid := dict[term] - 1 + bs := io.Postings[pid] + bs.Add(uint32(docNum)) + + io.FreqNorms[pid] = append(io.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + numLocs: len(tf.Locations), + }) + + if len(tf.Locations) > 0 { + locs := io.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fid) + if loc.Field != "" { + locf = uint16(io.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + io.Locs[pid] = locs + } + } + } + for i := 0; i < len(io.FieldsInv); i++ { // clear these for reuse + io.reusableFieldLens[i] = 0 + io.reusableFieldTFs[i] = nil + } + return + } + + io.reusableFieldLens[fieldID] += field.AnalyzedLength() + existingFreqs := io.reusableFieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(field.Name(), field.AnalyzedTokenFrequencies()) + } else { + io.reusableFieldTFs[fieldID] = field.AnalyzedTokenFrequencies() + } +} + +func (i *invertedIndexOpaque) realloc() { + var pidNext int + + var totTFs int + var totLocs int + i.FieldsMap = map[string]uint16{} + + i.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range i.results { + result.VisitComposite(func(field index.CompositeField) { + i.getOrDefineField(field.Name()) + }) + result.VisitFields(func(field index.Field) { + i.getOrDefineField(field.Name()) + }) + } + + sort.Strings(i.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range i.FieldsInv { + i.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + visitField := func(field index.Field) { + fieldID := uint16(i.getOrDefineField(field.Name())) + + dict := i.Dicts[fieldID] + dictKeys := i.DictKeys[fieldID] + + tfs := field.AnalyzedTokenFrequencies() + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + i.numTermsPerPostingsList = append(i.numTermsPerPostingsList, 0) + i.numLocsPerPostingsList = append(i.numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + i.numTermsPerPostingsList[pid] += 1 + i.numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + i.DictKeys[fieldID] = dictKeys + if field.Options().IncludeDocValues() { + i.IncludeDocValues[fieldID] = true + } + } + + if cap(i.IncludeDocValues) >= len(i.FieldsInv) { + i.IncludeDocValues = i.IncludeDocValues[:len(i.FieldsInv)] + } else { + i.IncludeDocValues = make([]bool, len(i.FieldsInv)) + } + + for _, result := range i.results { + // walk each composite field + result.VisitComposite(func(field index.CompositeField) { + visitField(field) + }) + + // walk each field + result.VisitFields(visitField) + } + + numPostingsLists := pidNext + + if cap(i.Postings) >= numPostingsLists { + i.Postings = i.Postings[:numPostingsLists] + } else { + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, i.Postings[:cap(i.Postings)]) + for i := 0; i < numPostingsLists; i++ { + if postings[i] == nil { + postings[i] = roaring.New() + } + } + i.Postings = postings + } + + if cap(i.FreqNorms) >= 
numPostingsLists { + i.FreqNorms = i.FreqNorms[:numPostingsLists] + } else { + i.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(i.freqNormsBacking) >= totTFs { + i.freqNormsBacking = i.freqNormsBacking[:totTFs] + } else { + i.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := i.freqNormsBacking + for pid, numTerms := range i.numTermsPerPostingsList { + i.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + if cap(i.Locs) >= numPostingsLists { + i.Locs = i.Locs[:numPostingsLists] + } else { + i.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(i.locsBacking) >= totLocs { + i.locsBacking = i.locsBacking[:totLocs] + } else { + i.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := i.locsBacking + for pid, numLocs := range i.numLocsPerPostingsList { + i.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } + + for _, dict := range i.DictKeys { + sort.Strings(dict) + } + + if cap(i.reusableFieldTFs) >= len(i.FieldsInv) { + i.reusableFieldTFs = i.reusableFieldTFs[:len(i.FieldsInv)] + } else { + i.reusableFieldTFs = make([]index.TokenFrequencies, len(i.FieldsInv)) + } + + if cap(i.reusableFieldLens) >= len(i.FieldsInv) { + i.reusableFieldLens = i.reusableFieldLens[:len(i.FieldsInv)] + } else { + i.reusableFieldLens = make([]int, len(i.FieldsInv)) + } +} + +func (i *invertedTextIndexSection) getInvertedIndexOpaque(opaque map[int]resetable) *invertedIndexOpaque { + if _, ok := opaque[SectionInvertedTextIndex]; !ok { + opaque[SectionInvertedTextIndex] = i.InitOpaque(nil) + } + return opaque[SectionInvertedTextIndex].(*invertedIndexOpaque) +} + +func (i *invertedIndexOpaque) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := i.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(i.FieldsInv) + 1) + i.FieldsMap[fieldName] = fieldIDPlus1 + i.FieldsInv = append(i.FieldsInv, fieldName) + + i.Dicts = append(i.Dicts, make(map[string]uint64)) + + n := len(i.DictKeys) + if n < cap(i.DictKeys) { + i.DictKeys = i.DictKeys[:n+1] + i.DictKeys[n] = i.DictKeys[n][:0] + } else { + i.DictKeys = append(i.DictKeys, []string(nil)) + } + } + + return int(fieldIDPlus1 - 1) +} + +func (i *invertedTextIndexSection) InitOpaque(args map[string]interface{}) resetable { + rv := &invertedIndexOpaque{ + fieldAddrs: map[int]int{}, + } + for k, v := range args { + rv.Set(k, v) + } + + return rv +} + +type invertedIndexOpaque struct { + results []index.Document + + chunkMode uint32 + + // indicates whethere the following structs are initialized + init bool + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> freq/norm's, one for each docNum in postings + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm + + // postings id -> locs, one for each freq + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings 
list id + + builder *vellum.Builder + builderBuf bytes.Buffer + + // reusable stuff for processing fields etc. + reusableFieldLens []int + reusableFieldTFs []index.TokenFrequencies + + tmp0 []byte + + fieldAddrs map[int]int + + bytesWritten uint64 + fieldsSame bool + numDocs uint64 +} + +func (io *invertedIndexOpaque) Reset() (err error) { + // cleanup stuff over here + io.results = nil + io.init = false + io.chunkMode = 0 + io.FieldsMap = nil + io.FieldsInv = nil + for i := range io.Dicts { + io.Dicts[i] = nil + } + io.Dicts = io.Dicts[:0] + for i := range io.DictKeys { + io.DictKeys[i] = io.DictKeys[i][:0] + } + io.DictKeys = io.DictKeys[:0] + for i := range io.IncludeDocValues { + io.IncludeDocValues[i] = false + } + io.IncludeDocValues = io.IncludeDocValues[:0] + for _, idn := range io.Postings { + idn.Clear() + } + io.Postings = io.Postings[:0] + io.FreqNorms = io.FreqNorms[:0] + for i := range io.freqNormsBacking { + io.freqNormsBacking[i] = interimFreqNorm{} + } + io.freqNormsBacking = io.freqNormsBacking[:0] + io.Locs = io.Locs[:0] + for i := range io.locsBacking { + io.locsBacking[i] = interimLoc{} + } + io.locsBacking = io.locsBacking[:0] + io.numTermsPerPostingsList = io.numTermsPerPostingsList[:0] + io.numLocsPerPostingsList = io.numLocsPerPostingsList[:0] + io.builderBuf.Reset() + if io.builder != nil { + err = io.builder.Reset(&io.builderBuf) + } + + io.reusableFieldLens = io.reusableFieldLens[:0] + io.reusableFieldTFs = io.reusableFieldTFs[:0] + + io.tmp0 = io.tmp0[:0] + atomic.StoreUint64(&io.bytesWritten, 0) + io.fieldsSame = false + io.numDocs = 0 + + return err +} +func (i *invertedIndexOpaque) Set(key string, val interface{}) { + switch key { + case "results": + i.results = val.([]index.Document) + case "chunkMode": + i.chunkMode = val.(uint32) + case "fieldsSame": + i.fieldsSame = val.(bool) + case "fieldsMap": + i.FieldsMap = val.(map[string]uint16) + case "numDocs": + i.numDocs = val.(uint64) + } +} diff --git a/vendor/github.com/blevesearch/zapx/v16/segment.go b/vendor/github.com/blevesearch/zapx/v16/segment.go new file mode 100644 index 0000000000..062abf2c34 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/segment.go @@ -0,0 +1,872 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
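+
+// This file implements reading of the on-disk segment format: parsing the
+// footer, loading the fields/sections index, and wiring up the docvalue
+// readers for each field.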
+ +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "os" + "sync" + "sync/atomic" + "unsafe" + + "github.com/RoaringBitmap/roaring" + mmap "github.com/blevesearch/mmap-go" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" + "github.com/golang/snappy" +) + +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) +} + +// Open returns a zap impl of a segment +func (*ZapPlugin) Open(path string) (segment.Segment, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + mm, err := mmap.Map(f, mmap.RDONLY, 0) + if err != nil { + // mmap failed, try to close the file + _ = f.Close() + return nil, err + } + + rv := &Segment{ + SegmentBase: SegmentBase{ + fieldsMap: make(map[string]uint16), + fieldFSTs: make(map[uint16]*vellum.FST), + fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)), + }, + f: f, + mm: mm, + path: path, + refs: 1, + } + rv.SegmentBase.updateSize() + + err = rv.loadConfig() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadFieldsNew() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadDvReaders() + if err != nil { + _ = rv.Close() + return nil, err + } + + return rv, nil +} + +// SegmentBase is a memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + mem []byte + memCRC uint32 + chunkMode uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsInv []string // fieldID -> fieldName + fieldsSectionsMap []map[uint16]uint64 // fieldID -> section -> address + numDocs uint64 + storedIndexOffset uint64 + fieldsIndexOffset uint64 + sectionsIndexOffset uint64 + docValueOffset uint64 + dictLocs []uint64 + fieldDvReaders []map[uint16]*docValueReader // naive chunk cache per field; section->field->reader + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 + + // atomic access to these variables + bytesRead uint64 + bytesWritten uint64 + + m sync.Mutex + fieldFSTs map[uint16]*vellum.FST +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k := range sb.fieldsMap { + sizeInBytes += (len(k) + SizeOfString) + SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + SizeOfString + } + sizeInBytes += len(sb.dictLocs) * SizeOfUint64 + + // fieldDvReaders + for _, secDvReaders := range sb.fieldDvReaders { + for _, v := range secDvReaders { + sizeInBytes += SizeOfUint16 + SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + } + + sb.size = uint64(sizeInBytes) +} + +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { return nil } + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase + + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 + + m sync.Mutex // Protects the fields that follow. 
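+	// refs is guarded by m above; the segment file is unmapped and closed
+	// once refs drops to zero (see DecRef / closeActual)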
+ refs int64 +} + +func (s *Segment) Size() int { + // 8 /* size of file pointer */ + // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + SizeOfString) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err +} + +func (s *Segment) loadConfig() error { + crcOffset := len(s.mm) - 4 + s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + + verOffset := crcOffset - 4 + s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) + if Version < IndexSectionsVersion && s.version != Version { + return fmt.Errorf("unsupported version %d != %d", s.version, Version) + } + + chunkOffset := verOffset - 4 + s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) + + docValueOffset := chunkOffset - 8 + s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) + + fieldsIndexOffset := docValueOffset - 8 + + // determining the right footer size based on version, this becomes important + // while loading the fields portion or the sections portion of the index file. + var footerSize int + if s.version >= IndexSectionsVersion { + // for version 16 and above, parse the sectionsIndexOffset + s.sectionsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) + fieldsIndexOffset = fieldsIndexOffset - 8 + footerSize = FooterSize + } else { + footerSize = FooterSize - 8 + } + + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) + + storedIndexOffset := fieldsIndexOffset - 8 + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) + + numDocsOffset := storedIndexOffset - 8 + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) + + // 8*4 + 4*3 = 44 bytes being accounted from all the offsets + // above being read from the file + s.incrementBytesRead(uint64(footerSize)) + s.SegmentBase.mem = s.mm[:len(s.mm)-footerSize] + return nil +} + +// Implements the segment.DiskStatsReporter interface +// Only the persistedSegment type implments the +// interface, as the intention is to retrieve the bytes +// read from the on-disk segment as part of the current +// query. 
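+// ResetBytesRead zeroes the counter, presumably at the start of a query, so
+// that BytesRead reflects only the current query's disk reads.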
+func (s *Segment) ResetBytesRead(val uint64) { + atomic.StoreUint64(&s.SegmentBase.bytesRead, val) +} + +func (s *Segment) BytesRead() uint64 { + return atomic.LoadUint64(&s.bytesRead) +} + +func (s *Segment) BytesWritten() uint64 { + return 0 +} + +func (s *Segment) incrementBytesRead(val uint64) { + atomic.AddUint64(&s.bytesRead, val) +} + +func (s *SegmentBase) BytesWritten() uint64 { + return atomic.LoadUint64(&s.bytesWritten) +} + +func (s *SegmentBase) setBytesWritten(val uint64) { + atomic.AddUint64(&s.bytesWritten, val) +} + +func (s *SegmentBase) BytesRead() uint64 { + return 0 +} + +func (s *SegmentBase) ResetBytesRead(val uint64) {} + +func (s *SegmentBase) incrementBytesRead(val uint64) { + atomic.AddUint64(&s.bytesRead, val) +} + +func (s *SegmentBase) loadFields() error { + // NOTE for now we assume the fields index immediately precedes + // the footer, and if this changes, need to adjust accordingly (or + // store explicit length), where s.mem was sliced from s.mm in Open(). + fieldsIndexEnd := uint64(len(s.mem)) + + // iterate through fields index + var fieldID uint64 + for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) + + // accounting the address of the dictLoc being read from file + s.incrementBytesRead(8) + + dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) + n := uint64(read) + s.dictLocs = append(s.dictLocs, dictLoc) + + var nameLen uint64 + nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(s.mem[addr+n : addr+n+nameLen]) + + s.incrementBytesRead(n + nameLen) + s.fieldsInv = append(s.fieldsInv, name) + s.fieldsMap[name] = uint16(fieldID + 1) + + fieldID++ + } + return nil +} + +func (s *SegmentBase) loadFieldsNew() error { + pos := s.sectionsIndexOffset + + if pos == 0 { + // this is the case only for older file formats + return s.loadFields() + } + + // read the number of fields + numFields, sz := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + s.incrementBytesRead(uint64(sz)) + + var fieldID uint64 + + for fieldID < numFields { + addr := binary.BigEndian.Uint64(s.mem[pos : pos+8]) + s.incrementBytesRead(8) + + fieldSectionMap := make(map[uint16]uint64) + + err := s.loadFieldNew(uint16(fieldID), addr, fieldSectionMap) + if err != nil { + return err + } + + s.fieldsSectionsMap = append(s.fieldsSectionsMap, fieldSectionMap) + + fieldID++ + pos += 8 + } + + return nil +} + +func (s *SegmentBase) loadFieldNew(fieldID uint16, pos uint64, + fieldSectionMap map[uint16]uint64) error { + if pos == 0 { + // there is no indexing structure present for this field/section + return nil + } + + fieldStartPos := pos // to track the number of bytes read + fieldNameLen, sz := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + + fieldName := string(s.mem[pos : pos+fieldNameLen]) + pos += fieldNameLen + + s.fieldsInv = append(s.fieldsInv, fieldName) + s.fieldsMap[fieldName] = uint16(fieldID + 1) + + fieldNumSections, sz := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + + for sectionIdx := uint64(0); sectionIdx < fieldNumSections; sectionIdx++ { + // read section id + fieldSectionType := binary.BigEndian.Uint16(s.mem[pos : pos+2]) + pos += 2 + fieldSectionAddr := binary.BigEndian.Uint64(s.mem[pos : pos+8]) + pos += 8 + fieldSectionMap[fieldSectionType] = fieldSectionAddr + if fieldSectionType == SectionInvertedTextIndex { + // 
for the fields which don't have the inverted index, the offset is + // 0 and during query time, because there is no valid dictionary we + // will just have follow a no-op path. + if fieldSectionAddr == 0 { + s.dictLocs = append(s.dictLocs, 0) + continue + } + + read := 0 + // skip the doc values + _, n := binary.Uvarint(s.mem[fieldSectionAddr : fieldSectionAddr+binary.MaxVarintLen64]) + fieldSectionAddr += uint64(n) + read += n + _, n = binary.Uvarint(s.mem[fieldSectionAddr : fieldSectionAddr+binary.MaxVarintLen64]) + fieldSectionAddr += uint64(n) + read += n + dictLoc, n := binary.Uvarint(s.mem[fieldSectionAddr : fieldSectionAddr+binary.MaxVarintLen64]) + // account the bytes read while parsing the field's inverted index section + s.incrementBytesRead(uint64(read + n)) + s.dictLocs = append(s.dictLocs, dictLoc) + } + } + + // account the bytes read while parsing the sections field index. + s.incrementBytesRead((pos - uint64(fieldStartPos)) + fieldNameLen) + return nil +} + +// Dictionary returns the term dictionary for the specified field +func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { + dict, err := s.dictionary(field) + if err == nil && dict == nil { + return emptyDictionary, nil + } + return dict, err +} + +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } + + dictStart := sb.dictLocs[rv.fieldID] + if dictStart > 0 { + var ok bool + sb.m.Lock() + if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + if vellumLen == 0 { + sb.m.Unlock() + return nil, fmt.Errorf("empty dictionary for field: %v", field) + } + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] + rv.incrementBytesRead(uint64(read) + vellumLen) + rv.fst, err = vellum.Load(fstBytes) + if err != nil { + sb.m.Unlock() + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + + sb.fieldFSTs[rv.fieldID] = rv.fst + } + + sb.m.Unlock() + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } + } + } + + return rv, nil +} + +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + +// VisitStoredFields invokes the StoredFieldValueVisitor for each stored field +// for the specified doc number +func (s *SegmentBase) VisitStoredFields(num uint64, visitor segment.StoredFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitStoredFields(vdc, num, visitor) +} + +func (s *SegmentBase) visitStoredFields(vdc *visitDocumentCtx, num uint64, + visitor segment.StoredFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < s.numDocs { + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + 
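+		// the _id field's value is stored uncompressed ahead of the snappy
+		// block, so it can be handed to the visitor without decompressing
+		// anything ('t' marks it as a text field)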
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
+		if !keepGoing {
+			// vdc is returned to the pool by the deferred Put in
+			// VisitStoredFields; putting it back here as well would hand the
+			// same context out to two callers
+			return nil
+		}
+
+		// handle non-"_id" fields
+		compressed = compressed[idFieldValLen:]
+
+		uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
+		if err != nil {
+			return err
+		}
+
+		for keepGoing {
+			field, err := binary.ReadUvarint(&vdc.reader)
+			if err == io.EOF {
+				break
+			}
+			if err != nil {
+				return err
+			}
+			typ, err := binary.ReadUvarint(&vdc.reader)
+			if err != nil {
+				return err
+			}
+			offset, err := binary.ReadUvarint(&vdc.reader)
+			if err != nil {
+				return err
+			}
+			l, err := binary.ReadUvarint(&vdc.reader)
+			if err != nil {
+				return err
+			}
+			numap, err := binary.ReadUvarint(&vdc.reader)
+			if err != nil {
+				return err
+			}
+			var arrayPos []uint64
+			if numap > 0 {
+				if cap(vdc.arrayPos) < int(numap) {
+					vdc.arrayPos = make([]uint64, numap)
+				}
+				arrayPos = vdc.arrayPos[:numap]
+				for i := 0; i < int(numap); i++ {
+					ap, err := binary.ReadUvarint(&vdc.reader)
+					if err != nil {
+						return err
+					}
+					arrayPos[i] = ap
+				}
+			}
+			value := uncompressed[offset : offset+l]
+			keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
+		}
+
+		vdc.buf = uncompressed
+	}
+	return nil
+}
+
+// DocID returns the value of the _id field for the given docNum
+func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
+	if num >= s.numDocs {
+		return nil, nil
+	}
+
+	vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
+
+	meta, compressed := s.getDocStoredMetaAndCompressed(num)
+
+	vdc.reader.Reset(meta)
+
+	// handle _id field special case
+	idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
+	if err != nil {
+		return nil, err
+	}
+	idFieldVal := compressed[:idFieldValLen]
+
+	visitDocumentCtxPool.Put(vdc)
+
+	return idFieldVal, nil
+}
+
+// Count returns the number of documents in this segment.
+func (s *SegmentBase) Count() uint64 { + return s.numDocs +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(s.fieldsMap) > 0 { + idDict, err := s.dictionary("_id") + if err != nil { + return nil, err + } + + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + for _, id := range ids { + if id <= sMaxStr { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } + } + } + + return rv, nil +} + +// Fields returns the field names used in this segment +func (s *SegmentBase) Fields() []string { + return s.fieldsInv +} + +// Path returns the path of this segment on disk +func (s *Segment) Path() string { + return s.path +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { + if s.mm != nil { + err = s.mm.Unmap() + } + // try to close file even if unmap failed + if s.f != nil { + err2 := s.f.Close() + if err == nil { + // try to return first error + err = err2 + } + } + return +} + +// some helpers i started adding for the command-line utility + +// Data returns the underlying mmaped data slice +func (s *Segment) Data() []byte { + return s.mm +} + +// CRC returns the CRC value stored in the file footer +func (s *Segment) CRC() uint32 { + return s.crc +} + +// Version returns the file version in the file footer +func (s *Segment) Version() uint32 { + return s.version +} + +// ChunkFactor returns the chunk factor in the file footer +func (s *Segment) ChunkMode() uint32 { + return s.chunkMode +} + +// FieldsIndexOffset returns the fields index offset in the file footer +func (s *Segment) FieldsIndexOffset() uint64 { + return s.fieldsIndexOffset +} + +// StoredIndexOffset returns the stored value index offset in the file footer +func (s *Segment) StoredIndexOffset() uint64 { + return s.storedIndexOffset +} + +// DocValueOffset returns the docValue offset in the file footer +func (s *Segment) DocValueOffset() uint64 { + return s.docValueOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. 
+func (s *Segment) DictAddr(field string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + + return s.dictLocs[fieldIDPlus1-1], nil +} + +func (s *Segment) getSectionDvOffsets(fieldID int, secID uint16) (uint64, uint64, uint64, error) { + // Version is gonna be 16 + var fieldLocStart uint64 = fieldNotUninverted + fieldLocEnd := fieldLocStart + sectionMap := s.fieldsSectionsMap[fieldID] + fieldAddrStart := sectionMap[secID] + n := 0 + + if fieldAddrStart > 0 { + // fixed encoding as of now, need to uvarint this + var read uint64 + fieldLocStart, n = binary.Uvarint(s.mem[fieldAddrStart+read : fieldAddrStart+read+binary.MaxVarintLen64]) + if n <= 0 { + return 0, 0, 0, fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) + } + read += uint64(n) + + fieldLocEnd, n = binary.Uvarint(s.mem[fieldAddrStart+read : fieldAddrStart+read+binary.MaxVarintLen64]) + if n <= 0 { + return 0, 0, 0, fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + s.incrementBytesRead(read) + } + + return fieldLocStart, fieldLocEnd, 0, nil +} + +func (s *Segment) loadDvReader(fieldID int, secID uint16) error { + start, end, _, err := s.getSectionDvOffsets(fieldID, secID) + if err != nil { + return err + } + + fieldDvReader, err := s.loadFieldDocValueReader(s.fieldsInv[fieldID], start, end) + if err != nil { + return err + } + + if fieldDvReader != nil { + if s.fieldDvReaders[secID] == nil { + s.fieldDvReaders[secID] = make(map[uint16]*docValueReader) + } + // fix the structure of fieldDvReaders + // currently it populates the inverted index doc values + s.fieldDvReaders[secID][uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, s.fieldsInv[fieldID]) + } + return nil +} + +func (s *Segment) loadDvReadersLegacy() error { + // older file formats to parse the docValueIndex and if that says doc values + // aren't there in this segment file, just return nil + if s.docValueOffset == fieldNotUninverted { + return nil + } + + for fieldID := range s.fieldsInv { + var read uint64 + start, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) + } + read += uint64(n) + end, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + s.incrementBytesRead(read) + + fieldDvReader, err := s.loadFieldDocValueReader(s.fieldsInv[fieldID], start, end) + if err != nil { + return err + } + + if fieldDvReader != nil { + // older file formats have docValues corresponding only to inverted index + // ignore the rest. + if s.fieldDvReaders[SectionInvertedTextIndex] == nil { + s.fieldDvReaders[SectionInvertedTextIndex] = make(map[uint16]*docValueReader) + } + // fix the structure of fieldDvReaders + // currently it populates the inverted index doc values + s.fieldDvReaders[SectionInvertedTextIndex][uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, s.fieldsInv[fieldID]) + } + } + + return nil +} + +// Segment is a file segment, and loading the dv readers from that segment +// must account for the version while loading since the formats are different +// in the older and the Version version. 
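+// Pre-v16 formats keep a single docValueOffset in the footer (handled by
+// loadDvReadersLegacy above), whereas v16 records per-field, per-section
+// docvalue offsets that are parsed via getSectionDvOffsets.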
+func (s *Segment) loadDvReaders() error {
+	if s.numDocs == 0 {
+		return nil
+	}
+
+	if s.version < IndexSectionsVersion {
+		return s.loadDvReadersLegacy()
+	}
+
+	// for every section of every field, load the doc values and register
+	// the readers.
+	for fieldID := range s.fieldsInv {
+		for secID := range segmentSections {
+			err := s.loadDvReader(fieldID, secID)
+			if err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// since SegmentBase is an in-memory segment, this is only called for v16
+// file formats as part of InitSegmentBase() while introducing a segment
+// into the system.
+func (s *SegmentBase) loadDvReaders() error {
+	// TODO: evaluate also checking s.docValueOffset == fieldNotUninverted
+	if s.numDocs == 0 {
+		return nil
+	}
+
+	for fieldID, sections := range s.fieldsSectionsMap {
+		for secID, secOffset := range sections {
+			if secOffset > 0 {
+				// fixed encoding as of now, need to uvarint this
+				pos := secOffset
+				var read uint64
+				fieldLocStart, n := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64])
+				if n <= 0 {
+					return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %v", s.fieldsInv[fieldID])
+				}
+				pos += uint64(n)
+				read += uint64(n)
+				fieldLocEnd, n := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64])
+				if n <= 0 {
+					return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %v", s.fieldsInv[fieldID])
+				}
+				pos += uint64(n)
+				read += uint64(n)
+
+				s.incrementBytesRead(read)
+
+				dataLoc, n := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64])
+				if n <= 0 {
+					return fmt.Errorf("loadDvReaders: failed to read the dataLoc "+
+						"offset for sectionID %v field %v", secID, s.fieldsInv[fieldID])
+				}
+				if secID == SectionInvertedTextIndex {
+					s.dictLocs = append(s.dictLocs, dataLoc)
+					s.incrementBytesRead(uint64(n))
+				}
+				fieldDvReader, err := s.loadFieldDocValueReader(s.fieldsInv[fieldID], fieldLocStart, fieldLocEnd)
+				if err != nil {
+					return err
+				}
+				if fieldDvReader != nil {
+					if s.fieldDvReaders[secID] == nil {
+						s.fieldDvReaders[secID] = make(map[uint16]*docValueReader)
+					}
+					s.fieldDvReaders[secID][uint16(fieldID)] = fieldDvReader
+					s.fieldDvNames = append(s.fieldDvNames, s.fieldsInv[fieldID])
+				}
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/sizes.go b/vendor/github.com/blevesearch/zapx/v16/sizes.go
new file mode 100644
index 0000000000..34166ea330
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/sizes.go
@@ -0,0 +1,59 @@
+// Copyright (c) 2020 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package zap + +import ( + "reflect" +) + +func init() { + var b bool + SizeOfBool = int(reflect.TypeOf(b).Size()) + var f32 float32 + SizeOfFloat32 = int(reflect.TypeOf(f32).Size()) + var f64 float64 + SizeOfFloat64 = int(reflect.TypeOf(f64).Size()) + var i int + SizeOfInt = int(reflect.TypeOf(i).Size()) + var m map[int]int + SizeOfMap = int(reflect.TypeOf(m).Size()) + var ptr *int + SizeOfPtr = int(reflect.TypeOf(ptr).Size()) + var slice []int + SizeOfSlice = int(reflect.TypeOf(slice).Size()) + var str string + SizeOfString = int(reflect.TypeOf(str).Size()) + var u8 uint8 + SizeOfUint8 = int(reflect.TypeOf(u8).Size()) + var u16 uint16 + SizeOfUint16 = int(reflect.TypeOf(u16).Size()) + var u32 uint32 + SizeOfUint32 = int(reflect.TypeOf(u32).Size()) + var u64 uint64 + SizeOfUint64 = int(reflect.TypeOf(u64).Size()) +} + +var SizeOfBool int +var SizeOfFloat32 int +var SizeOfFloat64 int +var SizeOfInt int +var SizeOfMap int +var SizeOfPtr int +var SizeOfSlice int +var SizeOfString int +var SizeOfUint8 int +var SizeOfUint16 int +var SizeOfUint32 int +var SizeOfUint64 int diff --git a/vendor/github.com/blevesearch/zapx/v16/write.go b/vendor/github.com/blevesearch/zapx/v16/write.go new file mode 100644 index 0000000000..1906a9bdbd --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/write.go @@ -0,0 +1,173 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "io" + + "github.com/RoaringBitmap/roaring" +) + +// writes out the length of the roaring bitmap in bytes as varint +// then writes out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() + if err != nil { + return 0, err + } + + var tw int + + // write out the length + n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) + nw, err := w.Write(reuseBufVarint[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the roaring bytes + nw, err = w.Write(buf) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistFieldsSection(fieldsInv []string, w *CountHashWriter, dictLocs []uint64, opaque map[int]resetable) (uint64, error) { + var rv uint64 + fieldsOffsets := make([]uint64, 0, len(fieldsInv)) + + for fieldID, fieldName := range fieldsInv { + // record start of this field + fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) + + // write field name length + _, err := writeUvarints(w, uint64(len(fieldName))) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + + // write out the number of field-specific indexes + // FIXME hard-coding to 2, and not attempting to support sparseness well + _, err = writeUvarints(w, uint64(len(segmentSections))) + if err != nil { + return 0, err + } + + // now write pairs of index section ids, and start addresses for each field + // which has a specific section's data. 
this serves as the starting point + // using which a field's section data can be read and parsed. + for segmentSectionType, segmentSectionImpl := range segmentSections { + binary.Write(w, binary.BigEndian, segmentSectionType) + binary.Write(w, binary.BigEndian, uint64(segmentSectionImpl.AddrForField(opaque, fieldID))) + } + } + + rv = uint64(w.Count()) + // write out number of fields + _, err := writeUvarints(w, uint64(len(fieldsInv))) + if err != nil { + return 0, err + } + // now write out the fields index + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +// FooterSize is the size of the footer record in bytes +// crc + ver + chunk + docValueOffset + sectionsIndexOffset + field offset + stored offset + num docs +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 + 8 + +// in the index sections format, the fieldsIndexOffset points to the sectionsIndexOffset +func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, sectionsIndexOffset, docValueOffset uint64, + chunkMode uint32, crcBeforeFooter uint32, writerIn io.Writer) error { + w := NewCountHashWriter(writerIn) + w.crc = crcBeforeFooter + + // write out the number of docs + err := binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return err + } + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + // write out the field index location + err = binary.Write(w, binary.BigEndian, fieldsIndexOffset) + if err != nil { + return err + } + + // write out the new field index location (to be removed later, as this can eventually replace the old) + err = binary.Write(w, binary.BigEndian, sectionsIndexOffset) + if err != nil { + return err + } + + // write out the fieldDocValue location + err = binary.Write(w, binary.BigEndian, docValueOffset) + if err != nil { + return err + } + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkMode) + if err != nil { + return err + } + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, Version) + if err != nil { + return err + } + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.crc) + if err != nil { + return err + } + return nil +} + +func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { + buf := make([]byte, binary.MaxVarintLen64) + for _, val := range vals { + n := binary.PutUvarint(buf, val) + var nw int + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + return tw, err +} diff --git a/vendor/github.com/blevesearch/zapx/v16/zap.md b/vendor/github.com/blevesearch/zapx/v16/zap.md new file mode 100644 index 0000000000..675ac56c0b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/zap.md @@ -0,0 +1,192 @@ +# ZAP File Format + +## Legend + +### File Sections + + |========| + | | file section + |========| + +### Fixed-size fields + + |--------| |----| |--| |-| + | | uint64 | | uint32 | | uint16 | | uint8 + |--------| |----| |--| |-| + +### Varints + + |~~~~~~~~| + | | varint(up to uint64) + |~~~~~~~~| + +### Arbitrary-length fields + + |--------...---| + | | arbitrary-length field (string, vellum, roaring bitmap) + |--------...---| + +### Chunked data + + [--------] + [ ] + [--------] + +## Overview + +Footer section describes the configuration of particular ZAP file. 
The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. + + +==================================================+ + | Stored Fields | + |==================================================| + +-----> | Stored Fields Index | + | |==================================================| + | | Inverted Text Index Section | + | |==================================================| + | | Vector Index Section | + | |==================================================| + | | Sections Info | + | |==================================================| + | +-> | Sections Index | + | | |========+========+====+=====+======+====+====+====| + | | | D# | SF | F | S | FDV | CF | V | CC | (Footer) + | | +========+====+===+====+==+==+======+====+====+====+ + | | | | + +---------------------+ | + |-----------------------------+ + + + D#. Number of Docs. + SF. Stored Fields Index Offset. + F. Field Index Offset. + S. Sections Index Offset + FDV. Field DocValue Offset. + CF. Chunk Factor. + V. Version. + CC. CRC32. + + +## Stored Fields + +Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. + + 0 [SF] [SF + D# * 8] + | Stored Fields | Stored Fields Index | + |================================|==================================| + | | | + | |--------------------| ||--------|--------|. . .|--------|| + | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 || + | | |--------------------| ||--------|----|---|. . .|--------|| + | | | | | + |===|============================|==============|===================| + | | + |-------------------------------------------| + +Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. + + Stored Fields Data + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + | MDS | CDS | MD | CD | + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + + MDS. Metadata size. + CDS. Compressed data size. + MD. Metadata. + CD. Snappy-compressed data. + +## Index Sections + +Sections Index is a set of NF uint64 addresses (0 through F# - 1) each of which are offsets to the records in the Sections Info. Inside the sections info, we have further offsets to specific type of index section for that particular field in the segment file. For example, field 0 may correspond to Vector Indexing and its records would have offsets to the Vector Index Section whereas a field 1 may correspond to Text Indexing and its records would rather point to somewhere within the Inverted Text Index Section. + + (...) [F] [F + F#] + + Sections Info + Sections Index + + |============================================================================|=====================================| + | | | + | +---------+---------+-----+---------+---------+~~~~~~~~+~~~~~~~~+--+...+-+ | +-------+--------+...+------+-----+ | + +----> S1 Addr | S1 Type | ... | Sn Addr | Sn Type | NS | Length | Name | | | 0 | 1 | | F#-1 | NF | | + | | +---------+---------+-----+---------+---------+~~~~~~~~+~~~~~~~~+--+...+-+ | +-------+----+---+...+------+-----+ | + | | | | | + | +============================================================================+==============|======================+ + | | + +----------------------------------------------------------------------------------------------+ + + NF. Number of fields + NS. Number of index sections + Sn. 
## Inverted Text Index Section

Each field has its own type of index in a separate section, as indicated above; this can be a vector index or an inverted text index.

In the case of an inverted text index, the dictionary is encoded in [Vellum](https://github.com/couchbase/vellum) format. The dictionary consists of `(term, offset)` pairs, where `offset` is the position of the postings (the list of documents) for that particular term.

    +================================================================+- Inverted Text
    |                                                                |  Index Section
    |    Freq/Norm (chunked)                                         |
    |    [~~~~~~+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~]                      |
    | +->[ Freq | Norm (float32 under varint) ]                      |
    | |  [~~~~~~+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~]                      |
    | |                                                              |
    | +--------------------------------------------------------+    |
    |    Location Details (chunked)                             |    |
    |    [~~~~~~+~~~~~+~~~~~~~+~~~~~+~~~~~~+~~~~~~~~+~~~~~]     |    |
    | +->[ Size | Pos | Start | End | Arr# | ArrPos | ... ]     |    |
    | |  [~~~~~~+~~~~~+~~~~~~~+~~~~~+~~~~~~+~~~~~~~~+~~~~~]     |    |
    | |                                                         |    |
    | +----------------------+                                  |    |
    |    Postings List       |                                  |    |
    |    +~~~~~~~~+~~~~~+~~+~~~~~~~~+----------+...+-+          |    |
    | +->+  F/N   | LD  |  Length   | ROARING BITMAP |          |    |
    | |  +~~~~~+~~|~~~~~~~~|~~~~~~~~+----------+...+-+          |    |
    | |        +-----------------------------------------------+|    |
    | +-------------------------------------------------+      ||    |
    |                                                   |      ||    |
    |    Dictionary                                     |      ||    |
    |    +~~~~~~~~~~+~~~~~~~+~~~~~~~~+--------------------------+-...-+
    +--->+ DV Start | DV End| Length | VELLUM DATA : (TERM -> OFFSET) |
         +~~~~~~~~~~+~~~~~~~+~~~~~~~~+--------------------------+-...-+
    |                                                                |
    |================================================================+- Vector Index Section
    |                                                                |
    |================================================================+- Sections Info
    |   +-----------------------------+                              |
    |   |                             |                              |
    |   | +-------+-----+-----+------+~~~~~~~~+~~~~~~~~+--+...+--+   |
    |   +-+  ...  | ITI | ITI ADDR   |   NS   | Length |  Name   |   |
    |     +-------+-----+------------+~~~~~~~~+~~~~~~~~+--+...+--+   |
    +================================================================+

    ITI - Inverted Text Index

## Doc Values

DocValue start and end offsets are stored within each field's section content. Because every field has its own type of index, each field can choose whether or not to store doc values. For example, it may not make sense to store doc values for a vector index, so those offsets can be invalid, whereas fields with a text index may carry valid doc value offsets.

    +================================================================+
    |                 +------...--+                                  |
    |              +->+ DocValues +<-+                               |
    |              |  +------...--+  |                               |
    |==============|=================|===============================+- Inverted Text
    | +~+~~~~~~~~~~+~+~~~~~~~+~~~~~~~+~~+-----------------------...-+|  Index Section
    | | DV START    | DV END | LENGTH   | VELLUM DATA: TERM -> OFFSET|
    | +~~~~~~~~~~~~~+~~~~~~~~+~~~~~~~~~~+-----------------------...-+|
    +================================================================+

DocValues are chunked, Snappy-compressed values for each document and field.

    [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-]
    [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ]
    [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-]

The last 16 bytes describe the chunks:

    |~~~~~~~~~~~~...~|----------------|----------------|
    |  Chunk Sizes   | Chunk Size Arr |     Chunk#     |
    |~~~~~~~~~~~~...~|----------------|----------------|
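Tying the dictionary encoding together: a minimal sketch of loading a field's Vellum data and looking up one term's postings offset, assuming `dictData` has already been sliced out of the segment using the `Length` prefix above (the helper name is illustrative):

    // Sketch: term -> postings offset via the Vellum-encoded dictionary.
    package main

    import (
        "fmt"

        "github.com/couchbase/vellum"
    )

    func postingsOffset(dictData []byte, term string) (uint64, error) {
        fst, err := vellum.Load(dictData) // dictData: the raw FST bytes
        if err != nil {
            return 0, err
        }
        offset, exists, err := fst.Get([]byte(term))
        if err != nil {
            return 0, err
        }
        if !exists {
            return 0, fmt.Errorf("term %q not in dictionary", term)
        }
        return offset, nil
    }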
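Similarly, a sketch of walking the doc-values trailer just described: read the chunk-sizes array length and chunk count from the last 16 bytes, then decode the varint chunk sizes packed before them. This illustrates the layout only; the field ordering in the trailer is an assumption of this sketch, and it is not zapx's actual doc-value reader:

    // Sketch: decode the 16-byte doc-values trailer plus varint chunk sizes.
    package main

    import "encoding/binary"

    func docValueChunks(section []byte) (chunkSizes []uint64) {
        end := len(section)
        // Trailer (assumed order): chunk-sizes array length, then chunk count.
        arrLen := binary.BigEndian.Uint64(section[end-16 : end-8])
        numChunks := binary.BigEndian.Uint64(section[end-8 : end])

        // The chunk sizes are varints immediately preceding the trailer.
        p := section[end-16-int(arrLen) : end-16]
        for i := uint64(0); i < numChunks; i++ {
            size, n := binary.Uvarint(p)
            chunkSizes = append(chunkSizes, size)
            p = p[n:]
        }
        return chunkSizes
    }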
diff --git a/vendor/modules.txt b/vendor/modules.txt
index a28f6bc36f..a31353913f 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -68,6 +68,7 @@ github.com/ProtonMail/go-crypto/openpgp/s2k
 ## explicit; go 1.14
 github.com/RoaringBitmap/roaring
 github.com/RoaringBitmap/roaring/internal
+github.com/RoaringBitmap/roaring/roaring64
 # github.com/agnivade/levenshtein v1.1.1
 ## explicit; go 1.13
 github.com/agnivade/levenshtein
@@ -159,8 +160,8 @@ github.com/bitly/go-simplejson
 # github.com/bits-and-blooms/bitset v1.2.1
 ## explicit; go 1.14
 github.com/bits-and-blooms/bitset
-# github.com/blevesearch/bleve/v2 v2.3.10
-## explicit; go 1.19
+# github.com/blevesearch/bleve/v2 v2.4.0
+## explicit; go 1.20
 github.com/blevesearch/bleve/v2
 github.com/blevesearch/bleve/v2/analysis
 github.com/blevesearch/bleve/v2/analysis/analyzer/custom
@@ -197,13 +198,16 @@ github.com/blevesearch/bleve/v2/search/scorer
 github.com/blevesearch/bleve/v2/search/searcher
 github.com/blevesearch/bleve/v2/size
 github.com/blevesearch/bleve/v2/util
-# github.com/blevesearch/bleve_index_api v1.0.6
-## explicit; go 1.18
+# github.com/blevesearch/bleve_index_api v1.1.6
+## explicit; go 1.20
 github.com/blevesearch/bleve_index_api
-# github.com/blevesearch/geo v0.1.18
+# github.com/blevesearch/geo v0.1.20
 ## explicit; go 1.18
 github.com/blevesearch/geo/geojson
 github.com/blevesearch/geo/s2
+# github.com/blevesearch/go-faiss v1.0.13
+## explicit; go 1.19
+github.com/blevesearch/go-faiss
 # github.com/blevesearch/go-porterstemmer v1.0.3
 ## explicit; go 1.13
 github.com/blevesearch/go-porterstemmer
@@ -213,8 +217,8 @@ github.com/blevesearch/gtreap
 # github.com/blevesearch/mmap-go v1.0.4
 ## explicit; go 1.13
 github.com/blevesearch/mmap-go
-# github.com/blevesearch/scorch_segment_api/v2 v2.1.6
-## explicit; go 1.19
+# github.com/blevesearch/scorch_segment_api/v2 v2.2.9
+## explicit; go 1.20
 github.com/blevesearch/scorch_segment_api/v2
 # github.com/blevesearch/segment v0.9.1
 ## explicit; go 1.18
@@ -247,6 +251,9 @@ github.com/blevesearch/zapx/v14
 # github.com/blevesearch/zapx/v15 v15.3.13
 ## explicit; go 1.19
 github.com/blevesearch/zapx/v15
+# github.com/blevesearch/zapx/v16 v16.0.12
+## explicit; go 1.20
+github.com/blevesearch/zapx/v16
 # github.com/bluele/gcache v0.0.2
 ## explicit; go 1.15
 github.com/bluele/gcache