diff --git a/go.mod b/go.mod
index 75c32790b..84b95b60a 100644
--- a/go.mod
+++ b/go.mod
@@ -11,7 +11,7 @@ require (
github.com/Nerzal/gocloak/v13 v13.9.0
github.com/bbalet/stopwords v1.0.0
github.com/beevik/etree v1.4.0
- github.com/blevesearch/bleve/v2 v2.4.0
+ github.com/blevesearch/bleve/v2 v2.4.2
github.com/cenkalti/backoff v2.2.1+incompatible
github.com/coreos/go-oidc/v3 v3.10.0
github.com/cs3org/go-cs3apis v0.0.0-20240724121416-062c4e3046cb
@@ -126,7 +126,7 @@ require (
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/OneOfOne/xxhash v1.2.8 // indirect
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 // indirect
- github.com/RoaringBitmap/roaring v1.2.3 // indirect
+ github.com/RoaringBitmap/roaring v1.9.3 // indirect
github.com/agnivade/levenshtein v1.1.1 // indirect
github.com/ajg/form v1.5.1 // indirect
github.com/alexedwards/argon2id v1.0.0 // indirect
@@ -137,14 +137,14 @@ require (
github.com/aws/aws-sdk-go v1.45.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bitly/go-simplejson v0.5.0 // indirect
- github.com/bits-and-blooms/bitset v1.2.1 // indirect
- github.com/blevesearch/bleve_index_api v1.1.6 // indirect
+ github.com/bits-and-blooms/bitset v1.12.0 // indirect
+ github.com/blevesearch/bleve_index_api v1.1.10 // indirect
github.com/blevesearch/geo v0.1.20 // indirect
- github.com/blevesearch/go-faiss v1.0.13 // indirect
+ github.com/blevesearch/go-faiss v1.0.20 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
github.com/blevesearch/gtreap v0.1.1 // indirect
github.com/blevesearch/mmap-go v1.0.4 // indirect
- github.com/blevesearch/scorch_segment_api/v2 v2.2.9 // indirect
+ github.com/blevesearch/scorch_segment_api/v2 v2.2.15 // indirect
github.com/blevesearch/segment v0.9.1 // indirect
github.com/blevesearch/snowballstem v0.9.0 // indirect
github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect
@@ -154,7 +154,7 @@ require (
github.com/blevesearch/zapx/v13 v13.3.10 // indirect
github.com/blevesearch/zapx/v14 v14.3.10 // indirect
github.com/blevesearch/zapx/v15 v15.3.13 // indirect
- github.com/blevesearch/zapx/v16 v16.0.12 // indirect
+ github.com/blevesearch/zapx/v16 v16.1.5 // indirect
github.com/bluele/gcache v0.0.2 // indirect
github.com/bmizerany/pat v0.0.0-20210406213842-e4b6760bdd6f // indirect
github.com/bombsimon/logrusr/v3 v3.1.0 // indirect
diff --git a/go.sum b/go.sum
index 99591abcb..6f039c96d 100644
--- a/go.sum
+++ b/go.sum
@@ -822,8 +822,8 @@ github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdII
github.com/OpenDNS/vegadns2client v0.0.0-20180418235048-a3fa4a771d87/go.mod h1:iGLljf5n9GjT6kc0HBvyI1nOKnGQbNB66VzSNbK5iks=
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 h1:kkhsdkhsCvIsutKu5zLMgWtgh9YxGCNAw8Ad8hjwfYg=
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0=
-github.com/RoaringBitmap/roaring v1.2.3 h1:yqreLINqIrX22ErkKI0vY47/ivtJr6n+kMhVOVmhWBY=
-github.com/RoaringBitmap/roaring v1.2.3/go.mod h1:plvDsJQpxOC5bw8LRteu/MLWHsHez/3y6cubLI4/1yE=
+github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4S2OByM=
+github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
github.com/aduffeck/gowebdav v0.0.0-20231215102054-212d4a4374f6 h1:ws0yvsikTQdmheKINP16tBzAHdttrHwbz/q3Fgl9X1Y=
@@ -893,26 +893,25 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bitly/go-simplejson v0.5.0 h1:6IH+V8/tVMab511d5bn4M7EwGXZf9Hj6i2xSwkNEM+Y=
github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA=
-github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
-github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY=
-github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
+github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
+github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84=
-github.com/blevesearch/bleve/v2 v2.4.0 h1:2xyg+Wv60CFHYccXc+moGxbL+8QKT/dZK09AewHgKsg=
-github.com/blevesearch/bleve/v2 v2.4.0/go.mod h1:IhQHoFAbHgWKYavb9rQgQEJJVMuY99cKdQ0wPpst2aY=
-github.com/blevesearch/bleve_index_api v1.1.6 h1:orkqDFCBuNU2oHW9hN2YEJmet+TE9orml3FCGbl1cKk=
-github.com/blevesearch/bleve_index_api v1.1.6/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
+github.com/blevesearch/bleve/v2 v2.4.2 h1:NooYP1mb3c0StkiY9/xviiq2LGSaE8BQBCc/pirMx0U=
+github.com/blevesearch/bleve/v2 v2.4.2/go.mod h1:ATNKj7Yl2oJv/lGuF4kx39bST2dveX6w0th2FFYLkc8=
+github.com/blevesearch/bleve_index_api v1.1.10 h1:PDLFhVjrjQWr6jCuU7TwlmByQVCSEURADHdCqVS9+g0=
+github.com/blevesearch/bleve_index_api v1.1.10/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
-github.com/blevesearch/go-faiss v1.0.13 h1:zfFs7ZYD0NqXVSY37j0JZjZT1BhE9AE4peJfcx/NB4A=
-github.com/blevesearch/go-faiss v1.0.13/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8=
+github.com/blevesearch/go-faiss v1.0.20 h1:AIkdTQFWuZ5LQmKQSebgMR4RynGNw8ZseJXaan5kvtI=
+github.com/blevesearch/go-faiss v1.0.20/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8=
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y=
github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.9 h1:3nBaSBRFokjE4FtPW3eUDgcAu3KphBg1GP07zy/6Uyk=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.9/go.mod h1:ckbeb7knyOOvAdZinn/ASbB7EA3HoagnJkmEV3J7+sg=
+github.com/blevesearch/scorch_segment_api/v2 v2.2.15 h1:prV17iU/o+A8FiZi9MXmqbagd8I0bCqM7OKUYPbnb5Y=
+github.com/blevesearch/scorch_segment_api/v2 v2.2.15/go.mod h1:db0cmP03bPNadXrCDuVkKLV6ywFSiRgPFT1YVrestBc=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
@@ -931,8 +930,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ=
github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
-github.com/blevesearch/zapx/v16 v16.0.12 h1:Uccxvjmn+hQ6ywQP+wIiTpdq9LnAviGoryJOmGwAo/I=
-github.com/blevesearch/zapx/v16 v16.0.12/go.mod h1:MYnOshRfSm4C4drxx1LGRI+MVFByykJ2anDY1fxdk9Q=
+github.com/blevesearch/zapx/v16 v16.1.5 h1:b0sMcarqNFxuXvjoXsF8WtwVahnxyhEvBSRJi/AUHjU=
+github.com/blevesearch/zapx/v16 v16.1.5/go.mod h1:J4mSF39w1QELc11EWRSBFkPeZuO7r/NPKkHzDCoiaI8=
github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw=
github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
diff --git a/vendor/github.com/RoaringBitmap/roaring/Makefile b/vendor/github.com/RoaringBitmap/roaring/Makefile
deleted file mode 100644
index 0a4f9f0aa..000000000
--- a/vendor/github.com/RoaringBitmap/roaring/Makefile
+++ /dev/null
@@ -1,107 +0,0 @@
-.PHONY: help all test format fmtcheck vet lint qa deps clean nuke ser fetch-real-roaring-datasets
-
-
-
-
-
-
-
-
-# Display general help about this command
-help:
- @echo ""
- @echo "The following commands are available:"
- @echo ""
- @echo " make qa : Run all the tests"
- @echo " make test : Run the unit tests"
- @echo ""
- @echo " make format : Format the source code"
- @echo " make fmtcheck : Check if the source code has been formatted"
- @echo " make vet : Check for suspicious constructs"
- @echo " make lint : Check for style errors"
- @echo ""
- @echo " make deps : Get the dependencies"
- @echo " make clean : Remove any build artifact"
- @echo " make nuke : Deletes any intermediate file"
- @echo ""
- @echo " make fuzz-smat : Fuzzy testing with smat"
- @echo " make fuzz-stream : Fuzzy testing with stream deserialization"
- @echo " make fuzz-buffer : Fuzzy testing with buffer deserialization"
- @echo ""
-
-# Alias for help target
-all: help
-test:
- go test
- go test -race -run TestConcurrent*
-# Format the source code
-format:
- @find ./ -type f -name "*.go" -exec gofmt -w {} \;
-
-# Check if the source code has been formatted
-fmtcheck:
- @mkdir -p target
- @find ./ -type f -name "*.go" -exec gofmt -d {} \; | tee target/format.diff
- @test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }
-
-# Check for syntax errors
-vet:
- GOPATH=$(GOPATH) go vet ./...
-
-# Check for style errors
-lint:
- GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint ./...
-
-
-
-
-
-# Alias to run all quality-assurance checks
-qa: fmtcheck test vet lint
-
-# --- INSTALL ---
-
-# Get the dependencies
-deps:
- GOPATH=$(GOPATH) go get github.com/stretchr/testify
- GOPATH=$(GOPATH) go get github.com/bits-and-blooms/bitset
- GOPATH=$(GOPATH) go get github.com/golang/lint/golint
- GOPATH=$(GOPATH) go get github.com/mschoch/smat
- GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz
- GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz-build
- GOPATH=$(GOPATH) go get github.com/glycerine/go-unsnap-stream
- GOPATH=$(GOPATH) go get github.com/philhofer/fwd
- GOPATH=$(GOPATH) go get github.com/jtolds/gls
-
-fuzz-smat:
- go test -tags=gofuzz -run=TestGenerateSmatCorpus
- go-fuzz-build -func FuzzSmat github.com/RoaringBitmap/roaring
- go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
-
-
-fuzz-stream:
- go-fuzz-build -func FuzzSerializationStream github.com/RoaringBitmap/roaring
- go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
-
-
-fuzz-buffer:
- go-fuzz-build -func FuzzSerializationBuffer github.com/RoaringBitmap/roaring
- go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
-
-# Remove any build artifact
-clean:
- GOPATH=$(GOPATH) go clean ./...
-
-# Deletes any intermediate file
-nuke:
- rm -rf ./target
- GOPATH=$(GOPATH) go clean -i ./...
-
-cover:
- go test -coverprofile=coverage.out
- go tool cover -html=coverage.out
-
-fetch-real-roaring-datasets:
- # pull github.com/RoaringBitmap/real-roaring-datasets -> testdata/real-roaring-datasets
- git submodule init
- git submodule update
diff --git a/vendor/github.com/RoaringBitmap/roaring/README.md b/vendor/github.com/RoaringBitmap/roaring/README.md
index 753b8068b..acd3058b7 100644
--- a/vendor/github.com/RoaringBitmap/roaring/README.md
+++ b/vendor/github.com/RoaringBitmap/roaring/README.md
@@ -1,5 +1,7 @@
-roaring [](https://godoc.org/github.com/RoaringBitmap/roaring/roaring64) [](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
-[](https://cloud.drone.io/RoaringBitmap/roaring)
+# roaring
+
+[](https://godoc.org/github.com/RoaringBitmap/roaring) [](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
+



@@ -31,17 +33,17 @@ Roaring bitmaps are found to work well in many important applications:
The ``roaring`` Go library is used by
* [anacrolix/torrent]
-* [runv](https://github.com/hyperhq/runv)
* [InfluxDB](https://www.influxdata.com)
* [Pilosa](https://www.pilosa.com/)
* [Bleve](http://www.blevesearch.com)
+* [Weaviate](https://github.com/weaviate/weaviate)
* [lindb](https://github.com/lindb/lindb)
* [Elasticell](https://github.com/deepfabric/elasticell)
* [SourceGraph](https://github.com/sourcegraph/sourcegraph)
* [M3](https://github.com/m3db/m3)
* [trident](https://github.com/NetApp/trident)
* [Husky](https://www.datadoghq.com/blog/engineering/introducing-husky/)
-
+* [FrostDB](https://github.com/polarsignals/frostdb)
This library is used in production in several systems, it is part of the [Awesome Go collection](https://awesome-go.com).
@@ -99,7 +101,7 @@ whether you like it or not. That can become very wasteful.
This being said, there are definitively cases where attempting to use compressed bitmaps is wasteful.
For example, if you have a small universe size. E.g., your bitmaps represent sets of integers
-from [0,n) where n is small (e.g., n=64 or n=128). If you are able to uncompressed BitSet and
+from [0,n) where n is small (e.g., n=64 or n=128). If you can use uncompressed BitSet and
it does not blow up your memory usage, then compressed bitmaps are probably not useful
to you. In fact, if you do not need compression, then a BitSet offers remarkable speed.
@@ -134,7 +136,7 @@ There is a big problem with these formats however that can hurt you badly in som
Roaring solves this problem. It works in the following manner. It divides the data into chunks of 216 integers
(e.g., [0, 216), [216, 2 x 216), ...). Within a chunk, it can use an uncompressed bitmap, a simple list of integers,
-or a list of runs. Whatever format it uses, they all allow you to check for the present of any one value quickly
+or a list of runs. Whatever format it uses, they all allow you to check for the presence of any one value quickly
(e.g., with a binary search). The net result is that Roaring can compute many operations much faster than run-length-encoded
formats like WAH, EWAH, Concise... Maybe surprisingly, Roaring also generally offers better compression ratios.
diff --git a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go b/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go
index 9541fd536..80fa676ef 100644
--- a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go
+++ b/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go
@@ -17,8 +17,17 @@ func (ac *arrayContainer) String() string {
}
func (ac *arrayContainer) fillLeastSignificant16bits(x []uint32, i int, mask uint32) int {
+ if i < 0 {
+ panic("negative index")
+ }
+ if len(ac.content) == 0 {
+ return i
+ }
+ _ = x[len(ac.content)-1+i]
+ _ = ac.content[len(ac.content)-1]
for k := 0; k < len(ac.content); k++ {
- x[k+i] = uint32(ac.content[k]) | mask
+ x[k+i] =
+ uint32(ac.content[k]) | mask
}
return i + len(ac.content)
}
@@ -655,10 +664,54 @@ func (ac *arrayContainer) iandNot(a container) container {
}
func (ac *arrayContainer) iandNotRun16(rc *runContainer16) container {
- rcb := rc.toBitmapContainer()
- acb := ac.toBitmapContainer()
- acb.iandNotBitmapSurely(rcb)
- *ac = *(acb.toArrayContainer())
+ // Fast path: if either the array container or the run container is empty, the result is the array.
+ if ac.isEmpty() || rc.isEmpty() {
+ // Empty
+ return ac
+ }
+ // Fast path: if the run container is full, the result is empty.
+ if rc.isFull() {
+ ac.content = ac.content[:0]
+ return ac
+ }
+ current_run := 0
+ // All values in [start_run, end_end] are part of the run
+ start_run := rc.iv[current_run].start
+ end_end := start_run + rc.iv[current_run].length
+ // We are going to read values in the array at index i, and we are
+ // going to write them at index pos. So we do in-place processing.
+ // We always have that pos <= i by construction. So we can either
+ // overwrite a value just read, or a value that was previous read.
+ pos := 0
+ i := 0
+ for ; i < len(ac.content); i++ {
+ if ac.content[i] < start_run {
+ // the value in the array appears before the run [start_run, end_end]
+ ac.content[pos] = ac.content[i]
+ pos++
+ } else if ac.content[i] <= end_end {
+ // nothing to do, the value is in the array but also in the run.
+ } else {
+ // We have the value in the array after the run. We cannot tell
+ // whether we need to keep it or not. So let us move to another run.
+ if current_run+1 < len(rc.iv) {
+ current_run++
+ start_run = rc.iv[current_run].start
+ end_end = start_run + rc.iv[current_run].length
+ i-- // retry with the same i
+ } else {
+ // We have exhausted the number of runs. We can keep the rest of the values
+ // from i to len(ac.content) - 1 inclusively.
+ break // We are done, the rest of the array will be kept
+ }
+ }
+ }
+ for ; i < len(ac.content); i++ {
+ ac.content[pos] = ac.content[i]
+ pos++
+ }
+ // We 'shink' the slice.
+ ac.content = ac.content[:pos]
return ac
}
diff --git a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go b/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go
index 71029f4ff..bf08bfca3 100644
--- a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go
+++ b/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go
@@ -888,13 +888,67 @@ func (bc *bitmapContainer) iandNot(a container) container {
}
func (bc *bitmapContainer) iandNotArray(ac *arrayContainer) container {
- acb := ac.toBitmapContainer()
- return bc.iandNotBitmapSurely(acb)
+ if ac.isEmpty() || bc.isEmpty() {
+ // Nothing to do.
+ return bc
+ }
+
+ // Word by word, we remove the elements in ac from bc. The approach is to build
+ // a mask of the elements to remove, and then apply it to the bitmap.
+ wordIdx := uint16(0)
+ mask := uint64(0)
+ for i, v := range ac.content {
+ if v/64 != wordIdx {
+ // Flush the current word.
+ if i != 0 {
+ // We're removing bits that are set in the mask and in the current word.
+ // To figure out the cardinality change, we count the number of bits that
+ // are set in the mask and in the current word.
+ mask &= bc.bitmap[wordIdx]
+ bc.bitmap[wordIdx] &= ^mask
+ bc.cardinality -= int(popcount(mask))
+ }
+
+ wordIdx = v / 64
+ mask = 0
+ }
+ mask |= 1 << (v % 64)
+ }
+
+ // Flush the last word.
+ mask &= bc.bitmap[wordIdx]
+ bc.bitmap[wordIdx] &= ^mask
+ bc.cardinality -= int(popcount(mask))
+
+ if bc.getCardinality() <= arrayDefaultMaxSize {
+ return bc.toArrayContainer()
+ }
+ return bc
}
func (bc *bitmapContainer) iandNotRun16(rc *runContainer16) container {
- rcb := rc.toBitmapContainer()
- return bc.iandNotBitmapSurely(rcb)
+ if rc.isEmpty() || bc.isEmpty() {
+ // Nothing to do.
+ return bc
+ }
+
+ wordRangeStart := rc.iv[0].start / 64
+ wordRangeEnd := (rc.iv[len(rc.iv)-1].last()) / 64 // inclusive
+
+ cardinalityChange := popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1]) // before cardinality - after cardinality (for word range)
+
+ for _, iv := range rc.iv {
+ resetBitmapRange(bc.bitmap, int(iv.start), int(iv.last())+1)
+ }
+
+ cardinalityChange -= popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1])
+
+ bc.cardinality -= int(cardinalityChange)
+
+ if bc.getCardinality() <= arrayDefaultMaxSize {
+ return bc.toArrayContainer()
+ }
+ return bc
}
func (bc *bitmapContainer) andNotArray(value2 *arrayContainer) container {
@@ -1062,7 +1116,6 @@ func (bc *bitmapContainer) PrevSetBit(i int) int {
// reference the java implementation
// https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/BitmapContainer.java#L875-L892
-//
func (bc *bitmapContainer) numberOfRuns() int {
if bc.cardinality == 0 {
return 0
diff --git a/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go b/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go
index 3e5490a9d..d5ebb91ab 100644
--- a/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go
+++ b/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go
@@ -10,6 +10,11 @@ type ByteInput interface {
// Next returns a slice containing the next n bytes from the buffer,
// advancing the buffer as if the bytes had been returned by Read.
Next(n int) ([]byte, error)
+ // NextReturnsSafeSlice returns true if Next() returns a safe slice as opposed
+ // to a slice that points to an underlying buffer possibly owned by another system.
+ // When NextReturnsSafeSlice returns false, the result from Next() should be copied
+ // before it is modified (i.e., it is immutable).
+ NextReturnsSafeSlice() bool
// ReadUInt32 reads uint32 with LittleEndian order
ReadUInt32() (uint32, error)
// ReadUInt16 reads uint16 with LittleEndian order
@@ -42,6 +47,25 @@ type ByteBuffer struct {
off int
}
+// NewByteBuffer creates a new ByteBuffer.
+func NewByteBuffer(buf []byte) *ByteBuffer {
+ return &ByteBuffer{
+ buf: buf,
+ }
+}
+
+var _ io.Reader = (*ByteBuffer)(nil)
+
+// Read implements io.Reader.
+func (b *ByteBuffer) Read(p []byte) (int, error) {
+ data, err := b.Next(len(p))
+ if err != nil {
+ return 0, err
+ }
+ copy(p, data)
+ return len(data), nil
+}
+
// Next returns a slice containing the next n bytes from the reader
// If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned
func (b *ByteBuffer) Next(n int) ([]byte, error) {
@@ -57,6 +81,12 @@ func (b *ByteBuffer) Next(n int) ([]byte, error) {
return data, nil
}
+// NextReturnsSafeSlice returns false since ByteBuffer might hold
+// an array owned by some other systems.
+func (b *ByteBuffer) NextReturnsSafeSlice() bool {
+ return false
+}
+
// ReadUInt32 reads uint32 with LittleEndian order
func (b *ByteBuffer) ReadUInt32() (uint32, error) {
if len(b.buf)-b.off < 4 {
@@ -109,26 +139,45 @@ func (b *ByteBuffer) Reset(buf []byte) {
type ByteInputAdapter struct {
r io.Reader
readBytes int
+ buf [4]byte
+}
+
+var _ io.Reader = (*ByteInputAdapter)(nil)
+
+// Read implements io.Reader.
+func (b *ByteInputAdapter) Read(buf []byte) (int, error) {
+ m, err := io.ReadAtLeast(b.r, buf, len(buf))
+ b.readBytes += m
+
+ if err != nil {
+ return 0, err
+ }
+
+ return m, nil
}
// Next returns a slice containing the next n bytes from the buffer,
// advancing the buffer as if the bytes had been returned by Read.
func (b *ByteInputAdapter) Next(n int) ([]byte, error) {
buf := make([]byte, n)
- m, err := io.ReadAtLeast(b.r, buf, n)
- b.readBytes += m
+ _, err := b.Read(buf)
if err != nil {
return nil, err
}
-
return buf, nil
}
+// NextReturnsSafeSlice returns true since ByteInputAdapter always returns a slice
+// allocated with make([]byte, ...)
+func (b *ByteInputAdapter) NextReturnsSafeSlice() bool {
+ return true
+}
+
// ReadUInt32 reads uint32 with LittleEndian order
func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
- buf, err := b.Next(4)
-
+ buf := b.buf[:4]
+ _, err := b.Read(buf)
if err != nil {
return 0, err
}
@@ -138,8 +187,8 @@ func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
// ReadUInt16 reads uint16 with LittleEndian order
func (b *ByteInputAdapter) ReadUInt16() (uint16, error) {
- buf, err := b.Next(2)
-
+ buf := b.buf[:2]
+ _, err := b.Read(buf)
if err != nil {
return 0, err
}
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring.go b/vendor/github.com/RoaringBitmap/roaring/roaring.go
index 7220da272..a31cdbd9e 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaring.go
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring.go
@@ -13,6 +13,7 @@ import (
"strconv"
"github.com/RoaringBitmap/roaring/internal"
+ "github.com/bits-and-blooms/bitset"
)
// Bitmap represents a compressed bitmap where you can add integers.
@@ -53,17 +54,186 @@ func (rb *Bitmap) ToBytes() ([]byte, error) {
return rb.highlowcontainer.toBytes()
}
+const wordSize = uint64(64)
+const log2WordSize = uint64(6)
+const capacity = ^uint64(0)
+const bitmapContainerSize = (1 << 16) / 64 // bitmap size in words
+
+// DenseSize returns the size of the bitmap when stored as a dense bitmap.
+func (rb *Bitmap) DenseSize() uint64 {
+ if rb.highlowcontainer.size() == 0 {
+ return 0
+ }
+
+ maximum := 1 + uint64(rb.Maximum())
+ if maximum > (capacity - wordSize + 1) {
+ return uint64(capacity >> log2WordSize)
+ }
+
+ return uint64((maximum + (wordSize - 1)) >> log2WordSize)
+}
+
+// ToDense returns a slice of uint64s representing the bitmap as a dense bitmap.
+// Useful to convert a roaring bitmap to a format that can be used by other libraries
+// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
+func (rb *Bitmap) ToDense() []uint64 {
+ sz := rb.DenseSize()
+ if sz == 0 {
+ return nil
+ }
+
+ bitmap := make([]uint64, sz)
+ rb.WriteDenseTo(bitmap)
+ return bitmap
+}
+
+// FromDense creates a bitmap from a slice of uint64s representing the bitmap as a dense bitmap.
+// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
+// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
+//
+// This function will not create any run containers, only array and bitmap containers. It's up to
+// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
+//
+// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
+// This is useful when the bitmap is going to be modified after this function returns or if it's
+// undesirable to hold references to large bitmaps which the GC would not be able to collect.
+// One copy can still happen even when doCopy is false if the bitmap length is not divisible
+// by bitmapContainerSize.
+//
+// See also FromBitSet.
+func FromDense(bitmap []uint64, doCopy bool) *Bitmap {
+ sz := (len(bitmap) + bitmapContainerSize - 1) / bitmapContainerSize // round up
+ rb := &Bitmap{
+ highlowcontainer: roaringArray{
+ containers: make([]container, 0, sz),
+ keys: make([]uint16, 0, sz),
+ needCopyOnWrite: make([]bool, 0, sz),
+ },
+ }
+ rb.FromDense(bitmap, doCopy)
+ return rb
+}
+
+// FromDense unmarshalls from a slice of uint64s representing the bitmap as a dense bitmap.
+// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
+// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
+// Callers are responsible for ensuring that the bitmap is empty before calling this function.
+//
+// This function will not create any run containers, only array and bitmap containers. It is up to
+// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
+//
+// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
+// This is useful when the bitmap is going to be modified after this function returns or if it's
+// undesirable to hold references to large bitmaps which the GC would not be able to collect.
+// One copy can still happen even when doCopy is false if the bitmap length is not divisible
+// by bitmapContainerSize.
+//
+// See FromBitSet.
+func (rb *Bitmap) FromDense(bitmap []uint64, doCopy bool) {
+ if len(bitmap) == 0 {
+ return
+ }
+
+ var k uint16
+ const size = bitmapContainerSize
+
+ for len(bitmap) > 0 {
+ hi := size
+ if len(bitmap) < size {
+ hi = len(bitmap)
+ }
+
+ words := bitmap[:hi]
+ count := int(popcntSlice(words))
+
+ switch {
+ case count > arrayDefaultMaxSize:
+ c := &bitmapContainer{cardinality: count, bitmap: words}
+ cow := true
+
+ if doCopy || len(words) < size {
+ c.bitmap = make([]uint64, size)
+ copy(c.bitmap, words)
+ cow = false
+ }
+
+ rb.highlowcontainer.appendContainer(k, c, cow)
+
+ case count > 0:
+ c := &arrayContainer{content: make([]uint16, count)}
+ var pos, base int
+ for _, w := range words {
+ for w != 0 {
+ t := w & -w
+ c.content[pos] = uint16(base + int(popcount(t-1)))
+ pos++
+ w ^= t
+ }
+ base += 64
+ }
+ rb.highlowcontainer.appendContainer(k, c, false)
+ }
+
+ bitmap = bitmap[hi:]
+ k++
+ }
+}
+
+// WriteDenseTo writes to a slice of uint64s representing the bitmap as a dense bitmap.
+// Callers are responsible for allocating enough space in the bitmap using DenseSize.
+// Useful to convert a roaring bitmap to a format that can be used by other libraries
+// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
+func (rb *Bitmap) WriteDenseTo(bitmap []uint64) {
+ for i, ct := range rb.highlowcontainer.containers {
+ hb := uint32(rb.highlowcontainer.keys[i]) << 16
+
+ switch c := ct.(type) {
+ case *arrayContainer:
+ for _, x := range c.content {
+ n := int(hb | uint32(x))
+ bitmap[n>>log2WordSize] |= uint64(1) << uint(x%64)
+ }
+
+ case *bitmapContainer:
+ copy(bitmap[int(hb)>>log2WordSize:], c.bitmap)
+
+ case *runContainer16:
+ for j := range c.iv {
+ start := uint32(c.iv[j].start)
+ end := start + uint32(c.iv[j].length) + 1
+ lo := int(hb|start) >> log2WordSize
+ hi := int(hb|(end-1)) >> log2WordSize
+
+ if lo == hi {
+ bitmap[lo] |= (^uint64(0) << uint(start%64)) &
+ (^uint64(0) >> (uint(-end) % 64))
+ continue
+ }
+
+ bitmap[lo] |= ^uint64(0) << uint(start%64)
+ for n := lo + 1; n < hi; n++ {
+ bitmap[n] = ^uint64(0)
+ }
+ bitmap[hi] |= ^uint64(0) >> (uint(-end) % 64)
+ }
+ default:
+ panic("unsupported container type")
+ }
+ }
+}
+
// Checksum computes a hash (currently FNV-1a) for a bitmap that is suitable for
// using bitmaps as elements in hash sets or as keys in hash maps, as well as
// generally quicker comparisons.
// The implementation is biased towards efficiency in little endian machines, so
// expect some extra CPU cycles and memory to be used if your machine is big endian.
-// Likewise, don't use this to verify integrity unless you're certain you'll load
-// the bitmap on a machine with the same endianess used to create it.
+// Likewise, do not use this to verify integrity unless you are certain you will load
+// the bitmap on a machine with the same endianess used to create it. (Thankfully
+// very few people use big endian machines these days.)
func (rb *Bitmap) Checksum() uint64 {
const (
offset = 14695981039346656037
- prime = 1099511628211
+ prime = 1099511628211
)
var bytes []byte
@@ -106,6 +276,20 @@ func (rb *Bitmap) Checksum() uint64 {
return hash
}
+// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
+// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
+// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
+// The containers in the resulting bitmap are immutable containers tied to the provided byte array and they rely on
+// copy-on-write which means that modifying them creates copies. Thus FromUnsafeBytes is more likely to be appropriate for read-only use cases,
+// when the resulting bitmap can be considered immutable.
+//
+// See also the FromBuffer function.
+// See https://github.com/RoaringBitmap/roaring/pull/395 for more details.
+func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
+ stream := internal.NewByteBuffer(data)
+ return rb.ReadFrom(stream)
+}
+
// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other RoaringBitmap
// implementations (Java, C) and is documented here:
@@ -114,12 +298,18 @@ func (rb *Bitmap) Checksum() uint64 {
// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
// It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly.
func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
- stream := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
- stream.Reset(reader)
+ stream, ok := reader.(internal.ByteInput)
+ if !ok {
+ byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
+ byteInputAdapter.Reset(reader)
+ stream = byteInputAdapter
+ }
p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...)
- internal.ByteInputAdapterPool.Put(stream)
+ if !ok {
+ internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter))
+ }
return
}
@@ -139,12 +329,17 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
// You should *not* change the copy-on-write status of the resulting
// bitmaps (SetCopyOnWrite).
//
+// Thus FromBuffer is more likely to be appropriate for read-only use cases,
+// when the resulting bitmap can be considered immutable.
+//
// If buf becomes unavailable, then a bitmap created with
// FromBuffer would be effectively broken. Furthermore, any
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
//
+// See also the FromUnsafeBytes function which can have better performance
+// in some cases.
func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) {
stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer)
stream.Reset(buf)
@@ -194,6 +389,16 @@ func (rb *Bitmap) Clear() {
rb.highlowcontainer.clear()
}
+// ToBitSet copies the content of the RoaringBitmap into a bitset.BitSet instance
+func (rb *Bitmap) ToBitSet() *bitset.BitSet {
+ return bitset.From(rb.ToDense())
+}
+
+// FromBitSet creates a new RoaringBitmap from a bitset.BitSet instance
+func FromBitSet(bitset *bitset.BitSet) *Bitmap {
+ return FromDense(bitset.Bytes(), false)
+}
+
// ToArray creates a new slice containing all of the integers stored in the Bitmap in sorted order
func (rb *Bitmap) ToArray() []uint32 {
array := make([]uint32, rb.GetCardinality())
@@ -233,7 +438,7 @@ func BoundSerializedSizeInBytes(cardinality uint64, universeSize uint64) uint64
contnbr := (universeSize + uint64(65535)) / uint64(65536)
if contnbr > cardinality {
contnbr = cardinality
- // we can't have more containers than we have values
+ // we cannot have more containers than we have values
}
headermax := 8*contnbr + 4
if 4 > (contnbr+7)/8 {
@@ -276,9 +481,9 @@ type intIterator struct {
// This way, instead of making up-to 64k allocations per full iteration
// we get a single allocation and simply reinitialize the appropriate
// iterator and point to it in the generic `iter` member on each key bound.
- shortIter shortIterator
- runIter runIterator16
- bitmapIter bitmapContainerShortIterator
+ shortIter shortIterator
+ runIter runIterator16
+ bitmapIter bitmapContainerShortIterator
}
// HasNext returns true if there are more integers to iterate over
@@ -341,14 +546,13 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) {
// IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type IntIterator = intIterator
-
// Initialize configures the existing iterator so that it can iterate through the values of
// the provided bitmap.
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
-func (p *intIterator) Initialize(a *Bitmap) {
- p.pos = 0
- p.highlowcontainer = &a.highlowcontainer
- p.init()
+func (ii *intIterator) Initialize(a *Bitmap) {
+ ii.pos = 0
+ ii.highlowcontainer = &a.highlowcontainer
+ ii.init()
}
type intReverseIterator struct {
@@ -357,9 +561,9 @@ type intReverseIterator struct {
iter shortIterable
highlowcontainer *roaringArray
- shortIter reverseIterator
- runIter runReverseIterator16
- bitmapIter reverseBitmapContainerShortIterator
+ shortIter reverseIterator
+ runIter runReverseIterator16
+ bitmapIter reverseBitmapContainerShortIterator
}
// HasNext returns true if there are more integers to iterate over
@@ -414,10 +618,10 @@ type IntReverseIterator = intReverseIterator
// Initialize configures the existing iterator so that it can iterate through the values of
// the provided bitmap.
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
-func (p *intReverseIterator) Initialize(a *Bitmap) {
- p.highlowcontainer = &a.highlowcontainer
- p.pos = a.highlowcontainer.size() - 1
- p.init()
+func (ii *intReverseIterator) Initialize(a *Bitmap) {
+ ii.highlowcontainer = &a.highlowcontainer
+ ii.pos = a.highlowcontainer.size() - 1
+ ii.init()
}
// ManyIntIterable allows you to iterate over the values in a Bitmap
@@ -434,9 +638,9 @@ type manyIntIterator struct {
iter manyIterable
highlowcontainer *roaringArray
- shortIter shortIterator
- runIter runIterator16
- bitmapIter bitmapContainerManyIterator
+ shortIter shortIterator
+ runIter runIterator16
+ bitmapIter bitmapContainerManyIterator
}
func (ii *manyIntIterator) init() {
@@ -495,17 +699,16 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int {
return n
}
-
// ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type ManyIntIterator = manyIntIterator
// Initialize configures the existing iterator so that it can iterate through the values of
// the provided bitmap.
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
-func (p *manyIntIterator) Initialize(a *Bitmap) {
- p.pos = 0
- p.highlowcontainer = &a.highlowcontainer
- p.init()
+func (ii *manyIntIterator) Initialize(a *Bitmap) {
+ ii.pos = 0
+ ii.highlowcontainer = &a.highlowcontainer
+ ii.init()
}
// String creates a string representation of the Bitmap
@@ -569,7 +772,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) {
// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order;
// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove).
func (rb *Bitmap) Iterator() IntPeekable {
- p := new(intIterator)
+ p := new(intIterator)
p.Initialize(rb)
return p
}
@@ -847,7 +1050,7 @@ func (rb *Bitmap) Select(x uint32) (uint32, error) {
return uint32(key)<<16 + uint32(c.selectInt(uint16(remaining))), nil
}
}
- return 0, fmt.Errorf("can't find %dth integer in a bitmap with only %d items", x, rb.GetCardinality())
+ return 0, fmt.Errorf("cannot find %dth integer in a bitmap with only %d items", x, rb.GetCardinality())
}
// And computes the intersection between two bitmaps and stores the result in the current bitmap
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile b/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile
index 7e8953c78..cb36d8673 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile
@@ -33,7 +33,6 @@ help:
all: help
test:
go test
- go test -race -run TestConcurrent*
# Format the source code
format:
@find ./ -type f -name "*.go" -exec gofmt -w {} \;
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go
index 0e93c0335..6cae3284c 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go
@@ -2,6 +2,7 @@ package roaring64
import (
"fmt"
+ "io"
"math/bits"
"runtime"
"sync"
@@ -24,8 +25,8 @@ const (
// It depends upon the bitmap libraries. It is not thread safe, so
// upstream concurrency guards must be provided.
type BSI struct {
- bA []*Bitmap
- eBM *Bitmap // Existence BitMap
+ bA []Bitmap
+ eBM Bitmap // Existence BitMap
MaxValue int64
MinValue int64
runOptimized bool
@@ -39,11 +40,8 @@ func NewBSI(maxValue int64, minValue int64) *BSI {
if bits.Len64(uint64(maxValue)) > bitsz {
bitsz = bits.Len64(uint64(maxValue))
}
- ba := make([]*Bitmap, bitsz)
- for i := 0; i < len(ba); i++ {
- ba[i] = NewBitmap()
- }
- return &BSI{bA: ba, eBM: NewBitmap(), MaxValue: maxValue, MinValue: minValue}
+ ba := make([]Bitmap, bitsz)
+ return &BSI{bA: ba, MaxValue: maxValue, MinValue: minValue}
}
// NewDefaultBSI constructs an auto-sized BSI
@@ -67,7 +65,7 @@ func (b *BSI) HasRunCompression() bool {
// GetExistenceBitmap returns a pointer to the underlying existence bitmap of the BSI
func (b *BSI) GetExistenceBitmap() *Bitmap {
- return b.eBM
+ return &b.eBM
}
// ValueExists tests whether the value exists.
@@ -83,54 +81,41 @@ func (b *BSI) GetCardinality() uint64 {
// BitCount returns the number of bits needed to represent values.
func (b *BSI) BitCount() int {
-
return len(b.bA)
}
// SetValue sets a value for a given columnID.
func (b *BSI) SetValue(columnID uint64, value int64) {
-
// If max/min values are set to zero then automatically determine bit array size
if b.MaxValue == 0 && b.MinValue == 0 {
- ba := make([]*Bitmap, bits.Len64(uint64(value)))
- for i := len(ba) - b.BitCount(); i > 0; i-- {
- b.bA = append(b.bA, NewBitmap())
- if b.runOptimized {
- b.bA[i].RunOptimize()
- }
+ minBits := bits.Len64(uint64(value))
+ for len(b.bA) < minBits {
+ b.bA = append(b.bA, Bitmap{})
}
}
- var wg sync.WaitGroup
-
for i := 0; i < b.BitCount(); i++ {
- wg.Add(1)
- go func(j int) {
- defer wg.Done()
- if uint64(value)&(1< 0 {
- b.bA[j].Add(uint64(columnID))
- } else {
- b.bA[j].Remove(uint64(columnID))
- }
- }(i)
+		if uint64(value)&(1<<uint64(i)) > 0 {
+ b.bA[i].Add(columnID)
+ } else {
+ b.bA[i].Remove(columnID)
+ }
}
- wg.Wait()
- b.eBM.Add(uint64(columnID))
+ b.eBM.Add(columnID)
}
-// GetValue gets the value at the column ID. Second param will be false for non-existant values.
-func (b *BSI) GetValue(columnID uint64) (int64, bool) {
- value := int64(0)
- exists := b.eBM.Contains(uint64(columnID))
+// GetValue gets the value at the column ID. Second param will be false for non-existent values.
+func (b *BSI) GetValue(columnID uint64) (value int64, exists bool) {
+ exists = b.eBM.Contains(columnID)
if !exists {
- return value, exists
+ return
}
for i := 0; i < b.BitCount(); i++ {
- if b.bA[i].Contains(uint64(columnID)) {
- value |= (1 << uint64(i))
+ if b.bA[i].Contains(columnID) {
+ value |= 1 << i
}
}
- return int64(value), exists
+ return
}
type action func(t *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup)
@@ -261,13 +246,12 @@ type task struct {
// For the RANGE parameter the comparison criteria is >= valueOrStart and <= end.
// The parallelism parameter indicates the number of CPU threads to be applied for processing. A value
// of zero indicates that all available CPU resources will be potentially utilized.
-//
func (b *BSI) CompareValue(parallelism int, op Operation, valueOrStart, end int64,
foundSet *Bitmap) *Bitmap {
comp := &task{bsi: b, op: op, valueOrStart: valueOrStart, end: end}
if foundSet == nil {
- return parallelExecutor(parallelism, comp, compareValue, b.eBM)
+ return parallelExecutor(parallelism, comp, compareValue, &b.eBM)
}
return parallelExecutor(parallelism, comp, compareValue, foundSet)
}
@@ -522,7 +506,6 @@ func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg
// Sum all values contained within the foundSet. As a convenience, the cardinality of the foundSet
// is also returned (for calculating the average).
-//
func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
count = foundSet.GetCardinality()
@@ -531,7 +514,7 @@ func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
wg.Add(1)
go func(j int) {
defer wg.Done()
-			atomic.AddInt64(&sum, int64(foundSet.AndCardinality(b.bA[j])<<uint(j)))
+			atomic.AddInt64(&sum, int64(foundSet.AndCardinality(&b.bA[j])<<uint(j)))
 		}(i)
 	}
@@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
 	// Make sure we have enough bit slices
 	for bits > b.BitCount() {
- newBm := NewBitmap()
- if b.runOptimized {
- newBm.RunOptimize()
- }
- b.bA = append(b.bA, newBm)
+ bm := Bitmap{}
+ bm.RunOptimize()
+ b.bA = append(b.bA, bm)
}
a := make([][]*Bitmap, bits)
@@ -599,9 +579,8 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
a[i] = make([]*Bitmap, 0)
for _, x := range bsis {
if len(x.bA) > i {
- a[i] = append(a[i], x.bA[i])
+ a[i] = append(a[i], &x.bA[i])
 		} else {
-			a[i] = []*Bitmap{NewBitmap()}
+			a[i] = []*Bitmap{{}}
if b.runOptimized {
a[i][0].RunOptimize()
}
@@ -612,7 +591,7 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
// Consolidate existence bit maps
ebms := make([]*Bitmap, len(bsis))
for i := range ebms {
- ebms[i] = bsis[i].eBM
+ ebms[i] = &bsis[i].eBM
}
// First merge all the bit slices from all bsi maps that exist in target
@@ -621,17 +600,17 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
wg.Add(1)
go func(j int) {
defer wg.Done()
- x := []*Bitmap{b.bA[j]}
+ x := []*Bitmap{&b.bA[j]}
x = append(x, a[j]...)
- b.bA[j] = ParOr(parallelism, x...)
+ b.bA[j] = *ParOr(parallelism, x...)
}(i)
}
wg.Wait()
// merge all the EBM maps
- x := []*Bitmap{b.eBM}
+ x := []*Bitmap{&b.eBM}
x = append(x, ebms...)
- b.eBM = ParOr(parallelism, x...)
+ b.eBM = *ParOr(parallelism, x...)
}
// UnmarshalBinary de-serialize a BSI. The value at bitData[0] is the EBM. Other indices are in least to most
@@ -643,7 +622,7 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
continue
}
if b.BitCount() < i {
- newBm := NewBitmap()
+ newBm := Bitmap{}
if b.runOptimized {
newBm.RunOptimize()
}
@@ -659,7 +638,7 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
}
// First element of bitData is the EBM
if bitData[0] == nil {
- b.eBM = NewBitmap()
+ b.eBM = Bitmap{}
if b.runOptimized {
b.eBM.RunOptimize()
}
@@ -674,6 +653,39 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
return nil
}
+// ReadFrom reads a serialized version of this BSI from stream.
+func (b *BSI) ReadFrom(stream io.Reader) (p int64, err error) {
+ bm, n, err := readBSIContainerFromStream(stream)
+ p += n
+ if err != nil {
+ err = fmt.Errorf("reading existence bitmap: %w", err)
+ return
+ }
+ b.eBM = bm
+ b.bA = b.bA[:0]
+ for {
+ // This forces a new memory location to be allocated and if we're lucky it only escapes if
+ // there's no error.
+ var bm Bitmap
+ bm, n, err = readBSIContainerFromStream(stream)
+ p += n
+ if err == io.EOF {
+ err = nil
+ return
+ }
+ if err != nil {
+ err = fmt.Errorf("reading bit slice index %v: %w", len(b.bA), err)
+ return
+ }
+ b.bA = append(b.bA, bm)
+ }
+}
+
+func readBSIContainerFromStream(r io.Reader) (bm Bitmap, p int64, err error) {
+ p, err = bm.ReadFrom(r)
+ return
+}
+
// MarshalBinary serializes a BSI
func (b *BSI) MarshalBinary() ([][]byte, error) {
@@ -694,6 +706,23 @@ func (b *BSI) MarshalBinary() ([][]byte, error) {
return data, nil
}
+// WriteTo writes a serialized version of this BSI to stream.
+func (b *BSI) WriteTo(w io.Writer) (n int64, err error) {
+ n1, err := b.eBM.WriteTo(w)
+ n += n1
+ if err != nil {
+ return
+ }
+ for _, bm := range b.bA {
+ n1, err = bm.WriteTo(w)
+ n += n1
+ if err != nil {
+ return
+ }
+ }
+ return
+}
+
// BatchEqual returns a bitmap containing the column IDs where the values are contained within the list of values provided.
func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap {
@@ -702,7 +731,7 @@ func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap {
valMap[values[i]] = struct{}{}
}
comp := &task{bsi: b, values: valMap}
- return parallelExecutor(parallelism, comp, batchEqual, b.eBM)
+ return parallelExecutor(parallelism, comp, batchEqual, &b.eBM)
}
func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap,
@@ -742,13 +771,13 @@ func (b *BSI) ClearValues(foundSet *Bitmap) {
wg.Add(1)
go func() {
defer wg.Done()
- ClearBits(foundSet, b.eBM)
+ ClearBits(foundSet, &b.eBM)
}()
for i := 0; i < b.BitCount(); i++ {
wg.Add(1)
go func(j int) {
defer wg.Done()
- ClearBits(foundSet, b.bA[j])
+ ClearBits(foundSet, &b.bA[j])
}(i)
}
wg.Wait()
@@ -758,19 +787,19 @@ func (b *BSI) ClearValues(foundSet *Bitmap) {
func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI {
newBSI := NewBSI(b.MaxValue, b.MinValue)
- newBSI.bA = make([]*Bitmap, b.BitCount())
+ newBSI.bA = make([]Bitmap, b.BitCount())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
- newBSI.eBM = b.eBM.Clone()
+ newBSI.eBM = *b.eBM.Clone()
newBSI.eBM.And(foundSet)
}()
for i := 0; i < b.BitCount(); i++ {
wg.Add(1)
go func(j int) {
defer wg.Done()
- newBSI.bA[j] = b.bA[j].Clone()
+ newBSI.bA[j] = *b.bA[j].Clone()
newBSI.bA[j].And(foundSet)
}(i)
}
@@ -780,28 +809,28 @@ func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI {
// Clone performs a deep copy of BSI contents.
func (b *BSI) Clone() *BSI {
- return b.NewBSIRetainSet(b.eBM)
+ return b.NewBSIRetainSet(&b.eBM)
}
// Add - In-place sum the contents of another BSI with this BSI, column wise.
func (b *BSI) Add(other *BSI) {
- b.eBM.Or(other.eBM)
+ b.eBM.Or(&other.eBM)
for i := 0; i < len(other.bA); i++ {
- b.addDigit(other.bA[i], i)
+ b.addDigit(&other.bA[i], i)
}
}
func (b *BSI) addDigit(foundSet *Bitmap, i int) {
if i >= len(b.bA) {
- b.bA = append(b.bA, NewBitmap())
+ b.bA = append(b.bA, Bitmap{})
}
- carry := And(b.bA[i], foundSet)
+ carry := And(&b.bA[i], foundSet)
b.bA[i].Xor(foundSet)
if !carry.IsEmpty() {
if i+1 >= len(b.bA) {
- b.bA = append(b.bA, NewBitmap())
+ b.bA = append(b.bA, Bitmap{})
}
b.addDigit(carry, i+1)
}
@@ -811,7 +840,6 @@ func (b *BSI) addDigit(foundSet *Bitmap, i int) {
// contained within the input BSI. Given that for BSIs, different columnIDs can have the same value. TransposeWithCounts
// is useful for situations where there is a one-to-many relationship between the vectored integer sets. The resulting BSI
// contains the number of times a particular value appeared in the input BSI.
-//
func (b *BSI) TransposeWithCounts(parallelism int, foundSet, filterSet *Bitmap) *BSI {
return parallelExecutorBSIResults(parallelism, b, transposeWithCounts, foundSet, filterSet, true)
@@ -844,9 +872,42 @@ func transposeWithCounts(input *BSI, filterSet *Bitmap, batch []uint64, resultsC
// Increment - In-place increment of values in a BSI. Found set select columns for incrementing.
func (b *BSI) Increment(foundSet *Bitmap) {
b.addDigit(foundSet, 0)
+ b.eBM.Or(foundSet)
}
// IncrementAll - In-place increment of all values in a BSI.
func (b *BSI) IncrementAll() {
b.Increment(b.GetExistenceBitmap())
}
+
+// Equals - Check for semantic equality of two BSIs.
+func (b *BSI) Equals(other *BSI) bool {
+ if !b.eBM.Equals(&other.eBM) {
+ return false
+ }
+ for i := 0; i < len(b.bA) || i < len(other.bA); i++ {
+ if i >= len(b.bA) {
+ if !other.bA[i].IsEmpty() {
+ return false
+ }
+ } else if i >= len(other.bA) {
+ if !b.bA[i].IsEmpty() {
+ return false
+ }
+ } else {
+ if !b.bA[i].Equals(&other.bA[i]) {
+ return false
+ }
+ }
+ }
+ return true
+}
+
+// GetSizeInBytes - the size in bytes of the data structure
+func (b *BSI) GetSizeInBytes() int {
+ size := b.eBM.GetSizeInBytes()
+ for _, bm := range b.bA {
+ size += bm.GetSizeInBytes()
+ }
+ return int(size)
+}
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go
index 688f84d82..9e7f6b7f1 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go
@@ -9,6 +9,7 @@ import (
"strconv"
"github.com/RoaringBitmap/roaring"
+ "github.com/RoaringBitmap/roaring/internal"
)
const serialCookieNoRunContainer = 12346 // only arrays and bitmaps
@@ -61,7 +62,7 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
}
n += int64(written)
pos := 0
- keyBuf := make([]byte, 4)
+ keyBuf := buf[:4]
for pos < rb.highlowcontainer.size() {
c := rb.highlowcontainer.getContainerAtIndex(pos)
binary.LittleEndian.PutUint32(keyBuf, rb.highlowcontainer.getKeyAtIndex(pos))
@@ -80,37 +81,86 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
return n, nil
}
+// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
+// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
+// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
+func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) {
+ stream := internal.NewByteBuffer(data)
+ sizeBuf := make([]byte, 8)
+ n, err := stream.Read(sizeBuf)
+ if err != nil {
+ return 0, err
+ }
+ p += int64(n)
+ size := binary.LittleEndian.Uint64(sizeBuf)
+
+ rb.highlowcontainer.resize(0)
+ if cap(rb.highlowcontainer.keys) >= int(size) {
+ rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
+ } else {
+ rb.highlowcontainer.keys = make([]uint32, size)
+ }
+ if cap(rb.highlowcontainer.containers) >= int(size) {
+ rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
+ } else {
+ rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
+ }
+ if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
+ rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
+ } else {
+ rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
+ }
+ for i := uint64(0); i < size; i++ {
+ keyBuf, err := stream.Next(4)
+ if err != nil {
+ return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err)
+ }
+ p += 4
+ rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf)
+ rb.highlowcontainer.containers[i] = roaring.NewBitmap()
+ n, err := rb.highlowcontainer.containers[i].ReadFrom(stream)
+ if n == 0 || err != nil {
+ return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err)
+ }
+ p += int64(n)
+ }
+
+ return p, nil
+}
+
// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other 64-bit RoaringBitmap
// implementations (Java, Go, C++) and it has a specification :
// https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations
func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) {
- cookie, r32, p, err := tryReadFromRoaring32(rb, stream)
- if err != nil {
- return p, err
- } else if r32 {
- return p, nil
- }
- // TODO: Add buffer interning as in base roaring package.
-
- sizeBuf := make([]byte, 4)
+ sizeBuf := make([]byte, 8)
var n int
- n, err = stream.Read(sizeBuf)
- if n == 0 || err != nil {
- return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read number of containers: %s", err)
+ n, err = io.ReadFull(stream, sizeBuf)
+ if err != nil {
+ return int64(n), err
}
p += int64(n)
- sizeBuf = append(cookie, sizeBuf...)
-
size := binary.LittleEndian.Uint64(sizeBuf)
- rb.highlowcontainer = roaringArray64{}
- rb.highlowcontainer.keys = make([]uint32, size)
- rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
- rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
- keyBuf := make([]byte, 4)
+ rb.highlowcontainer.resize(0)
+ if cap(rb.highlowcontainer.keys) >= int(size) {
+ rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
+ } else {
+ rb.highlowcontainer.keys = make([]uint32, size)
+ }
+ if cap(rb.highlowcontainer.containers) >= int(size) {
+ rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
+ } else {
+ rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
+ }
+ if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
+ rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
+ } else {
+ rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
+ }
+ keyBuf := sizeBuf[:4]
for i := uint64(0); i < size; i++ {
- n, err = stream.Read(keyBuf)
- if n == 0 || err != nil {
+ n, err = io.ReadFull(stream, keyBuf)
+ if err != nil {
return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read key #%d: %s", i, err)
}
p += int64(n)
@@ -126,30 +176,6 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) {
return p, nil
}
-func tryReadFromRoaring32(rb *Bitmap, stream io.Reader) (cookie []byte, r32 bool, p int64, err error) {
- // Verify the first two bytes are a valid MagicNumber.
- cookie = make([]byte, 4)
- size, err := stream.Read(cookie)
- if err != nil {
- return cookie, false, int64(size), err
- }
- fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2]))
- if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie {
- bm32 := roaring.NewBitmap()
- p, err = bm32.ReadFrom(stream, cookie...)
- if err != nil {
- return
- }
- rb.highlowcontainer = roaringArray64{
- keys: []uint32{0},
- containers: []*roaring.Bitmap{bm32},
- needCopyOnWrite: []bool{false},
- }
- return cookie, true, p, nil
- }
- return
-}
-
// FromBuffer creates a bitmap from its serialized version stored in buffer
// func (rb *Bitmap) FromBuffer(data []byte) (p int64, err error) {
//
@@ -298,12 +324,8 @@ func (rb *Bitmap) ContainsInt(x int) bool {
}
// Equals returns true if the two bitmaps contain the same integers
-func (rb *Bitmap) Equals(o interface{}) bool {
- srb, ok := o.(*Bitmap)
- if ok {
- return srb.highlowcontainer.equals(rb.highlowcontainer)
- }
- return false
+func (rb *Bitmap) Equals(srb *Bitmap) bool {
+ return srb.highlowcontainer.equals(rb.highlowcontainer)
}
// Add the integer x to the bitmap
@@ -1228,3 +1250,14 @@ func (rb *Bitmap) Stats() roaring.Statistics {
func (rb *Bitmap) GetSerializedSizeInBytes() uint64 {
return rb.highlowcontainer.serializedSizeInBytes()
}
+
+// Roaring32AsRoaring64 inserts a 32-bit roaring bitmap into
+// a 64-bit roaring bitmap. No copy is made.
+func Roaring32AsRoaring64(bm32 *roaring.Bitmap) *Bitmap {
+ rb := NewBitmap()
+ rb.highlowcontainer.resize(0)
+ rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0)
+ rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32)
+ rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false)
+ return rb
+}
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go
index 9e3b34bc8..26aabd722 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go
+++ b/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go
@@ -1,6 +1,8 @@
package roaring64
-import "github.com/RoaringBitmap/roaring"
+import (
+ "github.com/RoaringBitmap/roaring"
+)
type roaringArray64 struct {
keys []uint32
@@ -12,9 +14,10 @@ type roaringArray64 struct {
// runOptimize compresses the element containers to minimize space consumed.
// Q: how does this interact with copyOnWrite and needCopyOnWrite?
// A: since we aren't changing the logical content, just the representation,
-// we don't bother to check the needCopyOnWrite bits. We replace
-// (possibly all) elements of ra.containers in-place with space
-// optimized versions.
+//
+// we don't bother to check the needCopyOnWrite bits. We replace
+// (possibly all) elements of ra.containers in-place with space
+// optimized versions.
func (ra *roaringArray64) runOptimize() {
for i := range ra.containers {
ra.containers[i].RunOptimize()
@@ -39,7 +42,7 @@ func (ra *roaringArray64) appendCopy(sa roaringArray64, startingindex int) {
// since there is no copy-on-write, we need to clone the container (this is important)
ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].Clone(), copyonwrite)
} else {
- ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], copyonwrite)
+ ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].Clone(), copyonwrite)
if !sa.needsCopyOnWrite(startingindex) {
sa.setNeedsCopyOnWrite(startingindex)
}
@@ -121,6 +124,8 @@ func (ra *roaringArray64) removeIndexRange(begin, end int) {
func (ra *roaringArray64) resize(newsize int) {
for k := newsize; k < len(ra.containers); k++ {
+ ra.keys[k] = 0
+ ra.needCopyOnWrite[k] = false
ra.containers[k] = nil
}
diff --git a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go b/vendor/github.com/RoaringBitmap/roaring/roaringarray.go
index eeb3d3131..079195dda 100644
--- a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go
+++ b/vendor/github.com/RoaringBitmap/roaring/roaringarray.go
@@ -4,8 +4,9 @@ import (
"bytes"
"encoding/binary"
"fmt"
- "github.com/RoaringBitmap/roaring/internal"
"io"
+
+ "github.com/RoaringBitmap/roaring/internal"
)
type container interface {
@@ -112,9 +113,10 @@ func newRoaringArray() *roaringArray {
// runOptimize compresses the element containers to minimize space consumed.
// Q: how does this interact with copyOnWrite and needCopyOnWrite?
// A: since we aren't changing the logical content, just the representation,
-// we don't bother to check the needCopyOnWrite bits. We replace
-// (possibly all) elements of ra.containers in-place with space
-// optimized versions.
+//
+// we don't bother to check the needCopyOnWrite bits. We replace
+// (possibly all) elements of ra.containers in-place with space
+// optimized versions.
func (ra *roaringArray) runOptimize() {
for i := range ra.containers {
ra.containers[i] = ra.containers[i].toEfficientContainer()
@@ -465,9 +467,7 @@ func (ra *roaringArray) serializedSizeInBytes() uint64 {
return answer
}
-//
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
-//
func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
hasRun := ra.hasRunCompression()
isRunSizeInBytes := 0
@@ -544,15 +544,14 @@ func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
return n, nil
}
-//
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
-//
func (ra *roaringArray) toBytes() ([]byte, error) {
var buf bytes.Buffer
_, err := ra.writeTo(&buf)
return buf.Bytes(), err
}
+// Reads a serialized roaringArray from a byte slice.
func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte) (int64, error) {
var cookie uint32
var err error
@@ -567,6 +566,8 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
return stream.GetReadBytes(), fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: %s", err)
}
}
+ // If NextReturnsSafeSlice is false, then willNeedCopyOnWrite should be true
+ willNeedCopyOnWrite := !stream.NextReturnsSafeSlice()
var size uint32
var isRunBitmap []byte
@@ -631,7 +632,7 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
key := keycard[2*i]
card := int(keycard[2*i+1]) + 1
ra.keys[i] = key
- ra.needCopyOnWrite[i] = true
+ ra.needCopyOnWrite[i] = willNeedCopyOnWrite
if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 {
// run container
diff --git a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go b/vendor/github.com/RoaringBitmap/roaring/runcontainer.go
index 4ce48a294..7098ba28f 100644
--- a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go
+++ b/vendor/github.com/RoaringBitmap/roaring/runcontainer.go
@@ -47,6 +47,7 @@ import (
// runContainer16 does run-length encoding of sets of
// uint16 integers.
type runContainer16 struct {
+ // iv is a slice of sorted, non-overlapping, non-adjacent intervals.
iv []interval16
}
@@ -253,10 +254,8 @@ func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 {
}
-//
// newRunContainer16FromArray populates a new
// runContainer16 from the contents of arr.
-//
func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 {
// keep this in sync with newRunContainer16FromVals above
@@ -834,24 +833,23 @@ func (rc *runContainer16) numIntervals() int {
// If key is not already present, then whichInterval16 is
// set as follows:
//
-// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
-// last interval16 in rc.iv;
+// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
+// last interval16 in rc.iv;
//
-// b) whichInterval16 == -1 if key is before our first
-// interval16 in rc.iv;
+// b) whichInterval16 == -1 if key is before our first
+// interval16 in rc.iv;
//
-// c) whichInterval16 is set to the minimum index of rc.iv
-// which comes strictly before the key;
-// so rc.iv[whichInterval16].last < key,
-// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
-// (Note that whichInterval16+1 won't exist when
-// whichInterval16 is the last interval.)
+// c) whichInterval16 is set to the minimum index of rc.iv
+// which comes strictly before the key;
+// so rc.iv[whichInterval16].last < key,
+// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
+// (Note that whichInterval16+1 won't exist when
+// whichInterval16 is the last interval.)
//
// runContainer16.search always returns whichInterval16 < len(rc.iv).
//
// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there
// no upper bound.
-//
func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
n := int(len(rc.iv))
if n == 0 {
@@ -937,21 +935,20 @@ func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (w
// If key is not already present, then whichInterval16 is
// set as follows:
//
-// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
-// last interval16 in rc.iv;
+// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
+// last interval16 in rc.iv;
//
-// b) whichInterval16 == -1 if key is before our first
-// interval16 in rc.iv;
+// b) whichInterval16 == -1 if key is before our first
+// interval16 in rc.iv;
//
-// c) whichInterval16 is set to the minimum index of rc.iv
-// which comes strictly before the key;
-// so rc.iv[whichInterval16].last < key,
-// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
-// (Note that whichInterval16+1 won't exist when
-// whichInterval16 is the last interval.)
+// c) whichInterval16 is set to the minimum index of rc.iv
+// which comes strictly before the key;
+// so rc.iv[whichInterval16].last < key,
+// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
+// (Note that whichInterval16+1 won't exist when
+// whichInterval16 is the last interval.)
//
// runContainer16.search always returns whichInterval16 < len(rc.iv).
-//
func (rc *runContainer16) search(key int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
return rc.searchRange(key, 0, 0)
}
@@ -994,7 +991,6 @@ func newRunContainer16() *runContainer16 {
// newRunContainer16CopyIv creates a run container, initializing
// with a copy of the supplied iv slice.
-//
func newRunContainer16CopyIv(iv []interval16) *runContainer16 {
rc := &runContainer16{
iv: make([]interval16, len(iv)),
@@ -1011,7 +1007,6 @@ func (rc *runContainer16) Clone() *runContainer16 {
// newRunContainer16TakeOwnership returns a new runContainer16
// backed by the provided iv slice, which we will
// assume exclusive control over from now on.
-//
func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 {
rc := &runContainer16{
iv: iv,
@@ -2006,7 +2001,6 @@ func (rc *runContainer16) not(firstOfRange, endx int) container {
// Current routine is correct but
// makes 2 more passes through the arrays than should be
// strictly necessary. Measure both ways though--this may not matter.
-//
func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 {
if firstOfRange > endx {
@@ -2329,7 +2323,6 @@ func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16,
// the backing array, and then you write
// the answer at the beginning. What this
// trick does is minimize memory allocations.
-//
func (rc *runContainer16) lazyIOR(a container) container {
// not lazy at the moment
return rc.ior(a)
diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization.go b/vendor/github.com/RoaringBitmap/roaring/serialization.go
index 70e3bbcc5..dbfecc846 100644
--- a/vendor/github.com/RoaringBitmap/roaring/serialization.go
+++ b/vendor/github.com/RoaringBitmap/roaring/serialization.go
@@ -7,7 +7,6 @@ import (
// writeTo for runContainer16 follows this
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
-//
func (b *runContainer16) writeTo(stream io.Writer) (int, error) {
buf := make([]byte, 2+4*len(b.iv))
binary.LittleEndian.PutUint16(buf[0:], uint16(len(b.iv)))
diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go b/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go
index 2e4ea5954..6e3a5d554 100644
--- a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go
+++ b/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go
@@ -79,12 +79,12 @@ func (bc *bitmapContainer) asLittleEndianByteSlice() []byte {
// Deserialization code follows
-////
+// //
// These methods (byteSliceAsUint16Slice,...) do not make copies,
// they are pointer-based (unsafe). The caller is responsible to
// ensure that the input slice does not get garbage collected, deleted
// or modified while you hold the returned slince.
-////
+// //
func byteSliceAsUint16Slice(slice []byte) (result []uint16) { // here we create a new slice holder
if len(slice)%2 != 0 {
panic("Slice size should be divisible by 2")
@@ -295,7 +295,6 @@ func byteSliceAsBoolSlice(slice []byte) (result []bool) {
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
-//
func (rb *Bitmap) FrozenView(buf []byte) error {
return rb.highlowcontainer.frozenView(buf)
}
@@ -313,7 +312,7 @@ func (rb *Bitmap) FrozenView(buf []byte) error {
* uint8_t[num_containers]
* uint32_t
*
- * is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits)
+ * is a 4-byte value which is a bit union of frozenCookie (15 bits)
* and the number of containers (17 bits).
*
* stores number of elements for every container.
@@ -329,43 +328,50 @@ func (rb *Bitmap) FrozenView(buf []byte) error {
* All members have their native alignments during deserilization except ,
* which is not guaranteed to be aligned by 4 bytes.
*/
-const FROZEN_COOKIE = 13766
+const frozenCookie = 13766
var (
- FrozenBitmapInvalidCookie = errors.New("header does not contain the FROZEN_COOKIE")
- FrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported")
- FrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap")
- FrozenBitmapOverpopulated = errors.New("too many containers")
- FrozenBitmapUnexpectedData = errors.New("spurious data in input")
- FrozenBitmapInvalidTypecode = errors.New("unrecognized typecode")
- FrozenBitmapBufferTooSmall = errors.New("buffer too small")
+ // ErrFrozenBitmapInvalidCookie is returned when the header does not contain the frozenCookie.
+ ErrFrozenBitmapInvalidCookie = errors.New("header does not contain the frozenCookie")
+ // ErrFrozenBitmapBigEndian is returned when the header is big endian.
+ ErrFrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported")
+ // ErrFrozenBitmapIncomplete is returned when the buffer is too small to contain a frozen bitmap.
+ ErrFrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap")
+ // ErrFrozenBitmapOverpopulated is returned when the number of containers is too large.
+ ErrFrozenBitmapOverpopulated = errors.New("too many containers")
+ // ErrFrozenBitmapUnexpectedData is returned when the buffer contains unexpected data.
+ ErrFrozenBitmapUnexpectedData = errors.New("spurious data in input")
+ // ErrFrozenBitmapInvalidTypecode is returned when the typecode is invalid.
+ ErrFrozenBitmapInvalidTypecode = errors.New("unrecognized typecode")
+ // ErrFrozenBitmapBufferTooSmall is returned when the buffer is too small.
+ ErrFrozenBitmapBufferTooSmall = errors.New("buffer too small")
)
func (ra *roaringArray) frozenView(buf []byte) error {
if len(buf) < 4 {
- return FrozenBitmapIncomplete
+ return ErrFrozenBitmapIncomplete
}
headerBE := binary.BigEndian.Uint32(buf[len(buf)-4:])
- if headerBE&0x7fff == FROZEN_COOKIE {
- return FrozenBitmapBigEndian
+ if headerBE&0x7fff == frozenCookie {
+ return ErrFrozenBitmapBigEndian
}
header := binary.LittleEndian.Uint32(buf[len(buf)-4:])
buf = buf[:len(buf)-4]
- if header&0x7fff != FROZEN_COOKIE {
- return FrozenBitmapInvalidCookie
+ if header&0x7fff != frozenCookie {
+ return ErrFrozenBitmapInvalidCookie
}
nCont := int(header >> 15)
if nCont > (1 << 16) {
- return FrozenBitmapOverpopulated
+ return ErrFrozenBitmapOverpopulated
}
// 1 byte per type, 2 bytes per key, 2 bytes per count.
if len(buf) < 5*nCont {
- return FrozenBitmapIncomplete
+ return ErrFrozenBitmapIncomplete
}
types := buf[len(buf)-nCont:]
@@ -390,12 +396,12 @@ func (ra *roaringArray) frozenView(buf []byte) error {
nRun++
nRunEl += int(counts[i])
default:
- return FrozenBitmapInvalidTypecode
+ return ErrFrozenBitmapInvalidTypecode
}
}
if len(buf) < (1<<13)*nBitmap+4*nRunEl+2*nArrayEl {
- return FrozenBitmapIncomplete
+ return ErrFrozenBitmapIncomplete
}
bitsetsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBitmap])
@@ -408,15 +414,15 @@ func (ra *roaringArray) frozenView(buf []byte) error {
buf = buf[2*nArrayEl:]
if len(buf) != 0 {
- return FrozenBitmapUnexpectedData
+ return ErrFrozenBitmapUnexpectedData
}
var c container
- containersSz := int(unsafe.Sizeof(c))*nCont
- bitsetsSz := int(unsafe.Sizeof(bitmapContainer{}))*nBitmap
- arraysSz := int(unsafe.Sizeof(arrayContainer{}))*nArray
- runsSz := int(unsafe.Sizeof(runContainer16{}))*nRun
- needCOWSz := int(unsafe.Sizeof(true))*nCont
+ containersSz := int(unsafe.Sizeof(c)) * nCont
+ bitsetsSz := int(unsafe.Sizeof(bitmapContainer{})) * nBitmap
+ arraysSz := int(unsafe.Sizeof(arrayContainer{})) * nArray
+ runsSz := int(unsafe.Sizeof(runContainer16{})) * nRun
+ needCOWSz := int(unsafe.Sizeof(true)) * nCont
bitmapArenaSz := containersSz + bitsetsSz + arraysSz + runsSz + needCOWSz
bitmapArena := make([]byte, bitmapArenaSz)
@@ -475,9 +481,10 @@ func (ra *roaringArray) frozenView(buf []byte) error {
return nil
}
-func (bm *Bitmap) GetFrozenSizeInBytes() uint64 {
+// GetFrozenSizeInBytes returns the size in bytes of the frozen bitmap.
+func (rb *Bitmap) GetFrozenSizeInBytes() uint64 {
nBits, nArrayEl, nRunEl := uint64(0), uint64(0), uint64(0)
- for _, c := range bm.highlowcontainer.containers {
+ for _, c := range rb.highlowcontainer.containers {
switch v := c.(type) {
case *bitmapContainer:
nBits++
@@ -487,19 +494,21 @@ func (bm *Bitmap) GetFrozenSizeInBytes() uint64 {
nRunEl += uint64(len(v.iv))
}
}
- return 4 + 5*uint64(len(bm.highlowcontainer.containers)) +
+ return 4 + 5*uint64(len(rb.highlowcontainer.containers)) +
(nBits << 13) + 2*nArrayEl + 4*nRunEl
}
-func (bm *Bitmap) Freeze() ([]byte, error) {
- sz := bm.GetFrozenSizeInBytes()
+// Freeze serializes the bitmap in the CRoaring's frozen format.
+func (rb *Bitmap) Freeze() ([]byte, error) {
+ sz := rb.GetFrozenSizeInBytes()
buf := make([]byte, sz)
- _, err := bm.FreezeTo(buf)
+ _, err := rb.FreezeTo(buf)
return buf, err
}
-func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
- containers := bm.highlowcontainer.containers
+// FreezeTo serializes the bitmap in the CRoaring's frozen format.
+func (rb *Bitmap) FreezeTo(buf []byte) (int, error) {
+ containers := rb.highlowcontainer.containers
nCont := len(containers)
nBits, nArrayEl, nRunEl := 0, 0, 0
@@ -516,7 +525,7 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
serialSize := 4 + 5*nCont + (1<<13)*nBits + 4*nRunEl + 2*nArrayEl
if len(buf) < serialSize {
- return 0, FrozenBitmapBufferTooSmall
+ return 0, ErrFrozenBitmapBufferTooSmall
}
bitsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBits])
@@ -537,10 +546,10 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
types := buf[:nCont]
buf = buf[nCont:]
- header := uint32(FROZEN_COOKIE | (nCont << 15))
+ header := uint32(frozenCookie | (nCont << 15))
binary.LittleEndian.PutUint32(buf[:4], header)
- copy(keys, bm.highlowcontainer.keys[:])
+ copy(keys, rb.highlowcontainer.keys[:])
for i, c := range containers {
switch v := c.(type) {
@@ -567,11 +576,12 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
return serialSize, nil
}
-func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
+// WriteFrozenTo serializes the bitmap in the CRoaring's frozen format.
+func (rb *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
// FIXME: this is a naive version that iterates 4 times through the
// containers and allocates 3*len(containers) bytes; it's quite likely
// it can be done more efficiently.
- containers := bm.highlowcontainer.containers
+ containers := rb.highlowcontainer.containers
written := 0
for _, c := range containers {
@@ -610,7 +620,7 @@ func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
}
}
- n, err := wr.Write(uint16SliceAsByteSlice(bm.highlowcontainer.keys))
+ n, err := wr.Write(uint16SliceAsByteSlice(rb.highlowcontainer.keys))
written += n
if err != nil {
return written, err
@@ -642,7 +652,7 @@ func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
return written, err
}
- header := uint32(FROZEN_COOKIE | (len(containers) << 15))
+ header := uint32(frozenCookie | (len(containers) << 15))
if err := binary.Write(wr, binary.LittleEndian, header); err != nil {
return written, err
}
diff --git a/vendor/github.com/bits-and-blooms/bitset/README.md b/vendor/github.com/bits-and-blooms/bitset/README.md
index 97e83071e..848234e2f 100644
--- a/vendor/github.com/bits-and-blooms/bitset/README.md
+++ b/vendor/github.com/bits-and-blooms/bitset/README.md
@@ -7,6 +7,15 @@
[](https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc)
+This library is part of the [awesome go collection](https://github.com/avelino/awesome-go). It is used in production by several important systems:
+
+* [beego](https://github.com/beego/beego)
+* [CubeFS](https://github.com/cubefs/cubefs)
+* [Amazon EKS Distro](https://github.com/aws/eks-distro)
+* [sourcegraph](https://github.com/sourcegraph/sourcegraph)
+* [torrent](https://github.com/anacrolix/torrent)
+
+
## Description
Package bitset implements bitsets, a mapping between non-negative integers and boolean values.
@@ -60,19 +69,69 @@ func main() {
}
```
-As an alternative to BitSets, one should check out the 'big' package, which provides a (less set-theoretical) view of bitsets.
Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc
+## Serialization
+
+
+You may serialize a bitset safely and portably to a stream
+of bytes as follows:
+```Go
+ const length = 9585
+ const oneEvery = 97
+ bs := bitset.New(length)
+ // Add some bits
+ for i := uint(0); i < length; i += oneEvery {
+ bs = bs.Set(i)
+ }
+
+ var buf bytes.Buffer
+ n, err := bs.WriteTo(&buf)
+ if err != nil {
+ // failure
+ }
+ // Here n == buf.Len()
+```
+You can later deserialize the result as follows:
+
+```Go
+ // Read back from buf
+ bs = bitset.New()
+ n, err = bs.ReadFrom(&buf)
+ if err != nil {
+ // error
+ }
+ // n is the number of bytes read
+```
+
+The `ReadFrom` function attempts to read the data into the existing
+BitSet instance, to minimize memory allocations.
+
+
+*Performance tip*:
+When reading and writing to a file or a network connection, you may get better performance by
+wrapping your streams with `bufio` instances.
+
+E.g.,
+```Go
+ f, err := os.Create("myfile")
+ w := bufio.NewWriter(f)
+```
+```Go
+ f, err := os.Open("myfile")
+ r := bufio.NewReader(f)
+```
+
## Memory Usage
-The memory usage of a bitset using N bits is at least N/8 bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
+The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
## Implementation Note
Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, which might be removed.
-It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `unit64`). If so, the version will be bumped.
+It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped.
## Installation
diff --git a/vendor/github.com/bits-and-blooms/bitset/SECURITY.md b/vendor/github.com/bits-and-blooms/bitset/SECURITY.md
new file mode 100644
index 000000000..f888420c3
--- /dev/null
+++ b/vendor/github.com/bits-and-blooms/bitset/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+You can report privately a vulnerability by email at daniel@lemire.me (current maintainer).
diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset.go b/vendor/github.com/bits-and-blooms/bitset/bitset.go
index 271ccdbd0..8fb9e9fa2 100644
--- a/vendor/github.com/bits-and-blooms/bitset/bitset.go
+++ b/vendor/github.com/bits-and-blooms/bitset/bitset.go
@@ -33,12 +33,10 @@ Example use:
As an alternative to BitSets, one should check out the 'big' package,
which provides a (less set-theoretical) view of bitsets.
-
*/
package bitset
import (
- "bufio"
"bytes"
"encoding/base64"
"encoding/binary"
@@ -52,6 +50,9 @@ import (
// the wordSize of a bit set
const wordSize = uint(64)
+// the wordSize of a bit set in bytes
+const wordBytes = wordSize / 8
+
// log2WordSize is lg(wordSize)
const log2WordSize = uint(6)
@@ -87,9 +88,20 @@ func (b *BitSet) safeSet() []uint64 {
return b.set
}
+// SetBitsetFrom fills the bitset with an array of integers without creating a new BitSet instance
+func (b *BitSet) SetBitsetFrom(buf []uint64) {
+ b.length = uint(len(buf)) * 64
+ b.set = buf
+}
+
// From is a constructor used to create a BitSet from an array of integers
func From(buf []uint64) *BitSet {
- return &BitSet{uint(len(buf)) * 64, buf}
+ return FromWithLength(uint(len(buf))*64, buf)
+}
+
+// FromWithLength constructs from an array of integers and length.
+func FromWithLength(len uint, set []uint64) *BitSet {
+ return &BitSet{len, set}
}
// Bytes returns the bitset as array of integers
@@ -105,6 +117,17 @@ func wordsNeeded(i uint) int {
return int((i + (wordSize - 1)) >> log2WordSize)
}
+// wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity.
+// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap).
+func wordsNeededUnbound(i uint) int {
+ return int((i + (wordSize - 1)) >> log2WordSize)
+}
+
+// wordsIndex calculates the index of words in a `uint64`
+func wordsIndex(i uint) uint {
+ return i & (wordSize - 1)
+}
+
// New creates a new BitSet with a hint that length bits will be required
func New(length uint) (bset *BitSet) {
defer func() {
@@ -135,24 +158,22 @@ func (b *BitSet) Len() uint {
return b.length
}
-// extendSetMaybe adds additional words to incorporate new bits if needed
-func (b *BitSet) extendSetMaybe(i uint) {
- if i >= b.length { // if we need more bits, make 'em
- if i >= Cap() {
- panic("You are exceeding the capacity")
- }
- nsize := wordsNeeded(i + 1)
- if b.set == nil {
- b.set = make([]uint64, nsize)
- } else if cap(b.set) >= nsize {
- b.set = b.set[:nsize] // fast resize
- } else if len(b.set) < nsize {
- newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
- copy(newset, b.set)
- b.set = newset
- }
- b.length = i + 1
+// extendSet adds additional words to incorporate new bits if needed
+func (b *BitSet) extendSet(i uint) {
+ if i >= Cap() {
+ panic("You are exceeding the capacity")
}
+ nsize := wordsNeeded(i + 1)
+ if b.set == nil {
+ b.set = make([]uint64, nsize)
+ } else if cap(b.set) >= nsize {
+ b.set = b.set[:nsize] // fast resize
+ } else if len(b.set) < nsize {
+ newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
+ copy(newset, b.set)
+ b.set = newset
+ }
+ b.length = i + 1
}
// Test whether bit i is set.
@@ -160,7 +181,7 @@ func (b *BitSet) Test(i uint) bool {
if i >= b.length {
return false
}
- return b.set[i>>log2WordSize]&(1<<(i&(wordSize-1))) != 0
+ return b.set[i>>log2WordSize]&(1<>log2WordSize] |= 1 << (i & (wordSize - 1))
+ if i >= b.length { // if we need more bits, make 'em
+ b.extendSet(i)
+ }
+ b.set[i>>log2WordSize] |= 1 << wordsIndex(i)
return b
}
@@ -180,7 +203,7 @@ func (b *BitSet) Clear(i uint) *BitSet {
if i >= b.length {
return b
}
- b.set[i>>log2WordSize] &^= 1 << (i & (wordSize - 1))
+ b.set[i>>log2WordSize] &^= 1 << wordsIndex(i)
return b
}
@@ -205,7 +228,7 @@ func (b *BitSet) Flip(i uint) *BitSet {
if i >= b.length {
return b.Set(i)
}
- b.set[i>>log2WordSize] ^= 1 << (i & (wordSize - 1))
+ b.set[i>>log2WordSize] ^= 1 << wordsIndex(i)
return b
}
@@ -218,15 +241,23 @@ func (b *BitSet) FlipRange(start, end uint) *BitSet {
if start >= end {
return b
}
-
- b.extendSetMaybe(end - 1)
+ if end-1 >= b.length { // if we need more bits, make 'em
+ b.extendSet(end - 1)
+ }
var startWord uint = start >> log2WordSize
var endWord uint = end >> log2WordSize
- b.set[startWord] ^= ^(^uint64(0) << (start & (wordSize - 1)))
- for i := startWord; i < endWord; i++ {
- b.set[i] = ^b.set[i]
+ b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start))
+ if endWord > 0 {
+ // bounds check elimination
+ data := b.set
+ _ = data[endWord-1]
+ for i := startWord; i < endWord; i++ {
+ data[i] = ^data[i]
+ }
+ }
+ if end&(wordSize-1) != 0 {
+ b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end)
}
- b.set[endWord] ^= ^uint64(0) >> (-end & (wordSize - 1))
return b
}
@@ -254,9 +285,10 @@ func (b *BitSet) Shrink(lastbitindex uint) *BitSet {
copy(shrunk, b.set[:idx])
b.set = shrunk
b.length = length
- if length < 64 {
- b.set[idx-1] &= (allBits >> (uint64(64) - uint64(length&(wordSize-1))))
- }
+ lastWordUsedBits := length % 64
+ if lastWordUsedBits != 0 {
+ b.set[idx-1] &= allBits >> uint64(64-wordsIndex(lastWordUsedBits))
+ }
return b
}
@@ -285,7 +317,7 @@ func (b *BitSet) Compact() *BitSet {
// this method could be extremely slow and in some cases might cause the entire BitSet
// to be recopied.
func (b *BitSet) InsertAt(idx uint) *BitSet {
- insertAtElement := (idx >> log2WordSize)
+ insertAtElement := idx >> log2WordSize
// if length of set is a multiple of wordSize we need to allocate more space first
if b.isLenExactMultiple() {
@@ -304,13 +336,13 @@ func (b *BitSet) InsertAt(idx uint) *BitSet {
// generate a mask to extract the data that we need to shift left
// within the element where we insert a bit
- dataMask := ^(uint64(1)<> (i & (wordSize - 1))
+ w = w >> wordsIndex(i)
if w != 0 {
return i + trailingZeroes64(w), true
}
- x = x + 1
+ x++
+ // bounds check elimination in the loop
+ if x < 0 {
+ return 0, false
+ }
for x < len(b.set) {
if b.set[x] != 0 {
return uint(x)*wordSize + trailingZeroes64(b.set[x]), true
}
- x = x + 1
+ x++
}
return 0, false
@@ -415,21 +451,20 @@ func (b *BitSet) NextSet(i uint) (uint, bool) {
// including possibly the current index and up to cap(buffer).
// If the returned slice has len zero, then no more set bits were found
//
-// buffer := make([]uint, 256) // this should be reused
-// j := uint(0)
-// j, buffer = bitmap.NextSetMany(j, buffer)
-// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
-// for k := range buffer {
-// do something with buffer[k]
-// }
-// j += 1
-// }
-//
+// buffer := make([]uint, 256) // this should be reused
+// j := uint(0)
+// j, buffer = bitmap.NextSetMany(j, buffer)
+// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
+// for k := range buffer {
+// do something with buffer[k]
+// }
+// j += 1
+// }
//
// It is possible to retrieve all set bits as follow:
//
-// indices := make([]uint, bitmap.Count())
-// bitmap.NextSetMany(0, indices)
+// indices := make([]uint, bitmap.Count())
+// bitmap.NextSetMany(0, indices)
//
// However if bitmap.Count() is large, it might be preferable to
// use several calls to NextSetMany, for performance reasons.
@@ -440,7 +475,7 @@ func (b *BitSet) NextSetMany(i uint, buffer []uint) (uint, []uint) {
if x >= len(b.set) || capacity == 0 {
return 0, myanswer[:0]
}
- skip := i & (wordSize - 1)
+ skip := wordsIndex(i)
word := b.set[x] >> skip
myanswer = myanswer[:capacity]
size := int(0)
@@ -483,17 +518,23 @@ func (b *BitSet) NextClear(i uint) (uint, bool) {
return 0, false
}
w := b.set[x]
- w = w >> (i & (wordSize - 1))
- wA := allBits >> (i & (wordSize - 1))
+ w = w >> wordsIndex(i)
+ wA := allBits >> wordsIndex(i)
index := i + trailingZeroes64(^w)
if w != wA && index < b.length {
return index, true
}
x++
+ // bounds check elimination in the loop
+ if x < 0 {
+ return 0, false
+ }
for x < len(b.set) {
- index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
- if b.set[x] != allBits && index < b.length {
- return index, true
+ if b.set[x] != allBits {
+ index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
+ if index < b.length {
+ return index, true
+ }
}
x++
}
@@ -512,7 +553,7 @@ func (b *BitSet) ClearAll() *BitSet {
// wordCount returns the number of words used in a bit set
func (b *BitSet) wordCount() int {
- return len(b.set)
+ return wordsNeededUnbound(b.length)
}
// Clone this BitSet
@@ -524,9 +565,10 @@ func (b *BitSet) Clone() *BitSet {
return c
}
-// Copy into a destination BitSet
-// Returning the size of the destination BitSet
-// like array copy
+// Copy into a destination BitSet using the Go array copy semantics:
+// the number of bits copied is the minimum of the number of bits in the current
+// BitSet (Len()) and the destination Bitset.
+// We return the number of bits copied in the destination BitSet.
func (b *BitSet) Copy(c *BitSet) (count uint) {
if c == nil {
return
@@ -538,9 +580,33 @@ func (b *BitSet) Copy(c *BitSet) (count uint) {
if b.length < c.length {
count = b.length
}
+ // Cleaning the last word is needed to keep the invariant that other functions, such as Count, require
+ // that any bits in the last word that would exceed the length of the bitmask are set to 0.
+ c.cleanLastWord()
return
}
+// CopyFull copies into a destination BitSet such that the destination is
+// identical to the source after the operation, allocating memory if necessary.
+func (b *BitSet) CopyFull(c *BitSet) {
+ if c == nil {
+ return
+ }
+ c.length = b.length
+ if len(b.set) == 0 {
+ if c.set != nil {
+ c.set = c.set[:0]
+ }
+ } else {
+ if cap(c.set) < len(b.set) {
+ c.set = make([]uint64, len(b.set))
+ } else {
+ c.set = c.set[:len(b.set)]
+ }
+ copy(c.set, b.set)
+ }
+}
+
// Count (number of set bits).
// Also known as "popcount" or "population count".
func (b *BitSet) Count() uint {
@@ -563,10 +629,15 @@ func (b *BitSet) Equal(c *BitSet) bool {
if b.length == 0 { // if they have both length == 0, then could have nil set
return true
}
- // testing for equality shoud not transform the bitset (no call to safeSet)
-
- for p, v := range b.set {
- if c.set[p] != v {
+ wn := b.wordCount()
+ // bounds check elimination
+ if wn <= 0 {
+ return true
+ }
+ _ = b.set[wn-1]
+ _ = c.set[wn-1]
+ for p := 0; p < wn; p++ {
+ if c.set[p] != b.set[p] {
return false
}
}
@@ -585,9 +656,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
panicIfNull(b)
panicIfNull(compare)
result = b.Clone() // clone b (in case b is bigger than compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
for i := 0; i < l; i++ {
result.set[i] = b.set[i] &^ compare.set[i]
@@ -599,9 +670,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
panicIfNull(b)
panicIfNull(compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
cnt := uint64(0)
cnt += popcntMaskSlice(b.set[:l], compare.set[:l])
@@ -614,12 +685,19 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
func (b *BitSet) InPlaceDifference(compare *BitSet) {
panicIfNull(b)
panicIfNull(compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
+ if l <= 0 {
+ return
+ }
+ // bounds check elimination
+ data, cmpData := b.set, compare.set
+ _ = data[l-1]
+ _ = cmpData[l-1]
for i := 0; i < l; i++ {
- b.set[i] &^= compare.set[i]
+ data[i] &^= cmpData[i]
}
}
@@ -662,18 +740,29 @@ func (b *BitSet) IntersectionCardinality(compare *BitSet) uint {
func (b *BitSet) InPlaceIntersection(compare *BitSet) {
panicIfNull(b)
panicIfNull(compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
- for i := 0; i < l; i++ {
- b.set[i] &= compare.set[i]
+ if l > 0 {
+ // bounds check elimination
+ data, cmpData := b.set, compare.set
+ _ = data[l-1]
+ _ = cmpData[l-1]
+
+ for i := 0; i < l; i++ {
+ data[i] &= cmpData[i]
+ }
}
- for i := l; i < len(b.set); i++ {
- b.set[i] = 0
+ if l >= 0 {
+ for i := l; i < len(b.set); i++ {
+ b.set[i] = 0
+ }
}
if compare.length > 0 {
- b.extendSetMaybe(compare.length - 1)
+ if compare.length-1 >= b.length {
+ b.extendSet(compare.length - 1)
+ }
}
}
@@ -708,15 +797,22 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint {
func (b *BitSet) InPlaceUnion(compare *BitSet) {
panicIfNull(b)
panicIfNull(compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
- if compare.length > 0 {
- b.extendSetMaybe(compare.length - 1)
+ if compare.length > 0 && compare.length-1 >= b.length {
+ b.extendSet(compare.length - 1)
}
- for i := 0; i < l; i++ {
- b.set[i] |= compare.set[i]
+ if l > 0 {
+ // bounds check elimination
+ data, cmpData := b.set, compare.set
+ _ = data[l-1]
+ _ = cmpData[l-1]
+
+ for i := 0; i < l; i++ {
+ data[i] |= cmpData[i]
+ }
}
if len(compare.set) > l {
for i := l; i < len(compare.set); i++ {
@@ -756,15 +852,21 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint {
func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
panicIfNull(b)
panicIfNull(compare)
- l := int(compare.wordCount())
- if l > int(b.wordCount()) {
- l = int(b.wordCount())
+ l := compare.wordCount()
+ if l > b.wordCount() {
+ l = b.wordCount()
}
- if compare.length > 0 {
- b.extendSetMaybe(compare.length - 1)
+ if compare.length > 0 && compare.length-1 >= b.length {
+ b.extendSet(compare.length - 1)
}
- for i := 0; i < l; i++ {
- b.set[i] ^= compare.set[i]
+ if l > 0 {
+ // bounds check elimination
+ data, cmpData := b.set, compare.set
+ _ = data[l-1]
+ _ = cmpData[l-1]
+ for i := 0; i < l; i++ {
+ data[i] ^= cmpData[i]
+ }
}
if len(compare.set) > l {
for i := l; i < len(compare.set); i++ {
@@ -775,17 +877,17 @@ func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
// Is the length an exact multiple of word sizes?
func (b *BitSet) isLenExactMultiple() bool {
- return b.length%wordSize == 0
+ return wordsIndex(b.length) == 0
}
// Clean last word by setting unused bits to 0
func (b *BitSet) cleanLastWord() {
if !b.isLenExactMultiple() {
- b.set[len(b.set)-1] &= allBits >> (wordSize - b.length%wordSize)
+ b.set[len(b.set)-1] &= allBits >> (wordSize - wordsIndex(b.length))
}
}
-// Complement computes the (local) complement of a biset (up to length bits)
+// Complement computes the (local) complement of a bitset (up to length bits)
func (b *BitSet) Complement() (result *BitSet) {
panicIfNull(b)
result = New(b.length)
@@ -813,7 +915,6 @@ func (b *BitSet) None() bool {
return false
}
}
- return true
}
return true
}
@@ -826,12 +927,16 @@ func (b *BitSet) Any() bool {
// IsSuperSet returns true if this is a superset of the other set
func (b *BitSet) IsSuperSet(other *BitSet) bool {
- for i, e := other.NextSet(0); e; i, e = other.NextSet(i + 1) {
- if !b.Test(i) {
+ l := other.wordCount()
+ if b.wordCount() < l {
+ l = b.wordCount()
+ }
+ for i, word := range other.set[:l] {
+ if b.set[i]&word != word {
return false
}
}
- return true
+ return popcntSlice(other.set[l:]) == 0
}
// IsStrictSuperSet returns true if this is a strict superset of the other set
@@ -852,78 +957,156 @@ func (b *BitSet) DumpAsBits() string {
return buffer.String()
}
-// BinaryStorageSize returns the binary storage requirements
+// BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes.
func (b *BitSet) BinaryStorageSize() int {
- return binary.Size(uint64(0)) + binary.Size(b.set)
+ return int(wordBytes + wordBytes*uint(b.wordCount()))
}
-// WriteTo writes a BitSet to a stream
+func readUint64Array(reader io.Reader, data []uint64) error {
+ length := len(data)
+ bufferSize := 128
+ buffer := make([]byte, bufferSize*int(wordBytes))
+ for i := 0; i < length; i += bufferSize {
+ end := i + bufferSize
+ if end > length {
+ end = length
+ buffer = buffer[:wordBytes*uint(end-i)]
+ }
+ chunk := data[i:end]
+ if _, err := io.ReadFull(reader, buffer); err != nil {
+ return err
+ }
+ for i := range chunk {
+ chunk[i] = uint64(binaryOrder.Uint64(buffer[8*i:]))
+ }
+ }
+ return nil
+}
+
+func writeUint64Array(writer io.Writer, data []uint64) error {
+ bufferSize := 128
+ buffer := make([]byte, bufferSize*int(wordBytes))
+ for i := 0; i < len(data); i += bufferSize {
+ end := i + bufferSize
+ if end > len(data) {
+ end = len(data)
+ buffer = buffer[:wordBytes*uint(end-i)]
+ }
+ chunk := data[i:end]
+ for i, x := range chunk {
+ binaryOrder.PutUint64(buffer[8*i:], x)
+ }
+ _, err := writer.Write(buffer)
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// WriteTo writes a BitSet to a stream. The format is:
+// 1. uint64 length
+// 2. []uint64 set
+// Upon success, the number of bytes written is returned.
+//
+// Performance: if this function is used to write to a disk or network
+// connection, it might be beneficial to wrap the stream in a bufio.Writer.
+// E.g.,
+//
+// f, err := os.Create("myfile")
+// w := bufio.NewWriter(f)
func (b *BitSet) WriteTo(stream io.Writer) (int64, error) {
length := uint64(b.length)
-
// Write length
- err := binary.Write(stream, binaryOrder, length)
+ err := binary.Write(stream, binaryOrder, &length)
if err != nil {
- return 0, err
+ // Upon failure, we do not guarantee that we
+ // return the number of bytes written.
+ return int64(0), err
}
-
- // Write set
- err = binary.Write(stream, binaryOrder, b.set)
- return int64(b.BinaryStorageSize()), err
+ err = writeUint64Array(stream, b.set[:b.wordCount()])
+ if err != nil {
+ // Upon failure, we do not guarantee that we
+ // return the number of bytes written.
+ return int64(wordBytes), err
+ }
+ return int64(b.BinaryStorageSize()), nil
}
// ReadFrom reads a BitSet from a stream written using WriteTo
+// The format is:
+// 1. uint64 length
+// 2. []uint64 set
+// Upon success, the number of bytes read is returned.
+// If the current BitSet is not large enough to hold the data,
+// it is extended. In case of error, the BitSet is either
+// left unchanged or made empty if the error occurs too late
+// to preserve the content.
+//
+// Performance: if this function is used to read from a disk or network
+// connection, it might be beneficial to wrap the stream in a bufio.Reader.
+// E.g.,
+//
+// f, err := os.Open("myfile")
+// r := bufio.NewReader(f)
func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) {
var length uint64
-
- // Read length first
err := binary.Read(stream, binaryOrder, &length)
if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
return 0, err
}
- newset := New(uint(length))
+ newlength := uint(length)
- if uint64(newset.length) != length {
+ if uint64(newlength) != length {
return 0, errors.New("unmarshalling error: type mismatch")
}
+ nWords := wordsNeeded(uint(newlength))
+ if cap(b.set) >= nWords {
+ b.set = b.set[:nWords]
+ } else {
+ b.set = make([]uint64, nWords)
+ }
- // Read remaining bytes as set
- err = binary.Read(stream, binaryOrder, newset.set)
+ b.length = newlength
+
+ err = readUint64Array(stream, b.set)
if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
+ // We do not want to leave the BitSet partially filled as
+ // it is error prone.
+ b.set = b.set[:0]
+ b.length = 0
return 0, err
}
- *b = *newset
return int64(b.BinaryStorageSize()), nil
}
// MarshalBinary encodes a BitSet into a binary form and returns the result.
func (b *BitSet) MarshalBinary() ([]byte, error) {
var buf bytes.Buffer
- writer := bufio.NewWriter(&buf)
-
- _, err := b.WriteTo(writer)
+ _, err := b.WriteTo(&buf)
if err != nil {
return []byte{}, err
}
- err = writer.Flush()
-
return buf.Bytes(), err
}
// UnmarshalBinary decodes the binary form generated by MarshalBinary.
func (b *BitSet) UnmarshalBinary(data []byte) error {
buf := bytes.NewReader(data)
- reader := bufio.NewReader(buf)
-
- _, err := b.ReadFrom(reader)
-
+ _, err := b.ReadFrom(buf)
return err
}
// MarshalJSON marshals a BitSet as a JSON structure
-func (b *BitSet) MarshalJSON() ([]byte, error) {
+func (b BitSet) MarshalJSON() ([]byte, error) {
buffer := bytes.NewBuffer(make([]byte, 0, b.BinaryStorageSize()))
_, err := b.WriteTo(buffer)
if err != nil {
diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go
index fc8ff4f36..7855c04b5 100644
--- a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go
+++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go
@@ -1,3 +1,4 @@
+//go:build go1.9
// +build go1.9
package bitset
@@ -14,6 +15,10 @@ func popcntSlice(s []uint64) uint64 {
func popcntMaskSlice(s, m []uint64) uint64 {
var cnt int
+ // this explicit check eliminates a bounds check in the loop
+ if len(m) < len(s) {
+ panic("mask slice is too short")
+ }
for i := range s {
cnt += bits.OnesCount64(s[i] &^ m[i])
}
@@ -22,6 +27,10 @@ func popcntMaskSlice(s, m []uint64) uint64 {
func popcntAndSlice(s, m []uint64) uint64 {
var cnt int
+ // this explicit check eliminates a bounds check in the loop
+ if len(m) < len(s) {
+ panic("mask slice is too short")
+ }
for i := range s {
cnt += bits.OnesCount64(s[i] & m[i])
}
@@ -30,6 +39,10 @@ func popcntAndSlice(s, m []uint64) uint64 {
func popcntOrSlice(s, m []uint64) uint64 {
var cnt int
+ // this explicit check eliminates a bounds check in the loop
+ if len(m) < len(s) {
+ panic("mask slice is too short")
+ }
for i := range s {
cnt += bits.OnesCount64(s[i] | m[i])
}
@@ -38,6 +51,10 @@ func popcntOrSlice(s, m []uint64) uint64 {
func popcntXorSlice(s, m []uint64) uint64 {
var cnt int
+ // this explicit check eliminates a bounds check in the loop
+ if len(m) < len(s) {
+ panic("mask slice is too short")
+ }
for i := range s {
cnt += bits.OnesCount64(s[i] ^ m[i])
}
diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go
index 4cf64f24a..116e04440 100644
--- a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go
+++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go
@@ -1,5 +1,5 @@
-// +build !go1.9
-// +build amd64,!appengine
+//go:build !go1.9 && amd64 && !appengine
+// +build !go1.9,amd64,!appengine
package bitset
diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go
index 21e0ff7b4..9e0ad464e 100644
--- a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go
+++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go
@@ -1,3 +1,4 @@
+//go:build !go1.9 && (!amd64 || appengine)
// +build !go1.9
// +build !amd64 appengine
diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go
index c52b61be9..12336e76a 100644
--- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go
+++ b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go
@@ -1,3 +1,4 @@
+//go:build !go1.9
// +build !go1.9
package bitset
diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go
index 36a988e71..cfb0a8409 100644
--- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go
+++ b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go
@@ -1,3 +1,4 @@
+//go:build go1.9
// +build go1.9
package bitset
diff --git a/vendor/github.com/blevesearch/bleve/v2/README.md b/vendor/github.com/blevesearch/bleve/v2/README.md
index dbe5b7898..80499ec53 100644
--- a/vendor/github.com/blevesearch/bleve/v2/README.md
+++ b/vendor/github.com/blevesearch/bleve/v2/README.md
@@ -9,21 +9,21 @@
[](https://sourcegraph.com/github.com/blevesearch/bleve?badge)
[](https://opensource.org/licenses/Apache-2.0)
-A modern text indexing library in go
+A modern indexing library in GO
## Features
* Index any go data structure (including JSON)
* Intelligent defaults backed up by powerful configuration
* Supported field types:
- * Text, Numeric, Datetime, Boolean
+ * `text`, `number`, `datetime`, `boolean`, `geopoint`, `geoshape`, `IP`, `vector`
* Supported query types:
* Term, Phrase, Match, Match Phrase, Prefix, Fuzzy
- * Conjunction, Disjunction, Boolean (must/should/must_not)
+ * Conjunction, Disjunction, Boolean (`must`/`should`/`must_not`)
* Term Range, Numeric Range, Date Range
* [Geo Spatial](https://github.com/blevesearch/bleve/blob/master/geo/README.md)
* Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/)
- * [Vector Search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
+ * Approximate k-nearest neighbors over [vectors](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
* [tf-idf](https://en.wikipedia.org/wiki/Tf-idf) Scoring
* Query time boosting
* Search result match highlighting with document fragments
diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go
new file mode 100644
index 000000000..a0e2c9495
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go
@@ -0,0 +1,52 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package microseconds
+
+import (
+ "math"
+ "strconv"
+ "time"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const Name = "unix_micro"
+
+type DateTimeParser struct {
+}
+
+var minBound int64 = math.MinInt64 / 1000
+var maxBound int64 = math.MaxInt64 / 1000
+
+func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
+	// unix timestamp is microseconds since UNIX epoch
+ timestamp, err := strconv.ParseInt(input, 10, 64)
+ if err != nil {
+ return time.Time{}, "", analysis.ErrInvalidTimestampString
+ }
+ if timestamp < minBound || timestamp > maxBound {
+ return time.Time{}, "", analysis.ErrInvalidTimestampRange
+ }
+ return time.UnixMicro(timestamp), Name, nil
+}
+
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+ return &DateTimeParser{}, nil
+}
+
+func init() {
+ registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go
new file mode 100644
index 000000000..63826b451
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go
@@ -0,0 +1,52 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package milliseconds
+
+import (
+ "math"
+ "strconv"
+ "time"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const Name = "unix_milli"
+
+type DateTimeParser struct {
+}
+
+var minBound int64 = math.MinInt64 / 1000000
+var maxBound int64 = math.MaxInt64 / 1000000
+
+func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
+ // unix timestamp is milliseconds since UNIX epoch
+ timestamp, err := strconv.ParseInt(input, 10, 64)
+ if err != nil {
+ return time.Time{}, "", analysis.ErrInvalidTimestampString
+ }
+ if timestamp < minBound || timestamp > maxBound {
+ return time.Time{}, "", analysis.ErrInvalidTimestampRange
+ }
+ return time.UnixMilli(timestamp), Name, nil
+}
+
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+ return &DateTimeParser{}, nil
+}
+
+func init() {
+ registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go
new file mode 100644
index 000000000..8bb1ab1b6
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go
@@ -0,0 +1,52 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nanoseconds
+
+import (
+ "math"
+ "strconv"
+ "time"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const Name = "unix_nano"
+
+type DateTimeParser struct {
+}
+
+var minBound int64 = math.MinInt64
+var maxBound int64 = math.MaxInt64
+
+func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
+	// unix timestamp is nanoseconds since UNIX epoch
+ timestamp, err := strconv.ParseInt(input, 10, 64)
+ if err != nil {
+ return time.Time{}, "", analysis.ErrInvalidTimestampString
+ }
+ if timestamp < minBound || timestamp > maxBound {
+ return time.Time{}, "", analysis.ErrInvalidTimestampRange
+ }
+ return time.Unix(0, timestamp), Name, nil
+}
+
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+ return &DateTimeParser{}, nil
+}
+
+func init() {
+ registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go
new file mode 100644
index 000000000..58e947c80
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go
@@ -0,0 +1,52 @@
+// Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seconds
+
+import (
+ "math"
+ "strconv"
+ "time"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const Name = "unix_sec"
+
+type DateTimeParser struct {
+}
+
+var minBound int64 = math.MinInt64 / 1000000000
+var maxBound int64 = math.MaxInt64 / 1000000000
+
+func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
+ // unix timestamp is seconds since UNIX epoch
+ timestamp, err := strconv.ParseInt(input, 10, 64)
+ if err != nil {
+ return time.Time{}, "", analysis.ErrInvalidTimestampString
+ }
+ if timestamp < minBound || timestamp > maxBound {
+ return time.Time{}, "", analysis.ErrInvalidTimestampRange
+ }
+ return time.Unix(timestamp, 0), Name, nil
+}
+
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+ return &DateTimeParser{}, nil
+}
+
+func init() {
+ registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go b/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go
index fdf3cc0e5..8c2987a7f 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go
@@ -43,10 +43,15 @@ type BooleanField struct {
}
func (b *BooleanField) Size() int {
+ var freqSize int
+ if b.frequencies != nil {
+ freqSize = b.frequencies.Size()
+ }
return reflectStaticSizeBooleanField + size.SizeOfPtr +
len(b.name) +
len(b.arrayPositions)*size.SizeOfUint64 +
- len(b.value)
+ len(b.value) +
+ freqSize
}
func (b *BooleanField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_composite.go b/vendor/github.com/blevesearch/bleve/v2/document/field_composite.go
index 8c47643f5..e0ba8af7a 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_composite.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_composite.go
@@ -68,13 +68,16 @@ func (c *CompositeField) Size() int {
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
len(c.name)
- for k, _ := range c.includedFields {
+ for k := range c.includedFields {
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
}
- for k, _ := range c.excludedFields {
+ for k := range c.excludedFields {
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
}
+ if c.compositeFrequencies != nil {
+ sizeInBytes += c.compositeFrequencies.Size()
+ }
return sizeInBytes
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_datetime.go b/vendor/github.com/blevesearch/bleve/v2/document/field_datetime.go
index efdd26b60..f3b859c43 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_datetime.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_datetime.go
@@ -53,9 +53,15 @@ type DateTimeField struct {
}
func (n *DateTimeField) Size() int {
+ var freqSize int
+ if n.frequencies != nil {
+ freqSize = n.frequencies.Size()
+ }
return reflectStaticSizeDateTimeField + size.SizeOfPtr +
len(n.name) +
- len(n.arrayPositions)*size.SizeOfUint64
+ len(n.arrayPositions)*size.SizeOfUint64 +
+ len(n.value) +
+ freqSize
}
func (n *DateTimeField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go b/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go
index 719d18c35..5795043f2 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go
@@ -47,9 +47,15 @@ type GeoPointField struct {
}
func (n *GeoPointField) Size() int {
+ var freqSize int
+ if n.frequencies != nil {
+ freqSize = n.frequencies.Size()
+ }
return reflectStaticSizeGeoPointField + size.SizeOfPtr +
len(n.name) +
- len(n.arrayPositions)*size.SizeOfUint64
+ len(n.arrayPositions)*size.SizeOfUint64 +
+ len(n.value) +
+ freqSize
}
func (n *GeoPointField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go
index a20ff1837..6bf7b010a 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go
@@ -48,9 +48,16 @@ type GeoShapeField struct {
}
func (n *GeoShapeField) Size() int {
+ var freqSize int
+ if n.frequencies != nil {
+ freqSize = n.frequencies.Size()
+ }
return reflectStaticSizeGeoShapeField + size.SizeOfPtr +
len(n.name) +
- len(n.arrayPositions)*size.SizeOfUint64
+ len(n.arrayPositions)*size.SizeOfUint64 +
+ len(n.encodedValue) +
+ len(n.value) +
+ freqSize
}
func (n *GeoShapeField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go b/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go
index 1e5be5006..80a353a01 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go
@@ -44,10 +44,15 @@ type IPField struct {
}
func (b *IPField) Size() int {
+ var freqSize int
+ if b.frequencies != nil {
+ freqSize = b.frequencies.Size()
+ }
return reflectStaticSizeIPField + size.SizeOfPtr +
len(b.name) +
len(b.arrayPositions)*size.SizeOfUint64 +
- len(b.value)
+ len(b.value) +
+ freqSize
}
func (b *IPField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_numeric.go b/vendor/github.com/blevesearch/bleve/v2/document/field_numeric.go
index a54b082b4..1ee7b75ee 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_numeric.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_numeric.go
@@ -46,9 +46,15 @@ type NumericField struct {
}
func (n *NumericField) Size() int {
+ var freqSize int
+ if n.frequencies != nil {
+ freqSize = n.frequencies.Size()
+ }
return reflectStaticSizeNumericField + size.SizeOfPtr +
len(n.name) +
- len(n.arrayPositions)*size.SizeOfPtr
+ len(n.arrayPositions)*size.SizeOfUint64 +
+ len(n.value) +
+ freqSize
}
func (n *NumericField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_text.go b/vendor/github.com/blevesearch/bleve/v2/document/field_text.go
index fddc59d09..d35e74732 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_text.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_text.go
@@ -44,10 +44,15 @@ type TextField struct {
}
func (t *TextField) Size() int {
+ var freqSize int
+ if t.frequencies != nil {
+ freqSize = t.frequencies.Size()
+ }
return reflectStaticSizeTextField + size.SizeOfPtr +
len(t.name) +
len(t.arrayPositions)*size.SizeOfUint64 +
- len(t.value)
+ len(t.value) +
+ freqSize
}
func (t *TextField) Name() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go
index b019361cb..53334d202 100644
--- a/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go
@@ -47,6 +47,8 @@ type VectorField struct {
func (n *VectorField) Size() int {
return reflectStaticSizeVectorField + size.SizeOfPtr +
len(n.name) +
+ len(n.similarity) +
+ len(n.vectorIndexOptimizedFor) +
int(numBytesFloat32s(n.value))
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_vector_base64.go b/vendor/github.com/blevesearch/bleve/v2/document/field_vector_base64.go
new file mode 100644
index 000000000..31d6cbffd
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/document/field_vector_base64.go
@@ -0,0 +1,163 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package document
+
+import (
+ "encoding/base64"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "reflect"
+
+ "github.com/blevesearch/bleve/v2/size"
+ "github.com/blevesearch/bleve/v2/util"
+ index "github.com/blevesearch/bleve_index_api"
+)
+
+var reflectStaticSizeVectorBase64Field int
+
+func init() {
+ var f VectorBase64Field
+ reflectStaticSizeVectorBase64Field = int(reflect.TypeOf(f).Size())
+}
+
+type VectorBase64Field struct {
+ vectorField *VectorField
+ base64Encoding string
+}
+
+func (n *VectorBase64Field) Size() int {
+ var vecFieldSize int
+ if n.vectorField != nil {
+ vecFieldSize = n.vectorField.Size()
+ }
+ return reflectStaticSizeVectorBase64Field + size.SizeOfPtr +
+ len(n.base64Encoding) +
+ vecFieldSize
+}
+
+func (n *VectorBase64Field) Name() string {
+ return n.vectorField.Name()
+}
+
+func (n *VectorBase64Field) ArrayPositions() []uint64 {
+ return n.vectorField.ArrayPositions()
+}
+
+func (n *VectorBase64Field) Options() index.FieldIndexingOptions {
+ return n.vectorField.Options()
+}
+
+func (n *VectorBase64Field) NumPlainTextBytes() uint64 {
+ return n.vectorField.NumPlainTextBytes()
+}
+
+func (n *VectorBase64Field) AnalyzedLength() int {
+ return n.vectorField.AnalyzedLength()
+}
+
+func (n *VectorBase64Field) EncodedFieldType() byte {
+ return 'e'
+}
+
+func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies {
+ return n.vectorField.AnalyzedTokenFrequencies()
+}
+
+func (n *VectorBase64Field) Analyze() {
+}
+
+func (n *VectorBase64Field) Value() []byte {
+ return n.vectorField.Value()
+}
+
+func (n *VectorBase64Field) GoString() string {
+ return fmt.Sprintf("&document.vectorFieldBase64Field{Name:%s, Options: %s, "+
+ "Value: %+v}", n.vectorField.Name(), n.vectorField.Options(), n.vectorField.Value())
+}
+
+// For the sake of not polluting the API, we are keeping arrayPositions as a
+// parameter, but it is not used.
+func NewVectorBase64Field(name string, arrayPositions []uint64, vectorBase64 string,
+ dims int, similarity, vectorIndexOptimizedFor string) (*VectorBase64Field, error) {
+
+ decodedVector, err := DecodeVector(vectorBase64)
+ if err != nil {
+ return nil, err
+ }
+
+ return &VectorBase64Field{
+ vectorField: NewVectorFieldWithIndexingOptions(name, arrayPositions,
+ decodedVector, dims, similarity,
+ vectorIndexOptimizedFor, DefaultVectorIndexingOptions),
+
+ base64Encoding: vectorBase64,
+ }, nil
+}
+
+// This function takes a base64 encoded string and decodes it into
+// a vector.
+func DecodeVector(encodedValue string) ([]float32, error) {
+ // We first decode the encoded string into a byte array.
+ decodedString, err := base64.StdEncoding.DecodeString(encodedValue)
+ if err != nil {
+ return nil, err
+ }
+
+ // The array is expected to be divisible by 4 because each float32
+ // should occupy 4 bytes
+ if len(decodedString)%size.SizeOfFloat32 != 0 {
+ return nil, fmt.Errorf("decoded byte array not divisible by %d", size.SizeOfFloat32)
+ }
+ dims := int(len(decodedString) / size.SizeOfFloat32)
+
+ if dims <= 0 {
+ return nil, fmt.Errorf("unable to decode encoded vector")
+ }
+
+ decodedVector := make([]float32, dims)
+
+ // We iterate through the array 4 bytes at a time and convert each of
+ // them to a float32 value by reading them in a little endian notation
+ for i := 0; i < dims; i++ {
+ bytes := decodedString[i*size.SizeOfFloat32 : (i+1)*size.SizeOfFloat32]
+ entry := math.Float32frombits(binary.LittleEndian.Uint32(bytes))
+ if !util.IsValidFloat32(float64(entry)) {
+ return nil, fmt.Errorf("invalid float32 value: %f", entry)
+ }
+ decodedVector[i] = entry
+ }
+
+ return decodedVector, nil
+}
+
+func (n *VectorBase64Field) Vector() []float32 {
+ return n.vectorField.Vector()
+}
+
+func (n *VectorBase64Field) Dims() int {
+ return n.vectorField.Dims()
+}
+
+func (n *VectorBase64Field) Similarity() string {
+ return n.vectorField.Similarity()
+}
+
+func (n *VectorBase64Field) IndexOptimizedFor() string {
+ return n.vectorField.IndexOptimizedFor()
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/README.md b/vendor/github.com/blevesearch/bleve/v2/index/scorch/README.md
index 9794aed70..fe2abde55 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/README.md
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/README.md
@@ -218,7 +218,7 @@ A term search for term T in field F will look something like this:
}
```
-The searchResultPostings will be a new implementation of the TermFieldReader inteface.
+The searchResultPostings will be a new implementation of the TermFieldReader interface.
As a reminder this interface is:
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/event.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/event.go
index 31c9e80c9..0f653ccf4 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/event.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/event.go
@@ -22,7 +22,8 @@ var RegistryAsyncErrorCallbacks = map[string]func(error, string){}
// RegistryEventCallbacks should be treated as read-only after
// process init()'ialization.
-var RegistryEventCallbacks = map[string]func(Event){}
+// In the event of not having a callback, these return true.
+var RegistryEventCallbacks = map[string]func(Event) bool{}
// Event represents the information provided in an OnEvent() callback.
type Event struct {
@@ -62,3 +63,7 @@ var EventKindMergeTaskIntroductionStart = EventKind(7)
// EventKindMergeTaskIntroduction is fired when the merger has completed
// the introduction of merged segment from a single merge task.
var EventKindMergeTaskIntroduction = EventKind(8)
+
+// EventKindPreMergeCheck is fired before the merge begins to check if
+// the caller should proceed with the merge.
+var EventKindPreMergeCheck = EventKind(9)
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go
index 339ec5969..b74504ca1 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go
@@ -72,6 +72,17 @@ OUTER:
ctrlMsg = ctrlMsgDflt
}
if ctrlMsg != nil {
+ continueMerge := s.fireEvent(EventKindPreMergeCheck, 0)
+ // The default, if there's no handler, is to continue the merge.
+ if !continueMerge {
+ // If it's decided that this merge can't take place now,
+ // begin the merge process all over again.
+ // Retry instead of blocking/waiting here since a long wait
+ // can result in more segments introduced i.e. s.root will
+ // be updated.
+ continue OUTER
+ }
+
startTime := time.Now()
// lets get started
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go
index 752350662..ac6d8b22b 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go
@@ -37,6 +37,11 @@ type Segment interface {
// Size of the live data of the segment; i.e., FullSize() minus
// any logical deletions.
LiveSize() int64
+
+ HasVector() bool
+
+ // Size of the persisted segment file.
+ FileSize() int64
}
// Plan() will functionally compute a merge plan. A segment will be
@@ -76,6 +81,11 @@ type MergePlanOptions struct {
// planner’s predicted sizes.
MaxSegmentSize int64
+ // Max size (in bytes) of the persisted segment file that contains the
+ // vectors. This is used to prevent merging of segments that
+ // contain vectors that are too large.
+ MaxSegmentFileSize int64
+
// The growth factor for each tier in a staircase of idealized
// segments computed by CalcBudget().
TierGrowth float64
@@ -128,6 +138,7 @@ var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limi
var DefaultMergePlanOptions = MergePlanOptions{
MaxSegmentsPerTier: 10,
MaxSegmentSize: 5000000,
+ MaxSegmentFileSize: 4000000000, // 4GB
TierGrowth: 10.0,
SegmentsPerMergeTask: 10,
FloorSegmentSize: 2000,
@@ -139,6 +150,7 @@ var DefaultMergePlanOptions = MergePlanOptions{
var SingleSegmentMergePlanOptions = MergePlanOptions{
MaxSegmentsPerTier: 1,
MaxSegmentSize: 1 << 30,
+ MaxSegmentFileSize: 1 << 40,
TierGrowth: 1.0,
SegmentsPerMergeTask: 10,
FloorSegmentSize: 1 << 30,
@@ -170,8 +182,17 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
minLiveSize = segment.LiveSize()
}
+ isEligible := segment.LiveSize() < o.MaxSegmentSize/2
+ // An eligible segment (based on #documents) may be too large
+ // and thus need a stricter check based on the file size.
+ // This is particularly important for segments that contain
+ // vectors.
+ if isEligible && segment.HasVector() && o.MaxSegmentFileSize > 0 {
+ isEligible = segment.FileSize() < o.MaxSegmentFileSize/2
+ }
+
// Only small-enough segments are eligible.
- if segment.LiveSize() < o.MaxSegmentSize/2 {
+ if isEligible {
eligibles = append(eligibles, segment)
eligiblesLiveSize += segment.LiveSize()
}
@@ -215,14 +236,25 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
for startIdx := 0; startIdx < len(eligibles); startIdx++ {
var roster []Segment
var rosterLiveSize int64
+ var rosterFileSize int64 // useful for segments with vectors
for idx := startIdx; idx < len(eligibles) && len(roster) < o.SegmentsPerMergeTask; idx++ {
eligible := eligibles[idx]
- if rosterLiveSize+eligible.LiveSize() < o.MaxSegmentSize {
- roster = append(roster, eligible)
- rosterLiveSize += eligible.LiveSize()
+ if rosterLiveSize+eligible.LiveSize() >= o.MaxSegmentSize {
+ continue
}
+
+ if eligible.HasVector() {
+ efs := eligible.FileSize()
+ if rosterFileSize+efs >= o.MaxSegmentFileSize {
+ continue
+ }
+ rosterFileSize += efs
+ }
+
+ roster = append(roster, eligible)
+ rosterLiveSize += eligible.LiveSize()
}
if len(roster) > 0 {
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go
index 330e214f3..6b10a207c 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go
@@ -77,7 +77,7 @@ func (o *OptimizeVR) Finish() error {
wg.Done()
}()
for field, vrs := range o.vrs {
- vecIndex, err := segment.InterpretVectorIndex(field)
+ vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted)
if err != nil {
errorsM.Lock()
errors = append(errors, err)
@@ -91,7 +91,7 @@ func (o *OptimizeVR) Finish() error {
for _, vr := range vrs {
// for each VR, populate postings list and iterators
// by passing the obtained vector index and getting similar vectors.
- pl, err := vecIndex.Search(vr.vector, vr.k, origSeg.deleted)
+ pl, err := vecIndex.Search(vr.vector, vr.k, vr.searchParams)
if err != nil {
errorsM.Lock()
errors = append(errors, err)
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go
index afd518dde..d59f733df 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go
@@ -549,11 +549,14 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string,
val := make([]byte, 8)
bytesWritten := atomic.LoadUint64(&snapshot.parent.stats.TotBytesWrittenAtIndexTime)
binary.LittleEndian.PutUint64(val, bytesWritten)
- internalBucket.Put(TotBytesWrittenKey, val)
+ err = internalBucket.Put(TotBytesWrittenKey, val)
+ if err != nil {
+ return nil, nil, err
+ }
}
- var filenames []string
- newSegmentPaths := make(map[uint64]string)
+ filenames := make([]string, 0, len(snapshot.segment))
+ newSegmentPaths := make(map[uint64]string, len(snapshot.segment))
// first ensure that each segment in this snapshot has been persisted
for _, segmentSnapshot := range snapshot.segment {
@@ -826,6 +829,10 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
for k, _ := c.First(); k != nil; k, _ = c.Next() {
if k[0] == boltInternalKey[0] {
internalBucket := snapshot.Bucket(k)
+ if internalBucket == nil {
+ _ = rv.DecRef()
+ return nil, fmt.Errorf("internal bucket missing")
+ }
err := internalBucket.ForEach(func(key []byte, val []byte) error {
copiedVal := append([]byte(nil), val...)
rv.internal[string(key)] = copiedVal
@@ -982,7 +989,7 @@ func getTimeSeriesSnapshots(maxDataPoints int, interval time.Duration,
return ptr, rv
}
-// getProtectedEpochs aims to fetch the epochs keep based on a timestamp basis.
+// getProtectedSnapshots aims to fetch the epochs keep based on a timestamp basis.
// It tries to get NumSnapshotsToKeep snapshots, each of which are separated
// by a time duration of RollbackSamplingInterval.
func getProtectedSnapshots(rollbackSamplingInterval time.Duration,
@@ -1133,7 +1140,7 @@ func (s *Scorch) removeOldZapFiles() error {
for _, f := range files {
fname := f.Name()
if filepath.Ext(fname) == ".zap" {
- if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] {
+ if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] && (s.copyScheduled[fname] <= 0) {
err := os.Remove(s.path + string(os.PathSeparator) + fname)
if err != nil {
log.Printf("got err removing file: %s, err: %v", fname, err)
@@ -1198,6 +1205,9 @@ func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) {
}
snapshot := snapshots.Bucket(sk)
+ if snapshot == nil {
+ continue
+ }
metaBucket := snapshot.Bucket(boltMetaDataKey)
if metaBucket == nil {
continue
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go
index 067220e6f..895f939dd 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go
@@ -89,6 +89,10 @@ func RollbackPoints(path string) ([]*RollbackPoint, error) {
for j, _ := c2.First(); j != nil; j, _ = c2.Next() {
if j[0] == boltInternalKey[0] {
internalBucket := snapshot.Bucket(j)
+ if internalBucket == nil {
+ err = fmt.Errorf("internal bucket missing")
+ break
+ }
err = internalBucket.ForEach(func(key []byte, val []byte) error {
copiedVal := append([]byte(nil), val...)
meta[string(key)] = copiedVal
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go
index 2e6435ee0..7966d844d 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go
@@ -18,6 +18,7 @@ import (
"encoding/json"
"fmt"
"os"
+ "path/filepath"
"sync"
"sync/atomic"
"time"
@@ -49,6 +50,7 @@ type Scorch struct {
unsafeBatch bool
rootLock sync.RWMutex
+
root *IndexSnapshot // holds 1 ref-count on the root
rootPersisted []chan error // closed when root is persisted
persistedCallbacks []index.BatchCallback
@@ -56,6 +58,12 @@ type Scorch struct {
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
+ // keeps track of segments scheduled for online copy/backup operation. Each segment's filename maps to
+ // the count of copy schedules. Segments with non-zero counts are protected from removal by the cleanup
+ // operation. Counts decrement upon successful copy, allowing removal of segments with zero or absent counts.
+ // must be accessed within the rootLock as it is accessed by the asynchronous cleanup routine.
+ copyScheduled map[string]int
+
numSnapshotsToKeep int
rollbackRetentionFactor float64
checkPoints []*snapshotMetaData
@@ -69,7 +77,7 @@ type Scorch struct {
rootBolt *bolt.DB
asyncTasks sync.WaitGroup
- onEvent func(event Event)
+ onEvent func(event Event) bool
onAsyncError func(err error, path string)
forceMergeRequestCh chan *mergerCtrl
@@ -112,6 +120,7 @@ func NewScorch(storeName string,
ineligibleForRemoval: map[string]bool{},
forceMergeRequestCh: make(chan *mergerCtrl, 1),
segPlugin: defaultSegmentPlugin,
+ copyScheduled: map[string]int{},
}
forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
@@ -175,12 +184,14 @@ func (s *Scorch) NumEventsBlocking() uint64 {
return eventsStarted - eventsCompleted
}
-func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
+func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) bool {
+ res := true
if s.onEvent != nil {
atomic.AddUint64(&s.stats.TotEventTriggerStarted, 1)
- s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
+ res = s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
atomic.AddUint64(&s.stats.TotEventTriggerCompleted, 1)
}
+ return res
}
func (s *Scorch) fireAsyncError(err error) {
@@ -822,6 +833,10 @@ func (fs *fieldStats) Aggregate(stats segment.FieldStats) {
// Returns the stats map
func (fs *fieldStats) Fetch() map[string]map[string]uint64 {
+ if fs == nil {
+ return nil
+ }
+
return fs.statMap
}
@@ -832,3 +847,34 @@ func newFieldStats() *fieldStats {
}
return rv
}
+
+// CopyReader returns a low-level accessor for index data, ensuring persisted segments
+// remain on disk for backup, preventing race conditions with the persister/merger cleanup.
+// Close the reader after backup to allow segment removal by the persister/merger.
+func (s *Scorch) CopyReader() index.CopyReader {
+ s.rootLock.Lock()
+ rv := s.root
+ if rv != nil {
+ rv.AddRef()
+ var fileName string
+ // schedule a backup for all the segments from the root. Note that
+ // both the unpersisted and persisted segments are scheduled for backup,
+ // because during the backup, the unpersisted segments may get persisted and
+ // hence we need to protect both the unpersisted and persisted segments from removal
+ // by the cleanup routine during the online backup
+ for _, seg := range rv.segment {
+ if perSeg, ok := seg.segment.(segment.PersistedSegment); ok {
+ // segment is persisted
+ fileName = filepath.Base(perSeg.Path())
+ } else {
+ // segment is not persisted
+ // the name of the segment file that is generated if
+ // the segment is persisted in the future.
+ fileName = zapFileName(seg.id)
+ }
+ rv.parent.copyScheduled[fileName]++
+ }
+ }
+ s.rootLock.Unlock()
+ return rv
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go
index 59828e875..f0e7ae1cf 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go
@@ -905,3 +905,26 @@ func (is *IndexSnapshot) GetSpatialAnalyzerPlugin(typ string) (
}
return rv, nil
}
+
+func (is *IndexSnapshot) CloseCopyReader() error {
+ // first unmark the segments that were marked for backup by this index snapshot
+ is.parent.rootLock.Lock()
+ for _, seg := range is.segment {
+ var fileName string
+ if perSeg, ok := seg.segment.(segment.PersistedSegment); ok {
+ // segment is persisted
+ fileName = filepath.Base(perSeg.Path())
+ } else {
+ // segment is not persisted
+ // the name of the segment file that is generated if
+ // the segment is persisted in the future.
+ fileName = zapFileName(seg.id)
+ }
+ if is.parent.copyScheduled[fileName]--; is.parent.copyScheduled[fileName] <= 0 {
+ delete(is.parent.copyScheduled, fileName)
+ }
+ }
+ is.parent.rootLock.Unlock()
+ // close the index snapshot normally
+ return is.Close()
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go
index 04a9e0e6d..30e03dcba 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go
@@ -20,6 +20,7 @@ package scorch
import (
"bytes"
"context"
+ "encoding/json"
"fmt"
"reflect"
@@ -48,11 +49,15 @@ type IndexSnapshotVectorReader struct {
currPosting segment_api.VecPosting
currID index.IndexInternalID
ctx context.Context
+
+ searchParams json.RawMessage
}
func (i *IndexSnapshotVectorReader) Size() int {
sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr +
- len(i.vector) + len(i.field) + len(i.currID)
+ len(i.vector)*size.SizeOfFloat32 +
+ len(i.field) +
+ len(i.currID)
for _, entry := range i.postings {
sizeInBytes += entry.Size()
@@ -103,7 +108,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
- i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k)
+ i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams)
if err != nil {
return nil, err
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
index 1c14af726..96e59a31d 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go
@@ -16,6 +16,7 @@ package scorch
import (
"bytes"
+ "os"
"sync"
"sync/atomic"
@@ -66,6 +67,31 @@ func (s *SegmentSnapshot) LiveSize() int64 {
return int64(s.Count())
}
+func (s *SegmentSnapshot) HasVector() bool {
+ // number of vectors, for each vector field in the segment
+ numVecs := s.stats.Fetch()["num_vectors"]
+ return len(numVecs) > 0
+}
+
+func (s *SegmentSnapshot) FileSize() int64 {
+ ps, ok := s.segment.(segment.PersistedSegment)
+ if !ok {
+ return 0
+ }
+
+ path := ps.Path()
+ if path == "" {
+ return 0
+ }
+
+ fi, err := os.Stat(path)
+ if err != nil {
+ return 0
+ }
+
+ return fi.Size()
+}
+
func (s *SegmentSnapshot) Close() error {
return s.segment.Close()
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go
index 9d6f0700e..70546d4e3 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go
@@ -19,20 +19,22 @@ package scorch
import (
"context"
+ "encoding/json"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
)
func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
- field string, k int64) (
+ field string, k int64, searchParams json.RawMessage) (
index.VectorReader, error) {
rv := &IndexSnapshotVectorReader{
- vector: vector,
- field: field,
- k: k,
- snapshot: is,
+ vector: vector,
+ field: field,
+ k: k,
+ snapshot: is,
+ searchParams: searchParams,
}
if rv.postings == nil {
diff --git a/vendor/github.com/blevesearch/bleve/v2/index_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_impl.go
index a52547352..55212e3e6 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index_impl.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index_impl.go
@@ -25,6 +25,10 @@ import (
"sync/atomic"
"time"
+ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds"
+ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds"
+ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds"
+ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds"
"github.com/blevesearch/bleve/v2/document"
"github.com/blevesearch/bleve/v2/index/scorch"
"github.com/blevesearch/bleve/v2/index/upsidedown"
@@ -738,10 +742,28 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest,
datetime, layout, err := docF.DateTime()
if err == nil {
if layout == "" {
- // layout not set probably means it was indexed as a timestamp
- value = strconv.FormatInt(datetime.UnixNano(), 10)
+ // missing layout means we fallback to
+ // the default layout which is RFC3339
+ value = datetime.Format(time.RFC3339)
} else {
- value = datetime.Format(layout)
+ // the layout here can now either be representative
+ // of an actual datetime layout or a timestamp
+ switch layout {
+ case seconds.Name:
+ value = strconv.FormatInt(datetime.Unix(), 10)
+ case milliseconds.Name:
+ value = strconv.FormatInt(datetime.UnixMilli(), 10)
+ case microseconds.Name:
+ value = strconv.FormatInt(datetime.UnixMicro(), 10)
+ case nanoseconds.Name:
+ value = strconv.FormatInt(datetime.UnixNano(), 10)
+ default:
+ // the layout for formatting the date to a string
+ // is provided by a datetime parser which is not
+ // handling the timestamp case, hence the layout
+ // can be directly used to format the date
+ value = datetime.Format(layout)
+ }
}
}
case index.BooleanField:
@@ -1052,22 +1074,23 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) {
return ErrorIndexClosed
}
- indexReader, err := i.i.Reader()
- if err != nil {
- return err
+ copyIndex, ok := i.i.(index.CopyIndex)
+ if !ok {
+ return fmt.Errorf("index implementation does not support copy reader")
}
+
+ copyReader := copyIndex.CopyReader()
+ if copyReader == nil {
+ return fmt.Errorf("index's copyReader is nil")
+ }
+
defer func() {
- if cerr := indexReader.Close(); err == nil && cerr != nil {
+ if cerr := copyReader.CloseCopyReader(); err == nil && cerr != nil {
err = cerr
}
}()
- irc, ok := indexReader.(IndexCopyable)
- if !ok {
- return fmt.Errorf("index implementation does not support copy")
- }
-
- err = irc.CopyTo(d)
+ err = copyReader.CopyTo(d)
if err != nil {
return fmt.Errorf("error copying index metadata: %v", err)
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go
index 73bb124db..847326e41 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go
@@ -443,6 +443,8 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
fieldMapping.processGeoShape(property, pathString, path, indexes, context)
} else if fieldMapping.Type == "geopoint" {
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
+ } else if fieldMapping.Type == "vector_base64" {
+ fieldMapping.processVectorBase64(property, pathString, path, indexes, context)
} else {
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
}
@@ -532,33 +534,33 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
dm.walkDocument(property, path, indexes, context)
}
case reflect.Map, reflect.Slice:
- var isPropertyVector bool
- var isPropertyVectorInitialized bool
- if subDocMapping != nil {
+ walkDocument := false
+ if subDocMapping != nil && len(subDocMapping.Fields) != 0 {
for _, fieldMapping := range subDocMapping.Fields {
switch fieldMapping.Type {
case "vector":
- processed := fieldMapping.processVector(property, pathString, path,
+ fieldMapping.processVector(property, pathString, path,
indexes, context)
- if !isPropertyVectorInitialized {
- isPropertyVector = processed
- isPropertyVectorInitialized = true
- } else {
- isPropertyVector = isPropertyVector && processed
- }
case "geopoint":
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
+ walkDocument = true
case "IP":
ip, ok := property.(net.IP)
if ok {
fieldMapping.processIP(ip, pathString, path, indexes, context)
}
+ walkDocument = true
case "geoshape":
fieldMapping.processGeoShape(property, pathString, path, indexes, context)
+ walkDocument = true
+ default:
+ walkDocument = true
}
}
+ } else {
+ walkDocument = true
}
- if !isPropertyVector {
+ if walkDocument {
dm.walkDocument(property, path, indexes, context)
}
case reflect.Ptr:
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
index f4339b384..5c064fddd 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
@@ -102,7 +102,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
return rv
}
-// NewKeyworFieldMapping returns a default field mapping for text with analyzer "keyword".
+// NewKeywordFieldMapping returns a default field mapping for text with analyzer "keyword".
func NewKeywordFieldMapping() *FieldMapping {
return &FieldMapping{
Type: "text",
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go
index 171ee1a72..fe8c96713 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go
@@ -437,24 +437,16 @@ func (im *IndexMappingImpl) FieldAnalyzer(field string) string {
func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping {
if im.TypeMapping != nil {
for _, v := range im.TypeMapping {
- for field, property := range v.Properties {
- for _, v1 := range property.Fields {
- if field == path {
- // Return field mapping if the name matches the path param.
- return *v1
- }
- }
+ fm := v.fieldDescribedByPath(path)
+ if fm != nil {
+ return *fm
}
}
}
- for field, property := range im.DefaultMapping.Properties {
- for _, v1 := range property.Fields {
- if field == path {
- // Return field mapping if the name matches the path param.
- return *v1
- }
- }
+ fm := im.DefaultMapping.fieldDescribedByPath(path)
+ if fm != nil {
+ return *fm
}
return FieldMapping{}
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go
index f9f35f57c..90cb1e225 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_no_vectors.go
@@ -21,11 +21,20 @@ func NewVectorFieldMapping() *FieldMapping {
return nil
}
+func NewVectorBase64FieldMapping() *FieldMapping {
+ return nil
+}
+
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
pathString string, path []string, indexes []uint64, context *walkContext) bool {
return false
}
+func (fm *FieldMapping) processVectorBase64(propertyMightBeVector interface{},
+ pathString string, path []string, indexes []uint64, context *walkContext) {
+
+}
+
// -----------------------------------------------------------------------------
// document validation functions
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go
index a0b712608..a3879c4bd 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go
@@ -26,10 +26,11 @@ import (
index "github.com/blevesearch/bleve_index_api"
)
-// Min and Max allowed dimensions for a vector field
-const (
+// Min and Max allowed dimensions for a vector field;
+// p.s must be set/updated at process init() _only_
+var (
MinVectorDims = 1
- MaxVectorDims = 2048
+ MaxVectorDims = 4096
)
func NewVectorFieldMapping() *FieldMapping {
@@ -43,6 +44,17 @@ func NewVectorFieldMapping() *FieldMapping {
}
}
+func NewVectorBase64FieldMapping() *FieldMapping {
+ return &FieldMapping{
+ Type: "vector_base64",
+ Store: false,
+ Index: true,
+ IncludeInAll: false,
+ DocValues: false,
+ SkipFreqNorm: true,
+ }
+}
+
// validate and process a flat vector
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
if vecV.Len() != dims {
@@ -140,13 +152,35 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
return true
}
+func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interface{},
+ pathString string, path []string, indexes []uint64, context *walkContext) {
+ encodedString, ok := propertyMightBeVectorBase64.(string)
+ if !ok {
+ return
+ }
+
+ decodedVector, err := document.DecodeVector(encodedString)
+ if err != nil || len(decodedVector) != fm.Dims {
+ return
+ }
+
+ fieldName := getFieldName(pathString, path, fm)
+ options := fm.Options()
+ field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector,
+ fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
+ context.doc.AddField(field)
+
+ // "_all" composite field is not applicable for vector_base64 field
+ context.excludedFromAll = append(context.excludedFromAll, fieldName)
+}
+
// -----------------------------------------------------------------------------
// document validation functions
func validateFieldMapping(field *FieldMapping, parentName string,
fieldAliasCtx map[string]*FieldMapping) error {
switch field.Type {
- case "vector":
+ case "vector", "vector_base64":
return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
default: // non-vector field
return validateFieldType(field)
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go b/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go
index 594313861..c73dac9e5 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping_vector.go
@@ -22,3 +22,7 @@ import "github.com/blevesearch/bleve/v2/mapping"
func NewVectorFieldMapping() *mapping.FieldMapping {
return mapping.NewVectorFieldMapping()
}
+
+func NewVectorBase64FieldMapping() *mapping.FieldMapping {
+ return mapping.NewVectorBase64FieldMapping()
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/query.go b/vendor/github.com/blevesearch/bleve/v2/query.go
index 3af750a06..93e662b9e 100644
--- a/vendor/github.com/blevesearch/bleve/v2/query.go
+++ b/vendor/github.com/blevesearch/bleve/v2/query.go
@@ -83,7 +83,7 @@ func NewDateRangeStringQuery(start, end string) *query.DateRangeStringQuery {
return query.NewDateRangeStringQuery(start, end)
}
-// NewDateRangeStringQuery creates a new Query for ranges
+// NewDateRangeInclusiveStringQuery creates a new Query for ranges
// of date values.
// Date strings are parsed using the DateTimeParser set using
//
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/date_range_string.go b/vendor/github.com/blevesearch/bleve/v2/search/query/date_range_string.go
index b5e5c1701..ac1071959 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/date_range_string.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/date_range_string.go
@@ -53,7 +53,7 @@ func NewDateRangeStringQuery(start, end string) *DateRangeStringQuery {
return NewDateRangeStringInclusiveQuery(start, end, nil, nil)
}
-// NewDateRangeStringQuery creates a new Query for ranges
+// NewDateRangeStringInclusiveQuery creates a new Query for ranges
// of date values.
// Date strings are parsed using the DateTimeParser field of the query struct,
// which is a custom date time parser defined in the index mapping.
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
index 030483e54..17e855416 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
@@ -19,6 +19,7 @@ package query
import (
"context"
+ "encoding/json"
"fmt"
"github.com/blevesearch/bleve/v2/mapping"
@@ -32,6 +33,9 @@ type KNNQuery struct {
Vector []float32 `json:"vector"`
K int64 `json:"k"`
BoostVal *Boost `json:"boost,omitempty"`
+
+ // see KNNRequest.Params for description
+ Params json.RawMessage `json:"params"`
}
func NewKNNQuery(vector []float32) *KNNQuery {
@@ -59,6 +63,10 @@ func (q *KNNQuery) Boost() float64 {
return q.BoostVal.Value()
}
+func (q *KNNQuery) SetParams(params json.RawMessage) {
+ q.Params = params
+}
+
func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
fieldMapping := m.FieldMappingForPath(q.VectorField)
@@ -70,5 +78,5 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty")
}
return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField,
- q.Vector, q.K, q.BoostVal.Value(), similarityMetric)
+ q.Vector, q.K, q.BoostVal.Value(), similarityMetric, q.Params)
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go
index 26ab656e7..d263a0e54 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go
@@ -65,7 +65,7 @@ type ValidatableQuery interface {
Validate() error
}
-// ParseQuery deserializes a JSON representation of
+// ParsePreSearchData deserializes a JSON representation of
// a PreSearchData object.
func ParsePreSearchData(input []byte) (map[string]interface{}, error) {
var rv map[string]interface{}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go
index 326b435d6..2f832efab 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go
@@ -47,7 +47,8 @@ type KNNQueryScorer struct {
func (s *KNNQueryScorer) Size() int {
sizeInBytes := reflectStaticSizeKNNQueryScorer + size.SizeOfPtr +
- (len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField)
+ (len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField) +
+ len(s.similarityMetric)
if s.queryWeightExplanation != nil {
sizeInBytes += s.queryWeightExplanation.Size()
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/search.go b/vendor/github.com/blevesearch/bleve/v2/search/search.go
index 515a320f7..8cc5115dc 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/search.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/search.go
@@ -147,7 +147,7 @@ type DocumentMatch struct {
Index string `json:"index,omitempty"`
ID string `json:"id"`
IndexInternalID index.IndexInternalID `json:"-"`
- Score float64 `json:"score,omitempty"`
+ Score float64 `json:"score"`
Expl *Explanation `json:"explanation,omitempty"`
Locations FieldTermLocationMap `json:"locations,omitempty"`
Fragments FieldFragmentMap `json:"fragments,omitempty"`
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go
index 8f146b3e8..e17bb7a0f 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go
@@ -19,6 +19,7 @@ package searcher
import (
"context"
+ "encoding/json"
"reflect"
"github.com/blevesearch/bleve/v2/mapping"
@@ -48,9 +49,11 @@ type KNNSearcher struct {
func NewKNNSearcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping,
options search.SearcherOptions, field string, vector []float32, k int64,
- boost float64, similarityMetric string) (search.Searcher, error) {
+ boost float64, similarityMetric string, searchParams json.RawMessage) (
+ search.Searcher, error) {
+
if vr, ok := i.(index.VectorIndexReader); ok {
- vectorReader, err := vr.VectorReader(ctx, vector, field, k)
+ vectorReader, err := vr.VectorReader(ctx, vector, field, k, searchParams)
if err != nil {
return nil, err
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_knn.go
index 683771418..008a3615c 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search_knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search_knn.go
@@ -23,18 +23,22 @@ import (
"fmt"
"sort"
+ "github.com/blevesearch/bleve/v2/document"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/collector"
"github.com/blevesearch/bleve/v2/search/query"
index "github.com/blevesearch/bleve_index_api"
)
+const supportForVectorSearch = true
+
type knnOperator string
// Must be updated only at init
var BleveMaxK = int64(10000)
type SearchRequest struct {
+ ClientContextID string `json:"client_context_id,omitempty"`
Query query.Query `json:"query"`
Size int `json:"size"`
From int `json:"from"`
@@ -66,11 +70,23 @@ type SearchRequest struct {
sortFunc func(sort.Interface)
}
+// Vector takes precedence over vectorBase64 in case both fields are given
type KNNRequest struct {
- Field string `json:"field"`
- Vector []float32 `json:"vector"`
- K int64 `json:"k"`
- Boost *query.Boost `json:"boost,omitempty"`
+ Field string `json:"field"`
+ Vector []float32 `json:"vector"`
+ VectorBase64 string `json:"vector_base64"`
+ K int64 `json:"k"`
+ Boost *query.Boost `json:"boost,omitempty"`
+
+ // Search parameters for the field's vector index part of the segment.
+ // Value of it depends on the field's backing vector index implementation.
+ //
+ // For Faiss IVF index, supported search params are:
+ // - ivf_nprobe_pct : int // percentage of total clusters to search
+ // - ivf_max_codes_pct : float // percentage of total vectors to visit to do a query (across all clusters)
+ //
+ // Consult go-faiss to know all supported search params
+ Params json.RawMessage `json:"params"`
}
func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) {
@@ -208,6 +224,7 @@ func createKNNQuery(req *SearchRequest) (query.Query, []int64, int64, error) {
knnQuery.SetFieldVal(knn.Field)
knnQuery.SetK(knn.K)
knnQuery.SetBoost(knn.Boost.Value())
+ knnQuery.SetParams(knn.Params)
subQueries = append(subQueries, knnQuery)
kArray = append(kArray, knn.K)
sumOfK += knn.K
@@ -230,6 +247,15 @@ func validateKNN(req *SearchRequest) error {
if q == nil {
return fmt.Errorf("knn query cannot be nil")
}
+ if len(q.Vector) == 0 && q.VectorBase64 != "" {
+ // consider vector_base64 only if vector is not provided
+ decodedVector, err := document.DecodeVector(q.VectorBase64)
+ if err != nil {
+ return err
+ }
+
+ q.Vector = decodedVector
+ }
if q.K <= 0 || len(q.Vector) == 0 {
return fmt.Errorf("k must be greater than 0 and vector must be non-empty")
}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go
index aff826115..bb72e15a9 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go
@@ -28,6 +28,8 @@ import (
index "github.com/blevesearch/bleve_index_api"
)
+const supportForVectorSearch = false
+
// A SearchRequest describes all the parameters
// needed to search the index.
// Query is required.
diff --git a/vendor/github.com/blevesearch/bleve/v2/util/extract.go b/vendor/github.com/blevesearch/bleve/v2/util/extract.go
index e963d0c3a..0d3decfaf 100644
--- a/vendor/github.com/blevesearch/bleve/v2/util/extract.go
+++ b/vendor/github.com/blevesearch/bleve/v2/util/extract.go
@@ -48,7 +48,7 @@ func ExtractNumericValFloat32(v interface{}) (float32, bool) {
switch {
case val.CanFloat():
floatVal := val.Float()
- if floatVal > math.MaxFloat32 {
+ if !IsValidFloat32(floatVal) {
return 0, false
}
return float32(floatVal), true
@@ -60,3 +60,7 @@ func ExtractNumericValFloat32(v interface{}) (float32, bool) {
return 0, false
}
+
+func IsValidFloat32(val float64) bool {
+ return !math.IsNaN(val) && !math.IsInf(val, 0) && val <= math.MaxFloat32
+}
diff --git a/vendor/github.com/blevesearch/bleve_index_api/index.go b/vendor/github.com/blevesearch/bleve_index_api/index.go
index 4c916d5c2..a0035560e 100644
--- a/vendor/github.com/blevesearch/bleve_index_api/index.go
+++ b/vendor/github.com/blevesearch/bleve_index_api/index.go
@@ -48,6 +48,15 @@ type Index interface {
StatsMap() map[string]interface{}
}
+// CopyIndex is an extended index that supports copying to a new location online.
+// Use the CopyReader method to obtain a reader for initiating the copy operation.
+type CopyIndex interface {
+ Index
+ // Obtain a copy reader for the online copy/backup operation,
+ // to handle necessary bookkeeping, instead of using the regular IndexReader.
+ CopyReader() CopyReader
+}
+
type IndexReader interface {
TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)
@@ -79,6 +88,15 @@ type IndexReader interface {
Close() error
}
+// CopyReader is an extended index reader for backup or online copy operations, replacing the regular index reader.
+type CopyReader interface {
+ IndexReader
+ // CopyTo performs an online copy or backup of the index to the specified directory.
+ CopyTo(d Directory) error
+ // CloseCopyReader must be used instead of Close() to close the copy reader.
+ CloseCopyReader() error
+}
+
type IndexReaderRegexp interface {
FieldDictRegexp(field string, regex string) (FieldDict, error)
}
diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector.go b/vendor/github.com/blevesearch/bleve_index_api/vector.go
index 12c543346..3eff52cae 100644
--- a/vendor/github.com/blevesearch/bleve_index_api/vector.go
+++ b/vendor/github.com/blevesearch/bleve_index_api/vector.go
@@ -51,19 +51,22 @@ var SupportedSimilarityMetrics = map[string]struct{}{
// -----------------------------------------------------------------------------
const (
- IndexOptimizedForRecall = "recall"
- IndexOptimizedForLatency = "latency"
+ IndexOptimizedForRecall = "recall"
+ IndexOptimizedForLatency = "latency"
+ IndexOptimizedForMemoryEfficient = "memory-efficient"
)
const DefaultIndexOptimization = IndexOptimizedForRecall
var SupportedVectorIndexOptimizations = map[string]int{
- IndexOptimizedForRecall: 0,
- IndexOptimizedForLatency: 1,
+ IndexOptimizedForRecall: 0,
+ IndexOptimizedForLatency: 1,
+ IndexOptimizedForMemoryEfficient: 2,
}
// Reverse maps vector index optimizations': int -> string
var VectorIndexOptimizationsReverseLookup = map[int]string{
0: IndexOptimizedForRecall,
1: IndexOptimizedForLatency,
+ 2: IndexOptimizedForMemoryEfficient,
}
diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector_index.go b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go
index fa736b969..da0a74ae9 100644
--- a/vendor/github.com/blevesearch/bleve_index_api/vector_index.go
+++ b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go
@@ -19,6 +19,7 @@ package index
import (
"context"
+ "encoding/json"
"reflect"
)
@@ -47,7 +48,7 @@ type VectorReader interface {
}
type VectorIndexReader interface {
- VectorReader(ctx context.Context, vector []float32, field string, k int64) (
+ VectorReader(ctx context.Context, vector []float32, field string, k int64, searchParams json.RawMessage) (
VectorReader, error)
}
diff --git a/vendor/github.com/blevesearch/go-faiss/autotune.go b/vendor/github.com/blevesearch/go-faiss/autotune.go
index 0c06c4cc1..4b818d31b 100644
--- a/vendor/github.com/blevesearch/go-faiss/autotune.go
+++ b/vendor/github.com/blevesearch/go-faiss/autotune.go
@@ -6,7 +6,6 @@ package faiss
*/
import "C"
import (
- "runtime"
"unsafe"
)
@@ -16,9 +15,6 @@ type ParameterSpace struct {
// NewParameterSpace creates a new ParameterSpace.
func NewParameterSpace() (*ParameterSpace, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
var ps *C.FaissParameterSpace
if c := C.faiss_ParameterSpace_new(&ps); c != 0 {
return nil, getLastError()
@@ -28,12 +24,10 @@ func NewParameterSpace() (*ParameterSpace, error) {
// SetIndexParameter sets one of the parameters.
func (p *ParameterSpace) SetIndexParameter(idx Index, name string, val float64) error {
- runtime.LockOSThread()
cname := C.CString(name)
defer func() {
C.free(unsafe.Pointer(cname))
- runtime.UnlockOSThread()
}()
c := C.faiss_ParameterSpace_set_index_parameter(
diff --git a/vendor/github.com/blevesearch/go-faiss/index.go b/vendor/github.com/blevesearch/go-faiss/index.go
index 76bc1758b..b58a6149f 100644
--- a/vendor/github.com/blevesearch/go-faiss/index.go
+++ b/vendor/github.com/blevesearch/go-faiss/index.go
@@ -12,8 +12,8 @@ package faiss
*/
import "C"
import (
+ "encoding/json"
"fmt"
- "runtime"
"unsafe"
)
@@ -49,7 +49,7 @@ type Index interface {
// corresponding distances.
Search(x []float32, k int64) (distances []float32, labels []int64, err error)
- SearchWithoutIDs(x []float32, k int64, exclude []int64) (distances []float32,
+ SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) (distances []float32,
labels []int64, err error)
Reconstruct(key int64) ([]float32, error)
@@ -108,9 +108,6 @@ func (idx *faissIndex) MetricType() int {
}
func (idx *faissIndex) Train(x []float32) error {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
n := len(x) / idx.D()
if c := C.faiss_Index_train(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 {
return getLastError()
@@ -119,9 +116,6 @@ func (idx *faissIndex) Train(x []float32) error {
}
func (idx *faissIndex) Add(x []float32) error {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
n := len(x) / idx.D()
if c := C.faiss_Index_add(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 {
return getLastError()
@@ -130,9 +124,6 @@ func (idx *faissIndex) Add(x []float32) error {
}
func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
n := len(x) / idx.D()
if c := C.faiss_Index_add_with_ids(
idx.idx,
@@ -148,8 +139,6 @@ func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error {
func (idx *faissIndex) Search(x []float32, k int64) (
distances []float32, labels []int64, err error,
) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
n := len(x) / idx.D()
distances = make([]float32, int64(n)*k)
@@ -168,52 +157,35 @@ func (idx *faissIndex) Search(x []float32, k int64) (
return
}
-func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64) (
+func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) (
distances []float32, labels []int64, err error,
) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- if len(exclude) <= 0 {
+ if params == nil && len(exclude) == 0 {
return idx.Search(x, k)
}
- excludeSelector, err := NewIDSelectorNot(exclude)
+ var selector *C.FaissIDSelector
+ if len(exclude) > 0 {
+ excludeSelector, err := NewIDSelectorNot(exclude)
+ if err != nil {
+ return nil, nil, err
+ }
+ selector = excludeSelector.sel
+ defer excludeSelector.Delete()
+ }
+
+ searchParams, err := NewSearchParams(idx, params, selector)
+ defer searchParams.Delete()
if err != nil {
return nil, nil, err
}
- var sp *C.FaissSearchParameters
- C.faiss_SearchParameters_new(&sp, (*C.FaissIDSelector)(excludeSelector.sel))
- ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
- if ivfPtr != nil {
- sp = C.faiss_SearchParametersIVF_cast(sp)
- C.faiss_SearchParametersIVF_new_with_sel(&sp, (*C.FaissIDSelector)(excludeSelector.sel))
- }
+ distances, labels, err = idx.searchWithParams(x, k, searchParams.sp)
- n := len(x) / idx.D()
- distances = make([]float32, int64(n)*k)
- labels = make([]int64, int64(n)*k)
-
- if c := C.faiss_Index_search_with_params(
- idx.idx,
- C.idx_t(n),
- (*C.float)(&x[0]),
- C.idx_t(k), sp,
- (*C.float)(&distances[0]),
- (*C.idx_t)(&labels[0]),
- ); c != 0 {
- err = getLastError()
- }
- excludeSelector.Delete()
- C.faiss_SearchParameters_free(sp)
return
}
func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
rv := make([]float32, idx.D())
if c := C.faiss_Index_reconstruct(
idx.idx,
@@ -227,9 +199,6 @@ func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) {
}
func (idx *faissIndex) ReconstructBatch(keys []int64, recons []float32) ([]float32, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
var err error
n := int64(len(keys))
if c := C.faiss_Index_reconstruct_batch(
@@ -252,9 +221,6 @@ func (i *IndexImpl) MergeFrom(other Index, add_id int64) error {
}
func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
otherIdx, ok := other.(*faissIndex)
if !ok {
return fmt.Errorf("merge api not supported")
@@ -274,9 +240,6 @@ func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) {
func (idx *faissIndex) RangeSearch(x []float32, radius float32) (
*RangeSearchResult, error,
) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
n := len(x) / idx.D()
var rsr *C.FaissRangeSearchResult
if c := C.faiss_RangeSearchResult_new(&rsr, C.idx_t(n)); c != 0 {
@@ -295,9 +258,6 @@ func (idx *faissIndex) RangeSearch(x []float32, radius float32) (
}
func (idx *faissIndex) Reset() error {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
if c := C.faiss_Index_reset(idx.idx); c != 0 {
return getLastError()
}
@@ -305,9 +265,6 @@ func (idx *faissIndex) Reset() error {
}
func (idx *faissIndex) RemoveIDs(sel *IDSelector) (int, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
var nRemoved C.size_t
if c := C.faiss_Index_remove_ids(idx.idx, sel.sel, &nRemoved); c != 0 {
return 0, getLastError()
@@ -319,6 +276,30 @@ func (idx *faissIndex) Close() {
C.faiss_Index_free(idx.idx)
}
+func (idx *faissIndex) searchWithParams(x []float32, k int64, searchParams *C.FaissSearchParameters) (
+ distances []float32, labels []int64, err error,
+) {
+ n := len(x) / idx.D()
+ distances = make([]float32, int64(n)*k)
+ labels = make([]int64, int64(n)*k)
+
+ if c := C.faiss_Index_search_with_params(
+ idx.idx,
+ C.idx_t(n),
+ (*C.float)(&x[0]),
+ C.idx_t(k),
+ searchParams,
+ (*C.float)(&distances[0]),
+ (*C.idx_t)(&labels[0]),
+ ); c != 0 {
+ err = getLastError()
+ }
+
+ return
+}
+
+// -----------------------------------------------------------------------------
+
// RangeSearchResult is the result of a range search.
type RangeSearchResult struct {
rsr *C.FaissRangeSearchResult
@@ -364,9 +345,6 @@ type IndexImpl struct {
// IndexFactory builds a composite index.
// description is a comma-separated list of components.
func IndexFactory(d int, description string, metric int) (*IndexImpl, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
cdesc := C.CString(description)
defer C.free(unsafe.Pointer(cdesc))
var idx faissIndex
@@ -376,3 +354,7 @@ func IndexFactory(d int, description string, metric int) (*IndexImpl, error) {
}
return &IndexImpl{&idx}, nil
}
+
+func SetOMPThreads(n uint) {
+ C.faiss_set_omp_threads(C.uint(n))
+}
diff --git a/vendor/github.com/blevesearch/go-faiss/index_io.go b/vendor/github.com/blevesearch/go-faiss/index_io.go
index ba8eaf7e7..608f4d75f 100644
--- a/vendor/github.com/blevesearch/go-faiss/index_io.go
+++ b/vendor/github.com/blevesearch/go-faiss/index_io.go
@@ -8,7 +8,6 @@ package faiss
*/
import "C"
import (
- "runtime"
"unsafe"
)
@@ -23,11 +22,8 @@ func WriteIndex(idx Index, filename string) error {
}
func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
// the values to be returned by the faiss APIs
- tempBuf := (*C.uchar)(C.malloc(C.size_t(0)))
+ tempBuf := (*C.uchar)(nil)
bufSize := C.size_t(0)
if c := C.faiss_write_index_buf(
@@ -35,7 +31,7 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
&bufSize,
&tempBuf,
); c != 0 {
- C.free(unsafe.Pointer(tempBuf))
+ C.faiss_free_buf(&tempBuf)
return nil, getLastError()
}
@@ -72,9 +68,11 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
// cheaper.
copy(rv, val)
- // safe to free the c memory allocated while serializing the index;
+ // safe to free the c memory allocated (tempBuf) while serializing the index (must be done
+ // within C runtime for it was allocated there);
// rv is from go runtime - so different address space altogether
- C.free(unsafe.Pointer(tempBuf))
+ C.faiss_free_buf(&tempBuf)
+
// p.s: no need to free "val" since the underlying memory is same as tempBuf (deferred free)
val = nil
@@ -82,9 +80,6 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
}
func ReadIndexFromBuffer(buf []byte, ioflags int) (*IndexImpl, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
ptr := (*C.uchar)(unsafe.Pointer(&buf[0]))
size := C.size_t(len(buf))
@@ -107,8 +102,10 @@ func ReadIndexFromBuffer(buf []byte, ioflags int) (*IndexImpl, error) {
}
const (
- IOFlagMmap = C.FAISS_IO_FLAG_MMAP
- IOFlagReadOnly = C.FAISS_IO_FLAG_READ_ONLY
+ IOFlagMmap = C.FAISS_IO_FLAG_MMAP
+ IOFlagReadOnly = C.FAISS_IO_FLAG_READ_ONLY
+ IOFlagReadMmap = C.FAISS_IO_FLAG_READ_MMAP | C.FAISS_IO_FLAG_ONDISK_IVF
+ IOFlagSkipPrefetch = C.FAISS_IO_FLAG_SKIP_PREFETCH
)
// ReadIndex reads an index from a file.
diff --git a/vendor/github.com/blevesearch/go-faiss/index_ivf.go b/vendor/github.com/blevesearch/go-faiss/index_ivf.go
index 88266f115..2d84e4ab9 100644
--- a/vendor/github.com/blevesearch/go-faiss/index_ivf.go
+++ b/vendor/github.com/blevesearch/go-faiss/index_ivf.go
@@ -10,12 +10,9 @@ package faiss
import "C"
import (
"fmt"
- "runtime"
)
func (idx *IndexImpl) SetDirectMap(mapType int) (err error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
if ivfPtr == nil {
@@ -31,8 +28,6 @@ func (idx *IndexImpl) SetDirectMap(mapType int) (err error) {
}
func (idx *IndexImpl) GetSubIndex() (*IndexImpl, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
ptr := C.faiss_IndexIDMap2_cast(idx.cPtr())
if ptr == nil {
@@ -54,5 +49,5 @@ func (idx *IndexImpl) SetNProbe(nprobe int32) {
if ivfPtr == nil {
return
}
- C.faiss_IndexIVF_set_nprobe(ivfPtr, C.ulong(nprobe))
+ C.faiss_IndexIVF_set_nprobe(ivfPtr, C.size_t(nprobe))
}
diff --git a/vendor/github.com/blevesearch/go-faiss/search_params.go b/vendor/github.com/blevesearch/go-faiss/search_params.go
new file mode 100644
index 000000000..17575d5fb
--- /dev/null
+++ b/vendor/github.com/blevesearch/go-faiss/search_params.go
@@ -0,0 +1,99 @@
+package faiss
+
+/*
+#include
+#include
+#include
+*/
+import "C"
+import (
+ "encoding/json"
+ "fmt"
+)
+
+type SearchParams struct {
+ sp *C.FaissSearchParameters
+}
+
+// Delete frees the memory associated with s.
+func (s *SearchParams) Delete() {
+ if s == nil || s.sp == nil {
+ return
+ }
+ C.faiss_SearchParameters_free(s.sp)
+}
+
+type searchParamsIVF struct {
+ NprobePct float32 `json:"ivf_nprobe_pct,omitempty"`
+ MaxCodesPct float32 `json:"ivf_max_codes_pct,omitempty"`
+}
+
+func (s *searchParamsIVF) Validate() error {
+ if s.NprobePct < 0 || s.NprobePct > 100 {
+ return fmt.Errorf("invalid IVF search params, ivf_nprobe_pct:%v, "+
+ "should be in range [0, 100]", s.NprobePct)
+ }
+
+ if s.MaxCodesPct < 0 || s.MaxCodesPct > 100 {
+ return fmt.Errorf("invalid IVF search params, ivf_max_codes_pct:%v, "+
+ "should be in range [0, 100]", s.MaxCodesPct)
+ }
+
+ return nil
+}
+
+// Always return a valid SearchParams object,
+// thus caller must clean up the object
+// by invoking Delete() method, even if an error is returned.
+func NewSearchParams(idx Index, params json.RawMessage, sel *C.FaissIDSelector,
+) (*SearchParams, error) {
+ rv := &SearchParams{}
+ if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 {
+ return rv, fmt.Errorf("failed to create faiss search params")
+ }
+
+ // # check if the index is IVF and set the search params
+ if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx != nil {
+ rv.sp = C.faiss_SearchParametersIVF_cast(rv.sp)
+ if len(params) == 0 {
+ return rv, nil
+ }
+
+ var ivfParams searchParamsIVF
+ if err := json.Unmarshal(params, &ivfParams); err != nil {
+ return rv, fmt.Errorf("failed to unmarshal IVF search params, "+
+ "err:%v", err)
+ }
+ if err := ivfParams.Validate(); err != nil {
+ return rv, err
+ }
+
+ var nprobe, maxCodes int
+
+ if ivfParams.NprobePct > 0 {
+ nlist := float32(C.faiss_IndexIVF_nlist(ivfIdx))
+ nprobe = int(nlist * (ivfParams.NprobePct / 100))
+ } else {
+ // It's important to set nprobe to the value decided at the time of
+ // index creation. Otherwise, nprobe will be set to the default
+ // value of 1.
+ nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx))
+ }
+
+ if ivfParams.MaxCodesPct > 0 {
+ nvecs := C.faiss_Index_ntotal(idx.cPtr())
+ maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100))
+ } // else, maxCodes will be set to the default value of 0, which means no limit
+
+ if c := C.faiss_SearchParametersIVF_new_with(
+ &rv.sp,
+ sel,
+ C.size_t(nprobe),
+ C.size_t(maxCodes),
+ ); c != 0 {
+ return rv, fmt.Errorf("failed to create faiss IVF search params")
+ }
+ }
+
+ return rv, nil
+}
diff --git a/vendor/github.com/blevesearch/go-faiss/selector.go b/vendor/github.com/blevesearch/go-faiss/selector.go
index 84161a507..d372006b9 100644
--- a/vendor/github.com/blevesearch/go-faiss/selector.go
+++ b/vendor/github.com/blevesearch/go-faiss/selector.go
@@ -4,18 +4,42 @@ package faiss
#include
*/
import "C"
-import "runtime"
// IDSelector represents a set of IDs to remove.
type IDSelector struct {
sel *C.FaissIDSelector
}
+// Delete frees the memory associated with s.
+func (s *IDSelector) Delete() {
+ if s == nil || s.sel == nil {
+ return
+ }
+
+ C.faiss_IDSelector_free(s.sel)
+}
+
+type IDSelectorBatch struct {
+ sel *C.FaissIDSelector
+ batchSel *C.FaissIDSelector
+}
+
+// Delete frees the memory associated with s.
+func (s *IDSelectorBatch) Delete() {
+ if s == nil {
+ return
+ }
+
+ if s.sel != nil {
+ C.faiss_IDSelector_free(s.sel)
+ }
+ if s.batchSel != nil {
+ C.faiss_IDSelector_free(s.batchSel)
+ }
+}
+
// NewIDSelectorRange creates a selector that removes IDs on [imin, imax).
func NewIDSelectorRange(imin, imax int64) (*IDSelector, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
var sel *C.FaissIDSelectorRange
c := C.faiss_IDSelectorRange_new(&sel, C.idx_t(imin), C.idx_t(imax))
if c != 0 {
@@ -26,9 +50,6 @@ func NewIDSelectorRange(imin, imax int64) (*IDSelector, error) {
// NewIDSelectorBatch creates a new batch selector.
func NewIDSelectorBatch(indices []int64) (*IDSelector, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
var sel *C.FaissIDSelectorBatch
if c := C.faiss_IDSelectorBatch_new(
&sel,
@@ -42,10 +63,7 @@ func NewIDSelectorBatch(indices []int64) (*IDSelector, error) {
// NewIDSelectorNot creates a new Not selector, wrapped arround a
// batch selector, with the IDs in 'exclude'.
-func NewIDSelectorNot(exclude []int64) (*IDSelector, error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
+func NewIDSelectorNot(exclude []int64) (*IDSelectorBatch, error) {
batchSelector, err := NewIDSelectorBatch(exclude)
if err != nil {
return nil, err
@@ -56,12 +74,8 @@ func NewIDSelectorNot(exclude []int64) (*IDSelector, error) {
&sel,
batchSelector.sel,
); c != 0 {
+ batchSelector.Delete()
return nil, getLastError()
}
- return &IDSelector{(*C.FaissIDSelector)(sel)}, nil
-}
-
-// Delete frees the memory associated with s.
-func (s *IDSelector) Delete() {
- C.faiss_IDSelector_free(s.sel)
+ return &IDSelectorBatch{sel: (*C.FaissIDSelector)(sel), batchSel: batchSelector.sel}, nil
}
diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go
index bc00796cc..3af08e25e 100644
--- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go
+++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go
@@ -18,6 +18,8 @@
package segment
import (
+ "encoding/json"
+
"github.com/RoaringBitmap/roaring"
)
@@ -55,14 +57,15 @@ type VecPostingsIterator interface {
}
type VectorIndex interface {
- Search(qVector []float32, k int64, except *roaring.Bitmap) (VecPostingsList, error)
+ // @params: Search params for backing vector index (like IVF, HNSW, etc.)
+ Search(qVector []float32, k int64, params json.RawMessage) (VecPostingsList, error)
Close()
Size() uint64
}
type VectorSegment interface {
Segment
- InterpretVectorIndex(field string) (VectorIndex, error)
+ InterpretVectorIndex(field string, except *roaring.Bitmap) (VectorIndex, error)
}
type VecPosting interface {
diff --git a/vendor/github.com/blevesearch/zapx/v16/build.go b/vendor/github.com/blevesearch/zapx/v16/build.go
index a545b072b..99635739f 100644
--- a/vendor/github.com/blevesearch/zapx/v16/build.go
+++ b/vendor/github.com/blevesearch/zapx/v16/build.go
@@ -166,7 +166,6 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32,
memCRC: memCRC,
chunkMode: chunkMode,
fieldsMap: fieldsMap,
- fieldsInv: fieldsInv,
numDocs: numDocs,
storedIndexOffset: storedIndexOffset,
fieldsIndexOffset: sectionsIndexOffset,
@@ -175,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32,
docValueOffset: 0, // docValueOffsets identified automatically by the section
dictLocs: dictLocs,
fieldFSTs: make(map[uint16]*vellum.FST),
+ vecIndexCache: newVectorIndexCache(),
}
sb.updateSize()
diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go
new file mode 100644
index 000000000..893da2d5f
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go
@@ -0,0 +1,299 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package zap
+
+import (
+ "encoding/binary"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/RoaringBitmap/roaring"
+ faiss "github.com/blevesearch/go-faiss"
+)
+
+func newVectorIndexCache() *vectorIndexCache {
+ return &vectorIndexCache{
+ cache: make(map[uint16]*cacheEntry),
+ closeCh: make(chan struct{}),
+ }
+}
+
+type vectorIndexCache struct {
+ closeCh chan struct{}
+ m sync.RWMutex
+ cache map[uint16]*cacheEntry
+}
+
+func (vc *vectorIndexCache) Clear() {
+ vc.m.Lock()
+ close(vc.closeCh)
+
+ // forcing a close on all indexes to avoid memory leaks.
+ for _, entry := range vc.cache {
+ entry.close()
+ }
+ vc.cache = nil
+ vc.m.Unlock()
+}
+
+func (vc *vectorIndexCache) loadOrCreate(fieldID uint16, mem []byte, except *roaring.Bitmap) (
+ index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, err error) {
+ var found bool
+ index, vecDocIDMap, vecIDsToExclude, found = vc.loadFromCache(fieldID, except)
+ if !found {
+ index, vecDocIDMap, vecIDsToExclude, err = vc.createAndCache(fieldID, mem, except)
+ }
+ return index, vecDocIDMap, vecIDsToExclude, err
+}
+
+func (vc *vectorIndexCache) loadFromCache(fieldID uint16, except *roaring.Bitmap) (
+ index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, found bool) {
+ vc.m.RLock()
+ defer vc.m.RUnlock()
+
+ entry, ok := vc.cache[fieldID]
+ if !ok {
+ return nil, nil, nil, false
+ }
+
+ index, vecDocIDMap = entry.load()
+ vecIDsToExclude = getVecIDsToExclude(vecDocIDMap, except)
+
+ return index, vecDocIDMap, vecIDsToExclude, true
+}
+
+func (vc *vectorIndexCache) createAndCache(fieldID uint16, mem []byte, except *roaring.Bitmap) (
+ index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, err error) {
+ vc.m.Lock()
+ defer vc.m.Unlock()
+
+ // when there are multiple threads trying to build the index, guard redundant
+ // index creation by doing a double check and return if already created and
+ // cached.
+ entry, ok := vc.cache[fieldID]
+ if ok {
+ index, vecDocIDMap = entry.load()
+ vecIDsToExclude = getVecIDsToExclude(vecDocIDMap, except)
+ return index, vecDocIDMap, vecIDsToExclude, nil
+ }
+
+ // if the cache doesn't have entry, construct the vector to doc id map and the
+ // vector index out of the mem bytes and update the cache under lock.
+ pos := 0
+ numVecs, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+
+ vecDocIDMap = make(map[int64]uint32, numVecs)
+ isExceptNotEmpty := except != nil && !except.IsEmpty()
+ for i := 0; i < int(numVecs); i++ {
+ vecID, n := binary.Varint(mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+ docID, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+
+ docIDUint32 := uint32(docID)
+ if isExceptNotEmpty && except.Contains(docIDUint32) {
+ vecIDsToExclude = append(vecIDsToExclude, vecID)
+ continue
+ }
+ vecDocIDMap[vecID] = docIDUint32
+ }
+
+ indexSize, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
+ pos += n
+
+ index, err = faiss.ReadIndexFromBuffer(mem[pos:pos+int(indexSize)], faissIOFlags)
+ if err != nil {
+ return nil, nil, nil, err
+ }
+
+ vc.insertLOCKED(fieldID, index, vecDocIDMap)
+ return index, vecDocIDMap, vecIDsToExclude, nil
+}
+
+func (vc *vectorIndexCache) insertLOCKED(fieldIDPlus1 uint16,
+ index *faiss.IndexImpl, vecDocIDMap map[int64]uint32) {
+ // the first time we've hit the cache, try to spawn a monitoring routine
+ // which will reconcile the moving averages for all the fields being hit
+ if len(vc.cache) == 0 {
+ go vc.monitor()
+ }
+
+ _, ok := vc.cache[fieldIDPlus1]
+ if !ok {
+ // initializing the alpha with 0.4 essentially means that we are favoring
+ // the history a little bit more relative to the current sample value.
+ // this makes the average to be kept above the threshold value for a
+ // longer time and thereby the index to be resident in the cache
+ // for longer time.
+ vc.cache[fieldIDPlus1] = createCacheEntry(index, vecDocIDMap, 0.4)
+ }
+}
+
+func (vc *vectorIndexCache) incHit(fieldIDPlus1 uint16) {
+ vc.m.RLock()
+ entry, ok := vc.cache[fieldIDPlus1]
+ if ok {
+ entry.incHit()
+ }
+ vc.m.RUnlock()
+}
+
+func (vc *vectorIndexCache) decRef(fieldIDPlus1 uint16) {
+ vc.m.RLock()
+ entry, ok := vc.cache[fieldIDPlus1]
+ if ok {
+ entry.decRef()
+ }
+ vc.m.RUnlock()
+}
+
+func (vc *vectorIndexCache) cleanup() bool {
+ vc.m.Lock()
+ cache := vc.cache
+
+ // for every field reconcile the average with the current sample values
+ for fieldIDPlus1, entry := range cache {
+ sample := atomic.LoadUint64(&entry.tracker.sample)
+ entry.tracker.add(sample)
+
+ refCount := atomic.LoadInt64(&entry.refs)
+ // the comparison threshold as of now is (1 - a). mathematically it
+ // means that there is only 1 query per second on average as per history.
+ // and in the current second, there were no queries performed against
+ // this index.
+ if entry.tracker.avg <= (1-entry.tracker.alpha) && refCount <= 0 {
+ atomic.StoreUint64(&entry.tracker.sample, 0)
+ delete(vc.cache, fieldIDPlus1)
+ entry.close()
+ continue
+ }
+ atomic.StoreUint64(&entry.tracker.sample, 0)
+ }
+
+ rv := len(vc.cache) == 0
+ vc.m.Unlock()
+ return rv
+}
+
+var monitorFreq = 1 * time.Second
+
+func (vc *vectorIndexCache) monitor() {
+ ticker := time.NewTicker(monitorFreq)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-vc.closeCh:
+ return
+ case <-ticker.C:
+ exit := vc.cleanup()
+ if exit {
+ // no entries to be monitored, exit
+ return
+ }
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+type ewma struct {
+ alpha float64
+ avg float64
+ // every hit to the cache entry is recorded as part of a sample
+ // which will be used to calculate the average in the next cycle of average
+ // computation (which is average traffic for the field till now). this is
+ // used to track the per second hits to the cache entries.
+ sample uint64
+}
+
+func (e *ewma) add(val uint64) {
+ if e.avg == 0.0 {
+ e.avg = float64(val)
+ } else {
+ // the exponentially weighted moving average
+ // X(t) = a.v + (1 - a).X(t-1)
+ e.avg = e.alpha*float64(val) + (1-e.alpha)*e.avg
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+func createCacheEntry(index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, alpha float64) *cacheEntry {
+ return &cacheEntry{
+ index: index,
+ vecDocIDMap: vecDocIDMap,
+ tracker: &ewma{
+ alpha: alpha,
+ sample: 1,
+ },
+ refs: 1,
+ }
+}
+
+type cacheEntry struct {
+ tracker *ewma
+
+ // this is used to track the live references to the cache entry,
+ // such that while we do a cleanup() and we see that the avg is below a
+ // threshold we close/cleanup only if the live refs to the cache entry is 0.
+ refs int64
+
+ index *faiss.IndexImpl
+ vecDocIDMap map[int64]uint32
+}
+
+func (ce *cacheEntry) incHit() {
+ atomic.AddUint64(&ce.tracker.sample, 1)
+}
+
+func (ce *cacheEntry) addRef() {
+ atomic.AddInt64(&ce.refs, 1)
+}
+
+func (ce *cacheEntry) decRef() {
+ atomic.AddInt64(&ce.refs, -1)
+}
+
+func (ce *cacheEntry) load() (*faiss.IndexImpl, map[int64]uint32) {
+ ce.incHit()
+ ce.addRef()
+ return ce.index, ce.vecDocIDMap
+}
+
+func (ce *cacheEntry) close() {
+ go func() {
+ ce.index.Close()
+ ce.index = nil
+ ce.vecDocIDMap = nil
+ }()
+}
+
+// -----------------------------------------------------------------------------
+
+func getVecIDsToExclude(vecDocIDMap map[int64]uint32, except *roaring.Bitmap) (vecIDsToExclude []int64) {
+ if except != nil && !except.IsEmpty() {
+ for vecID, docID := range vecDocIDMap {
+ if except.Contains(docID) {
+ vecIDsToExclude = append(vecIDsToExclude, vecID)
+ }
+ }
+ }
+ return vecIDsToExclude
+}
diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache_nosup.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache_nosup.go
new file mode 100644
index 000000000..ff152f95c
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache_nosup.go
@@ -0,0 +1,27 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !vectors
+// +build !vectors
+
+package zap
+
+type vectorIndexCache struct {
+}
+
+func newVectorIndexCache() *vectorIndexCache {
+ return nil
+}
+
+func (v *vectorIndexCache) Clear() {}
diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_unix.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_unix.go
new file mode 100644
index 000000000..a4bb8a829
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_unix.go
@@ -0,0 +1,22 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors && !windows
+// +build vectors,!windows
+
+package zap
+
+import faiss "github.com/blevesearch/go-faiss"
+
+const faissIOFlags = faiss.IOFlagReadMmap | faiss.IOFlagSkipPrefetch
diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_win.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_win.go
new file mode 100644
index 000000000..de99f64d0
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_io_flags_win.go
@@ -0,0 +1,22 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors && windows
+// +build vectors,windows
+
+package zap
+
+import faiss "github.com/blevesearch/go-faiss"
+
+const faissIOFlags = faiss.IOFlagReadOnly
diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go
index 2104d53f9..e4275d76c 100644
--- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go
+++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go
@@ -19,6 +19,7 @@ package zap
import (
"encoding/binary"
+ "encoding/json"
"math"
"reflect"
@@ -266,14 +267,14 @@ func (vpl *VecPostingsIterator) BytesWritten() uint64 {
// vectorIndexWrapper conforms to scorch_segment_api's VectorIndex interface
type vectorIndexWrapper struct {
- search func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error)
+ search func(qVector []float32, k int64, params json.RawMessage) (segment.VecPostingsList, error)
close func()
size func() uint64
}
-func (i *vectorIndexWrapper) Search(qVector []float32, k int64, except *roaring.Bitmap) (
+func (i *vectorIndexWrapper) Search(qVector []float32, k int64, params json.RawMessage) (
segment.VecPostingsList, error) {
- return i.search(qVector, k, except)
+ return i.search(qVector, k, params)
}
func (i *vectorIndexWrapper) Close() {
@@ -284,21 +285,23 @@ func (i *vectorIndexWrapper) Size() uint64 {
return i.size()
}
-// InterpretVectorIndex returns closures that will allow the caller to -
-// (1) SearchVectorIndex - search within an attached vector index
-// (2) CloseVectorIndex - close attached vector index
-//
-// These function pointers may be nil, when InterpretVectorIndex return a non-nil err.
-// It is on the caller to ensure CloseVectorIndex is invoked (sync or async) after
-// their business with the attached vector index concludes.
-func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex, error) {
+// InterpretVectorIndex returns a construct of closures (vectorIndexWrapper)
+// that will allow the caller to -
+// (1) search within an attached vector index
+// (2) close attached vector index
+// (3) get the size of the attached vector index
+func (sb *SegmentBase) InterpretVectorIndex(field string, except *roaring.Bitmap) (
+ segment.VectorIndex, error) {
// Params needed for the closures
var vecIndex *faiss.IndexImpl
- vecDocIDMap := make(map[int64]uint32)
+ var vecDocIDMap map[int64]uint32
+ var vectorIDsToExclude []int64
+ var fieldIDPlus1 uint16
+ var vecIndexSize uint64
var (
wrapVecIndex = &vectorIndexWrapper{
- search: func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error) {
+ search: func(qVector []float32, k int64, params json.RawMessage) (segment.VecPostingsList, error) {
// 1. returned postings list (of type PostingsList) has two types of information - docNum and its score.
// 2. both the values can be represented using roaring bitmaps.
// 3. the Iterator (of type PostingsIterator) returned would operate in terms of VecPostings.
@@ -315,17 +318,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
return rv, nil
}
- var vectorIDsToExclude []int64
- // iterate through the vector doc ID map and if the doc ID is one to be
- // deleted, add it to the list
- for vecID, docID := range vecDocIDMap {
- if except != nil && except.Contains(docID) {
- vectorIDsToExclude = append(vectorIDsToExclude, vecID)
- }
- }
-
- scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, vectorIDsToExclude)
-
+ scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, vectorIDsToExclude, params)
if err != nil {
return nil, err
}
@@ -335,7 +328,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
vecID := ids[i]
// Checking if it's present in the vecDocIDMap.
// If -1 is returned as an ID(insufficient vectors), this will ensure
- // they it isn't added to the final postings list.
+ // it isn't added to the final postings list.
if docID, ok := vecDocIDMap[vecID]; ok {
code := getVectorCode(docID, scores[i])
rv.postings.Add(uint64(code))
@@ -345,22 +338,19 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
return rv, nil
},
close: func() {
- if vecIndex != nil {
- vecIndex.Close()
- }
+	// skip closing here: since the index is cached, the actual close is
+	// deferred to a later point in time.
+ sb.vecIndexCache.decRef(fieldIDPlus1)
},
size: func() uint64 {
- if vecIndex != nil {
- return vecIndex.Size()
- }
- return 0
+ return vecIndexSize
},
}
err error
)
- fieldIDPlus1 := sb.fieldsMap[field]
+ fieldIDPlus1 = sb.fieldsMap[field]
if fieldIDPlus1 <= 0 {
return wrapVecIndex, nil
}
@@ -382,25 +372,13 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
pos += n
}
- // read the number vectors indexed for this field and load the vector to docID mapping.
- // todo: cache the vecID to docIDs mapping for a fieldID
- numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
- pos += n
- for i := 0; i < int(numVecs); i++ {
- vecID, n := binary.Varint(sb.mem[pos : pos+binary.MaxVarintLen64])
- pos += n
- docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
- pos += n
- vecDocIDMap[vecID] = uint32(docID)
+ vecIndex, vecDocIDMap, vectorIDsToExclude, err =
+ sb.vecIndexCache.loadOrCreate(fieldIDPlus1, sb.mem[pos:], except)
+
+ if vecIndex != nil {
+ vecIndexSize = vecIndex.Size()
}
- // todo: not a good idea to cache the vector index perhaps, since it could be quite huge.
- indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
- pos += n
- indexBytes := sb.mem[pos : pos+int(indexSize)]
- pos += int(indexSize)
-
- vecIndex, err = faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
return wrapVecIndex, err
}
diff --git a/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go
index 2102fb5a9..c73bf0111 100644
--- a/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go
+++ b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go
@@ -31,6 +31,8 @@ import (
seg "github.com/blevesearch/scorch_segment_api/v2"
)
+const defaultFaissOMPThreads = 1
+
func init() {
rand.Seed(time.Now().UTC().UnixNano())
registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{})
@@ -38,6 +40,7 @@ func init() {
_, ok := field.(index.VectorField)
return ok
}
+ faiss.SetOMPThreads(defaultFaissOMPThreads)
}
type faissVectorIndexSection struct {
@@ -73,7 +76,7 @@ type vecIndexMeta struct {
indexOptimizedFor string
}
-// keep in mind with respect to update and delete operations with resepct to vectors
+// keep in mind with respect to update and delete operations with respect to vectors
func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase,
drops []*roaring.Bitmap, fieldsInv []string,
newDocNumsIn [][]uint64, w *CountHashWriter, closeCh chan struct{}) error {
@@ -275,7 +278,7 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
indexes []*vecIndexMeta, w *CountHashWriter, closeCh chan struct{}) error {
vecIndexes := make([]*faiss.IndexImpl, 0, len(sbs))
- reconsCap := 0
+ var finalVecIDCap, indexDataCap, reconsCap int
for segI, segBase := range sbs {
// Considering merge operations on vector indexes are expensive, it is
// worth including an early exit if the merge is aborted, saving us
@@ -286,14 +289,18 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
}
// read the index bytes. todo: parallelize this
indexBytes := segBase.mem[indexes[segI].startOffset : indexes[segI].startOffset+int(indexes[segI].indexSize)]
- index, err := faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
+ index, err := faiss.ReadIndexFromBuffer(indexBytes, faissIOFlags)
if err != nil {
freeReconstructedIndexes(vecIndexes)
return err
}
- indexReconsLen := len(indexes[segI].vecIds) * index.D()
- if indexReconsLen > reconsCap {
- reconsCap = indexReconsLen
+ if len(indexes[segI].vecIds) > 0 {
+ indexReconsLen := len(indexes[segI].vecIds) * index.D()
+ if indexReconsLen > reconsCap {
+ reconsCap = indexReconsLen
+ }
+ indexDataCap += indexReconsLen
+ finalVecIDCap += len(indexes[segI].vecIds)
}
vecIndexes = append(vecIndexes, index)
}
@@ -303,13 +310,6 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
return nil
}
- var mergedIndexBytes []byte
-
- // capacities for the finalVecIDs and indexData slices
- // to avoid multiple allocations, via append.
- finalVecIDCap := len(indexes[0].vecIds) * len(vecIndexes)
- indexDataCap := finalVecIDCap * vecIndexes[0].D()
-
finalVecIDs := make([]int64, 0, finalVecIDCap)
// merging of indexes with reconstruction method.
// the indexes[i].vecIds has only the valid vecs of this vector
@@ -347,25 +347,27 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
freeReconstructedIndexes(vecIndexes)
return nil
}
+ recons = nil
nvecs := len(finalVecIDs)
- // index type to be created after merge based on the number of vectors in
- // indexData added into the index.
- nlist := determineCentroids(nvecs)
- indexDescription, indexClass := determineIndexToUse(nvecs, nlist)
-
// safe to assume that all the indexes are of the same config values, given
// that they are extracted from the field mapping info.
dims := vecIndexes[0].D()
metric := vecIndexes[0].MetricType()
indexOptimizedFor := indexes[0].indexOptimizedFor
+ // index type to be created after merge based on the number of vectors
+ // in indexData added into the index.
+ nlist := determineCentroids(nvecs)
+ indexDescription, indexClass := determineIndexToUse(nvecs, nlist, indexOptimizedFor)
+
// freeing the reconstructed indexes immediately - waiting till the end
// to do the same is not needed because the following operations don't need
// the reconstructed ones anymore and doing so will hold up memory which can
// be detrimental while creating indexes during introduction.
freeReconstructedIndexes(vecIndexes)
+ vecIndexes = nil
faissIndex, err := faiss.IndexFactory(dims, indexDescription, metric)
if err != nil {
@@ -400,6 +402,9 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
return err
}
+ indexData = nil
+ finalVecIDs = nil
+ var mergedIndexBytes []byte
mergedIndexBytes, err = faiss.WriteIndexIntoBuffer(faissIndex)
if err != nil {
return err
@@ -435,10 +440,7 @@ func determineCentroids(nvecs int) int {
var nlist int
switch {
- // At 1M vectors, nlist = 4k gave a reasonably high recall with the right nprobe,
- // whereas 1M/100 = 10000 centroids would increase training time without
- // corresponding increase in recall
- case nvecs >= 1000000:
+ case nvecs >= 200000:
nlist = int(4 * math.Sqrt(float64(nvecs)))
case nvecs >= 1000:
// 100 points per cluster is a reasonable default, considering the default
@@ -457,7 +459,16 @@ const (
// Returns a description string for the index and quantizer type
// and an index type.
-func determineIndexToUse(nvecs, nlist int) (string, int) {
+func determineIndexToUse(nvecs, nlist int, indexOptimizedFor string) (string, int) {
+ if indexOptimizedFor == index.IndexOptimizedForMemoryEfficient {
+ switch {
+ case nvecs >= 1000:
+ return fmt.Sprintf("IVF%d,SQ4", nlist), IndexTypeIVF
+ default:
+ return "IDMap2,Flat", IndexTypeFlat
+ }
+ }
+
switch {
case nvecs >= 10000:
return fmt.Sprintf("IVF%d,SQ8", nlist), IndexTypeIVF
@@ -476,11 +487,11 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
for fieldID, content := range vo.vecFieldMap {
// calculate the capacity of the vecs and ids slices
// to avoid multiple allocations.
- vecs := make([]float32, 0, uint16(len(content.vecs))*content.dim)
+ vecs := make([]float32, 0, len(content.vecs)*int(content.dim))
ids := make([]int64, 0, len(content.vecs))
for hash, vecInfo := range content.vecs {
vecs = append(vecs, vecInfo.vec...)
- ids = append(ids, int64(hash))
+ ids = append(ids, hash)
}
var metric = faiss.MetricL2
@@ -490,7 +501,8 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
nvecs := len(ids)
nlist := determineCentroids(nvecs)
- indexDescription, indexClass := determineIndexToUse(nvecs, nlist)
+ indexDescription, indexClass := determineIndexToUse(nvecs, nlist,
+ content.indexOptimizedFor)
faissIndex, err := faiss.IndexFactory(int(content.dim), indexDescription, metric)
if err != nil {
return 0, err
@@ -518,12 +530,6 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
return 0, err
}
- // serialize the built index into a byte slice
- buf, err := faiss.WriteIndexIntoBuffer(faissIndex)
- if err != nil {
- return 0, err
- }
-
fieldStart := w.Count()
// writing out two offset values to indicate that the current field's
// vector section doesn't have valid doc value content within it.
@@ -557,7 +563,7 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
// section would be help avoiding in paging in this data as part of a page
// (which is to load a non-cacheable info like index). this could help the
// paging costs
- for vecID, _ := range content.vecs {
+ for vecID := range content.vecs {
docID := vo.vecIDMap[vecID].docID
// write the vecID
n = binary.PutVarint(tempBuf, vecID)
@@ -573,6 +579,12 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
}
}
+ // serialize the built index into a byte slice
+ buf, err := faiss.WriteIndexIntoBuffer(faissIndex)
+ if err != nil {
+ return 0, err
+ }
+
// record the fieldStart value for this section.
// write the vecID -> docID mapping
// write the index bytes and its length
diff --git a/vendor/github.com/blevesearch/zapx/v16/segment.go b/vendor/github.com/blevesearch/zapx/v16/segment.go
index 062abf2c3..8dce0856a 100644
--- a/vendor/github.com/blevesearch/zapx/v16/segment.go
+++ b/vendor/github.com/blevesearch/zapx/v16/segment.go
@@ -55,6 +55,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) {
SegmentBase: SegmentBase{
fieldsMap: make(map[string]uint16),
fieldFSTs: make(map[uint16]*vellum.FST),
+ vecIndexCache: newVectorIndexCache(),
fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)),
},
f: f,
@@ -81,7 +82,6 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) {
_ = rv.Close()
return nil, err
}
-
return rv, nil
}
@@ -110,6 +110,9 @@ type SegmentBase struct {
m sync.Mutex
fieldFSTs map[uint16]*vellum.FST
+
+ // this cache comes into play when vectors are supported in builds.
+ vecIndexCache *vectorIndexCache
}
func (sb *SegmentBase) Size() int {
@@ -146,7 +149,7 @@ func (sb *SegmentBase) updateSize() {
func (sb *SegmentBase) AddRef() {}
func (sb *SegmentBase) DecRef() (err error) { return nil }
-func (sb *SegmentBase) Close() (err error) { return nil }
+func (sb *SegmentBase) Close() (err error) { sb.vecIndexCache.Clear(); return nil }
// Segment implements a persisted segment.Segment interface, by
// embedding an mmap()'ed SegmentBase.
@@ -319,13 +322,29 @@ func (s *SegmentBase) loadFieldsNew() error {
return s.loadFields()
}
+ seek := pos + binary.MaxVarintLen64
+ if seek > uint64(len(s.mem)) {
+ // handling a buffer overflow case.
+ // a rare case where the backing buffer is not large enough to be read directly via
+ // a pos+binary.MaxVarinLen64 seek. For eg, this can happen when there is only
+ // one field to be indexed in the entire batch of data and while writing out
+ // these fields metadata, you write 1 + 8 bytes whereas the MaxVarintLen64 = 10.
+ seek = uint64(len(s.mem))
+ }
+
// read the number of fields
- numFields, sz := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64])
+ numFields, sz := binary.Uvarint(s.mem[pos:seek])
+ // here, the pos is incremented by the valid number bytes read from the buffer
+ // so in the edge case pointed out above the numFields = 1, the sz = 1 as well.
pos += uint64(sz)
s.incrementBytesRead(uint64(sz))
+ // the following loop will be executed only once in the edge case pointed out above
+	// since there is only one field's offset stored, which occupies 8 bytes.
+ // the pointer then seeks to a position preceding the sectionsIndexOffset, at
+	// which point the responsibility of handling the out-of-bounds cases shifts to
+ // the specific section's parsing logic.
var fieldID uint64
-
for fieldID < numFields {
addr := binary.BigEndian.Uint64(s.mem[pos : pos+8])
s.incrementBytesRead(8)
@@ -629,6 +648,9 @@ func (s *Segment) Close() (err error) {
}
func (s *Segment) closeActual() (err error) {
+ // clear contents from the vector index cache before un-mmapping
+ s.vecIndexCache.Clear()
+
if s.mm != nil {
err = s.mm.Unmap()
}
@@ -640,6 +662,7 @@ func (s *Segment) closeActual() (err error) {
err = err2
}
}
+
return
}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 43a0b01e6..691e05a75 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -64,7 +64,7 @@ github.com/ProtonMail/go-crypto/openpgp/internal/ecc
github.com/ProtonMail/go-crypto/openpgp/internal/encoding
github.com/ProtonMail/go-crypto/openpgp/packet
github.com/ProtonMail/go-crypto/openpgp/s2k
-# github.com/RoaringBitmap/roaring v1.2.3
+# github.com/RoaringBitmap/roaring v1.9.3
## explicit; go 1.14
github.com/RoaringBitmap/roaring
github.com/RoaringBitmap/roaring/internal
@@ -157,11 +157,11 @@ github.com/beorn7/perks/quantile
# github.com/bitly/go-simplejson v0.5.0
## explicit
github.com/bitly/go-simplejson
-# github.com/bits-and-blooms/bitset v1.2.1
-## explicit; go 1.14
+# github.com/bits-and-blooms/bitset v1.12.0
+## explicit; go 1.16
github.com/bits-and-blooms/bitset
-# github.com/blevesearch/bleve/v2 v2.4.0
-## explicit; go 1.20
+# github.com/blevesearch/bleve/v2 v2.4.2
+## explicit; go 1.21
github.com/blevesearch/bleve/v2
github.com/blevesearch/bleve/v2/analysis
github.com/blevesearch/bleve/v2/analysis/analyzer/custom
@@ -169,6 +169,10 @@ github.com/blevesearch/bleve/v2/analysis/analyzer/keyword
github.com/blevesearch/bleve/v2/analysis/analyzer/standard
github.com/blevesearch/bleve/v2/analysis/datetime/flexible
github.com/blevesearch/bleve/v2/analysis/datetime/optional
+github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds
+github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds
+github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds
+github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds
github.com/blevesearch/bleve/v2/analysis/lang/en
github.com/blevesearch/bleve/v2/analysis/token/lowercase
github.com/blevesearch/bleve/v2/analysis/token/porter
@@ -198,14 +202,14 @@ github.com/blevesearch/bleve/v2/search/scorer
github.com/blevesearch/bleve/v2/search/searcher
github.com/blevesearch/bleve/v2/size
github.com/blevesearch/bleve/v2/util
-# github.com/blevesearch/bleve_index_api v1.1.6
+# github.com/blevesearch/bleve_index_api v1.1.10
## explicit; go 1.20
github.com/blevesearch/bleve_index_api
# github.com/blevesearch/geo v0.1.20
## explicit; go 1.18
github.com/blevesearch/geo/geojson
github.com/blevesearch/geo/s2
-# github.com/blevesearch/go-faiss v1.0.13
+# github.com/blevesearch/go-faiss v1.0.20
## explicit; go 1.19
github.com/blevesearch/go-faiss
# github.com/blevesearch/go-porterstemmer v1.0.3
@@ -217,7 +221,7 @@ github.com/blevesearch/gtreap
# github.com/blevesearch/mmap-go v1.0.4
## explicit; go 1.13
github.com/blevesearch/mmap-go
-# github.com/blevesearch/scorch_segment_api/v2 v2.2.9
+# github.com/blevesearch/scorch_segment_api/v2 v2.2.15
## explicit; go 1.20
github.com/blevesearch/scorch_segment_api/v2
# github.com/blevesearch/segment v0.9.1
@@ -251,7 +255,7 @@ github.com/blevesearch/zapx/v14
# github.com/blevesearch/zapx/v15 v15.3.13
## explicit; go 1.19
github.com/blevesearch/zapx/v15
-# github.com/blevesearch/zapx/v16 v16.0.12
+# github.com/blevesearch/zapx/v16 v16.1.5
## explicit; go 1.20
github.com/blevesearch/zapx/v16
# github.com/bluele/gcache v0.0.2