mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-05-04 01:39:16 -05:00
chore(deps): bump github.com/blevesearch/bleve/v2 from 2.4.0 to 2.4.2
Bumps [github.com/blevesearch/bleve/v2](https://github.com/blevesearch/bleve) from 2.4.0 to 2.4.2. - [Release notes](https://github.com/blevesearch/bleve/releases) - [Commits](https://github.com/blevesearch/bleve/compare/v2.4.0...v2.4.2) --- updated-dependencies: - dependency-name: github.com/blevesearch/bleve/v2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>
This commit is contained in:
committed by
Ralf Haferkamp
parent
840eb734fa
commit
d31e179e86
@@ -11,7 +11,7 @@ require (
|
||||
github.com/Nerzal/gocloak/v13 v13.9.0
|
||||
github.com/bbalet/stopwords v1.0.0
|
||||
github.com/beevik/etree v1.4.0
|
||||
github.com/blevesearch/bleve/v2 v2.4.0
|
||||
github.com/blevesearch/bleve/v2 v2.4.2
|
||||
github.com/cenkalti/backoff v2.2.1+incompatible
|
||||
github.com/coreos/go-oidc/v3 v3.10.0
|
||||
github.com/cs3org/go-cs3apis v0.0.0-20240724121416-062c4e3046cb
|
||||
@@ -126,7 +126,7 @@ require (
|
||||
github.com/Microsoft/go-winio v0.6.2 // indirect
|
||||
github.com/OneOfOne/xxhash v1.2.8 // indirect
|
||||
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 // indirect
|
||||
github.com/RoaringBitmap/roaring v1.2.3 // indirect
|
||||
github.com/RoaringBitmap/roaring v1.9.3 // indirect
|
||||
github.com/agnivade/levenshtein v1.1.1 // indirect
|
||||
github.com/ajg/form v1.5.1 // indirect
|
||||
github.com/alexedwards/argon2id v1.0.0 // indirect
|
||||
@@ -137,14 +137,14 @@ require (
|
||||
github.com/aws/aws-sdk-go v1.45.1 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/bitly/go-simplejson v0.5.0 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.2.1 // indirect
|
||||
github.com/blevesearch/bleve_index_api v1.1.6 // indirect
|
||||
github.com/bits-and-blooms/bitset v1.12.0 // indirect
|
||||
github.com/blevesearch/bleve_index_api v1.1.10 // indirect
|
||||
github.com/blevesearch/geo v0.1.20 // indirect
|
||||
github.com/blevesearch/go-faiss v1.0.13 // indirect
|
||||
github.com/blevesearch/go-faiss v1.0.20 // indirect
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
|
||||
github.com/blevesearch/gtreap v0.1.1 // indirect
|
||||
github.com/blevesearch/mmap-go v1.0.4 // indirect
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.9 // indirect
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.15 // indirect
|
||||
github.com/blevesearch/segment v0.9.1 // indirect
|
||||
github.com/blevesearch/snowballstem v0.9.0 // indirect
|
||||
github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect
|
||||
@@ -154,7 +154,7 @@ require (
|
||||
github.com/blevesearch/zapx/v13 v13.3.10 // indirect
|
||||
github.com/blevesearch/zapx/v14 v14.3.10 // indirect
|
||||
github.com/blevesearch/zapx/v15 v15.3.13 // indirect
|
||||
github.com/blevesearch/zapx/v16 v16.0.12 // indirect
|
||||
github.com/blevesearch/zapx/v16 v16.1.5 // indirect
|
||||
github.com/bluele/gcache v0.0.2 // indirect
|
||||
github.com/bmizerany/pat v0.0.0-20210406213842-e4b6760bdd6f // indirect
|
||||
github.com/bombsimon/logrusr/v3 v3.1.0 // indirect
|
||||
|
||||
@@ -822,8 +822,8 @@ github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdII
|
||||
github.com/OpenDNS/vegadns2client v0.0.0-20180418235048-a3fa4a771d87/go.mod h1:iGLljf5n9GjT6kc0HBvyI1nOKnGQbNB66VzSNbK5iks=
|
||||
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 h1:kkhsdkhsCvIsutKu5zLMgWtgh9YxGCNAw8Ad8hjwfYg=
|
||||
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0=
|
||||
github.com/RoaringBitmap/roaring v1.2.3 h1:yqreLINqIrX22ErkKI0vY47/ivtJr6n+kMhVOVmhWBY=
|
||||
github.com/RoaringBitmap/roaring v1.2.3/go.mod h1:plvDsJQpxOC5bw8LRteu/MLWHsHez/3y6cubLI4/1yE=
|
||||
github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4S2OByM=
|
||||
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
|
||||
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
|
||||
github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI=
|
||||
github.com/aduffeck/gowebdav v0.0.0-20231215102054-212d4a4374f6 h1:ws0yvsikTQdmheKINP16tBzAHdttrHwbz/q3Fgl9X1Y=
|
||||
@@ -893,26 +893,25 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
|
||||
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
|
||||
github.com/bitly/go-simplejson v0.5.0 h1:6IH+V8/tVMab511d5bn4M7EwGXZf9Hj6i2xSwkNEM+Y=
|
||||
github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA=
|
||||
github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
|
||||
github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY=
|
||||
github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
|
||||
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
|
||||
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84=
|
||||
github.com/blevesearch/bleve/v2 v2.4.0 h1:2xyg+Wv60CFHYccXc+moGxbL+8QKT/dZK09AewHgKsg=
|
||||
github.com/blevesearch/bleve/v2 v2.4.0/go.mod h1:IhQHoFAbHgWKYavb9rQgQEJJVMuY99cKdQ0wPpst2aY=
|
||||
github.com/blevesearch/bleve_index_api v1.1.6 h1:orkqDFCBuNU2oHW9hN2YEJmet+TE9orml3FCGbl1cKk=
|
||||
github.com/blevesearch/bleve_index_api v1.1.6/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
|
||||
github.com/blevesearch/bleve/v2 v2.4.2 h1:NooYP1mb3c0StkiY9/xviiq2LGSaE8BQBCc/pirMx0U=
|
||||
github.com/blevesearch/bleve/v2 v2.4.2/go.mod h1:ATNKj7Yl2oJv/lGuF4kx39bST2dveX6w0th2FFYLkc8=
|
||||
github.com/blevesearch/bleve_index_api v1.1.10 h1:PDLFhVjrjQWr6jCuU7TwlmByQVCSEURADHdCqVS9+g0=
|
||||
github.com/blevesearch/bleve_index_api v1.1.10/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
|
||||
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
|
||||
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
|
||||
github.com/blevesearch/go-faiss v1.0.13 h1:zfFs7ZYD0NqXVSY37j0JZjZT1BhE9AE4peJfcx/NB4A=
|
||||
github.com/blevesearch/go-faiss v1.0.13/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8=
|
||||
github.com/blevesearch/go-faiss v1.0.20 h1:AIkdTQFWuZ5LQmKQSebgMR4RynGNw8ZseJXaan5kvtI=
|
||||
github.com/blevesearch/go-faiss v1.0.20/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8=
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
|
||||
github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
|
||||
github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y=
|
||||
github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk=
|
||||
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
|
||||
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.9 h1:3nBaSBRFokjE4FtPW3eUDgcAu3KphBg1GP07zy/6Uyk=
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.9/go.mod h1:ckbeb7knyOOvAdZinn/ASbB7EA3HoagnJkmEV3J7+sg=
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.15 h1:prV17iU/o+A8FiZi9MXmqbagd8I0bCqM7OKUYPbnb5Y=
|
||||
github.com/blevesearch/scorch_segment_api/v2 v2.2.15/go.mod h1:db0cmP03bPNadXrCDuVkKLV6ywFSiRgPFT1YVrestBc=
|
||||
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
|
||||
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
|
||||
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
|
||||
@@ -931,8 +930,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
|
||||
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
|
||||
github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ=
|
||||
github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
|
||||
github.com/blevesearch/zapx/v16 v16.0.12 h1:Uccxvjmn+hQ6ywQP+wIiTpdq9LnAviGoryJOmGwAo/I=
|
||||
github.com/blevesearch/zapx/v16 v16.0.12/go.mod h1:MYnOshRfSm4C4drxx1LGRI+MVFByykJ2anDY1fxdk9Q=
|
||||
github.com/blevesearch/zapx/v16 v16.1.5 h1:b0sMcarqNFxuXvjoXsF8WtwVahnxyhEvBSRJi/AUHjU=
|
||||
github.com/blevesearch/zapx/v16 v16.1.5/go.mod h1:J4mSF39w1QELc11EWRSBFkPeZuO7r/NPKkHzDCoiaI8=
|
||||
github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw=
|
||||
github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0=
|
||||
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
|
||||
|
||||
-107
@@ -1,107 +0,0 @@
|
||||
.PHONY: help all test format fmtcheck vet lint qa deps clean nuke ser fetch-real-roaring-datasets
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Display general help about this command
|
||||
help:
|
||||
@echo ""
|
||||
@echo "The following commands are available:"
|
||||
@echo ""
|
||||
@echo " make qa : Run all the tests"
|
||||
@echo " make test : Run the unit tests"
|
||||
@echo ""
|
||||
@echo " make format : Format the source code"
|
||||
@echo " make fmtcheck : Check if the source code has been formatted"
|
||||
@echo " make vet : Check for suspicious constructs"
|
||||
@echo " make lint : Check for style errors"
|
||||
@echo ""
|
||||
@echo " make deps : Get the dependencies"
|
||||
@echo " make clean : Remove any build artifact"
|
||||
@echo " make nuke : Deletes any intermediate file"
|
||||
@echo ""
|
||||
@echo " make fuzz-smat : Fuzzy testing with smat"
|
||||
@echo " make fuzz-stream : Fuzzy testing with stream deserialization"
|
||||
@echo " make fuzz-buffer : Fuzzy testing with buffer deserialization"
|
||||
@echo ""
|
||||
|
||||
# Alias for help target
|
||||
all: help
|
||||
test:
|
||||
go test
|
||||
go test -race -run TestConcurrent*
|
||||
# Format the source code
|
||||
format:
|
||||
@find ./ -type f -name "*.go" -exec gofmt -w {} \;
|
||||
|
||||
# Check if the source code has been formatted
|
||||
fmtcheck:
|
||||
@mkdir -p target
|
||||
@find ./ -type f -name "*.go" -exec gofmt -d {} \; | tee target/format.diff
|
||||
@test ! -s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; }
|
||||
|
||||
# Check for syntax errors
|
||||
vet:
|
||||
GOPATH=$(GOPATH) go vet ./...
|
||||
|
||||
# Check for style errors
|
||||
lint:
|
||||
GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint ./...
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Alias to run all quality-assurance checks
|
||||
qa: fmtcheck test vet lint
|
||||
|
||||
# --- INSTALL ---
|
||||
|
||||
# Get the dependencies
|
||||
deps:
|
||||
GOPATH=$(GOPATH) go get github.com/stretchr/testify
|
||||
GOPATH=$(GOPATH) go get github.com/bits-and-blooms/bitset
|
||||
GOPATH=$(GOPATH) go get github.com/golang/lint/golint
|
||||
GOPATH=$(GOPATH) go get github.com/mschoch/smat
|
||||
GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz
|
||||
GOPATH=$(GOPATH) go get github.com/dvyukov/go-fuzz/go-fuzz-build
|
||||
GOPATH=$(GOPATH) go get github.com/glycerine/go-unsnap-stream
|
||||
GOPATH=$(GOPATH) go get github.com/philhofer/fwd
|
||||
GOPATH=$(GOPATH) go get github.com/jtolds/gls
|
||||
|
||||
fuzz-smat:
|
||||
go test -tags=gofuzz -run=TestGenerateSmatCorpus
|
||||
go-fuzz-build -func FuzzSmat github.com/RoaringBitmap/roaring
|
||||
go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
|
||||
|
||||
|
||||
fuzz-stream:
|
||||
go-fuzz-build -func FuzzSerializationStream github.com/RoaringBitmap/roaring
|
||||
go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
|
||||
|
||||
|
||||
fuzz-buffer:
|
||||
go-fuzz-build -func FuzzSerializationBuffer github.com/RoaringBitmap/roaring
|
||||
go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200
|
||||
|
||||
# Remove any build artifact
|
||||
clean:
|
||||
GOPATH=$(GOPATH) go clean ./...
|
||||
|
||||
# Deletes any intermediate file
|
||||
nuke:
|
||||
rm -rf ./target
|
||||
GOPATH=$(GOPATH) go clean -i ./...
|
||||
|
||||
cover:
|
||||
go test -coverprofile=coverage.out
|
||||
go tool cover -html=coverage.out
|
||||
|
||||
fetch-real-roaring-datasets:
|
||||
# pull github.com/RoaringBitmap/real-roaring-datasets -> testdata/real-roaring-datasets
|
||||
git submodule init
|
||||
git submodule update
|
||||
+8
-6
@@ -1,5 +1,7 @@
|
||||
roaring [](https://godoc.org/github.com/RoaringBitmap/roaring/roaring64) [](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
|
||||
[](https://cloud.drone.io/RoaringBitmap/roaring)
|
||||
# roaring
|
||||
|
||||
[](https://godoc.org/github.com/RoaringBitmap/roaring) [](https://goreportcard.com/report/github.com/RoaringBitmap/roaring)
|
||||
|
||||

|
||||

|
||||

|
||||
@@ -31,17 +33,17 @@ Roaring bitmaps are found to work well in many important applications:
|
||||
|
||||
The ``roaring`` Go library is used by
|
||||
* [anacrolix/torrent](https://github.com/anacrolix/torrent)
|
||||
* [runv](https://github.com/hyperhq/runv)
|
||||
* [InfluxDB](https://www.influxdata.com)
|
||||
* [Pilosa](https://www.pilosa.com/)
|
||||
* [Bleve](http://www.blevesearch.com)
|
||||
* [Weaviate](https://github.com/weaviate/weaviate)
|
||||
* [lindb](https://github.com/lindb/lindb)
|
||||
* [Elasticell](https://github.com/deepfabric/elasticell)
|
||||
* [SourceGraph](https://github.com/sourcegraph/sourcegraph)
|
||||
* [M3](https://github.com/m3db/m3)
|
||||
* [trident](https://github.com/NetApp/trident)
|
||||
* [Husky](https://www.datadoghq.com/blog/engineering/introducing-husky/)
|
||||
|
||||
* [FrostDB](https://github.com/polarsignals/frostdb)
|
||||
|
||||
This library is used in production in several systems; it is part of the [Awesome Go collection](https://awesome-go.com).
|
||||
|
||||
@@ -99,7 +101,7 @@ whether you like it or not. That can become very wasteful.
|
||||
|
||||
This being said, there are definitely cases where attempting to use compressed bitmaps is wasteful.
|
||||
For example, if you have a small universe size. E.g., your bitmaps represent sets of integers
|
||||
from [0,n) where n is small (e.g., n=64 or n=128). If you are able to uncompressed BitSet and
|
||||
from [0,n) where n is small (e.g., n=64 or n=128). If you can use uncompressed BitSet and
|
||||
it does not blow up your memory usage, then compressed bitmaps are probably not useful
|
||||
to you. In fact, if you do not need compression, then a BitSet offers remarkable speed.
|
||||
|
||||
@@ -134,7 +136,7 @@ There is a big problem with these formats however that can hurt you badly in som
|
||||
|
||||
Roaring solves this problem. It works in the following manner. It divides the data into chunks of 2<sup>16</sup> integers
|
||||
(e.g., [0, 2<sup>16</sup>), [2<sup>16</sup>, 2 x 2<sup>16</sup>), ...). Within a chunk, it can use an uncompressed bitmap, a simple list of integers,
|
||||
or a list of runs. Whatever format it uses, they all allow you to check for the present of any one value quickly
|
||||
or a list of runs. Whatever format it uses, they all allow you to check for the presence of any one value quickly
|
||||
(e.g., with a binary search). The net result is that Roaring can compute many operations much faster than run-length-encoded
|
||||
formats like WAH, EWAH, Concise... Maybe surprisingly, Roaring also generally offers better compression ratios.
|
||||
|
||||
|
||||
+58
-5
@@ -17,8 +17,17 @@ func (ac *arrayContainer) String() string {
|
||||
}
|
||||
|
||||
func (ac *arrayContainer) fillLeastSignificant16bits(x []uint32, i int, mask uint32) int {
|
||||
if i < 0 {
|
||||
panic("negative index")
|
||||
}
|
||||
if len(ac.content) == 0 {
|
||||
return i
|
||||
}
|
||||
_ = x[len(ac.content)-1+i]
|
||||
_ = ac.content[len(ac.content)-1]
|
||||
for k := 0; k < len(ac.content); k++ {
|
||||
x[k+i] = uint32(ac.content[k]) | mask
|
||||
x[k+i] =
|
||||
uint32(ac.content[k]) | mask
|
||||
}
|
||||
return i + len(ac.content)
|
||||
}
|
||||
@@ -655,10 +664,54 @@ func (ac *arrayContainer) iandNot(a container) container {
|
||||
}
|
||||
|
||||
func (ac *arrayContainer) iandNotRun16(rc *runContainer16) container {
|
||||
rcb := rc.toBitmapContainer()
|
||||
acb := ac.toBitmapContainer()
|
||||
acb.iandNotBitmapSurely(rcb)
|
||||
*ac = *(acb.toArrayContainer())
|
||||
// Fast path: if either the array container or the run container is empty, the result is the array.
|
||||
if ac.isEmpty() || rc.isEmpty() {
|
||||
// Empty
|
||||
return ac
|
||||
}
|
||||
// Fast path: if the run container is full, the result is empty.
|
||||
if rc.isFull() {
|
||||
ac.content = ac.content[:0]
|
||||
return ac
|
||||
}
|
||||
current_run := 0
|
||||
// All values in [start_run, end_end] are part of the run
|
||||
start_run := rc.iv[current_run].start
|
||||
end_end := start_run + rc.iv[current_run].length
|
||||
// We are going to read values in the array at index i, and we are
|
||||
// going to write them at index pos. So we do in-place processing.
|
||||
// We always have that pos <= i by construction. So we can either
|
||||
// overwrite a value just read, or a value that was previously read.
|
||||
pos := 0
|
||||
i := 0
|
||||
for ; i < len(ac.content); i++ {
|
||||
if ac.content[i] < start_run {
|
||||
// the value in the array appears before the run [start_run, end_end]
|
||||
ac.content[pos] = ac.content[i]
|
||||
pos++
|
||||
} else if ac.content[i] <= end_end {
|
||||
// nothing to do, the value is in the array but also in the run.
|
||||
} else {
|
||||
// We have the value in the array after the run. We cannot tell
|
||||
// whether we need to keep it or not. So let us move to another run.
|
||||
if current_run+1 < len(rc.iv) {
|
||||
current_run++
|
||||
start_run = rc.iv[current_run].start
|
||||
end_end = start_run + rc.iv[current_run].length
|
||||
i-- // retry with the same i
|
||||
} else {
|
||||
// We have exhausted the number of runs. We can keep the rest of the values
|
||||
// from i to len(ac.content) - 1 inclusively.
|
||||
break // We are done, the rest of the array will be kept
|
||||
}
|
||||
}
|
||||
}
|
||||
for ; i < len(ac.content); i++ {
|
||||
ac.content[pos] = ac.content[i]
|
||||
pos++
|
||||
}
|
||||
// We 'shrink' the slice.
|
||||
ac.content = ac.content[:pos]
|
||||
return ac
|
||||
}
|
||||
|
||||
|
||||
+58
-5
@@ -888,13 +888,67 @@ func (bc *bitmapContainer) iandNot(a container) container {
|
||||
}
|
||||
|
||||
func (bc *bitmapContainer) iandNotArray(ac *arrayContainer) container {
|
||||
acb := ac.toBitmapContainer()
|
||||
return bc.iandNotBitmapSurely(acb)
|
||||
if ac.isEmpty() || bc.isEmpty() {
|
||||
// Nothing to do.
|
||||
return bc
|
||||
}
|
||||
|
||||
// Word by word, we remove the elements in ac from bc. The approach is to build
|
||||
// a mask of the elements to remove, and then apply it to the bitmap.
|
||||
wordIdx := uint16(0)
|
||||
mask := uint64(0)
|
||||
for i, v := range ac.content {
|
||||
if v/64 != wordIdx {
|
||||
// Flush the current word.
|
||||
if i != 0 {
|
||||
// We're removing bits that are set in the mask and in the current word.
|
||||
// To figure out the cardinality change, we count the number of bits that
|
||||
// are set in the mask and in the current word.
|
||||
mask &= bc.bitmap[wordIdx]
|
||||
bc.bitmap[wordIdx] &= ^mask
|
||||
bc.cardinality -= int(popcount(mask))
|
||||
}
|
||||
|
||||
wordIdx = v / 64
|
||||
mask = 0
|
||||
}
|
||||
mask |= 1 << (v % 64)
|
||||
}
|
||||
|
||||
// Flush the last word.
|
||||
mask &= bc.bitmap[wordIdx]
|
||||
bc.bitmap[wordIdx] &= ^mask
|
||||
bc.cardinality -= int(popcount(mask))
|
||||
|
||||
if bc.getCardinality() <= arrayDefaultMaxSize {
|
||||
return bc.toArrayContainer()
|
||||
}
|
||||
return bc
|
||||
}
|
||||
|
||||
func (bc *bitmapContainer) iandNotRun16(rc *runContainer16) container {
|
||||
rcb := rc.toBitmapContainer()
|
||||
return bc.iandNotBitmapSurely(rcb)
|
||||
if rc.isEmpty() || bc.isEmpty() {
|
||||
// Nothing to do.
|
||||
return bc
|
||||
}
|
||||
|
||||
wordRangeStart := rc.iv[0].start / 64
|
||||
wordRangeEnd := (rc.iv[len(rc.iv)-1].last()) / 64 // inclusive
|
||||
|
||||
cardinalityChange := popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1]) // before cardinality - after cardinality (for word range)
|
||||
|
||||
for _, iv := range rc.iv {
|
||||
resetBitmapRange(bc.bitmap, int(iv.start), int(iv.last())+1)
|
||||
}
|
||||
|
||||
cardinalityChange -= popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1])
|
||||
|
||||
bc.cardinality -= int(cardinalityChange)
|
||||
|
||||
if bc.getCardinality() <= arrayDefaultMaxSize {
|
||||
return bc.toArrayContainer()
|
||||
}
|
||||
return bc
|
||||
}
|
||||
|
||||
func (bc *bitmapContainer) andNotArray(value2 *arrayContainer) container {
|
||||
@@ -1062,7 +1116,6 @@ func (bc *bitmapContainer) PrevSetBit(i int) int {
|
||||
|
||||
// reference the java implementation
|
||||
// https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/BitmapContainer.java#L875-L892
|
||||
//
|
||||
func (bc *bitmapContainer) numberOfRuns() int {
|
||||
if bc.cardinality == 0 {
|
||||
return 0
|
||||
|
||||
+56
-7
@@ -10,6 +10,11 @@ type ByteInput interface {
|
||||
// Next returns a slice containing the next n bytes from the buffer,
|
||||
// advancing the buffer as if the bytes had been returned by Read.
|
||||
Next(n int) ([]byte, error)
|
||||
// NextReturnsSafeSlice returns true if Next() returns a safe slice as opposed
|
||||
// to a slice that points to an underlying buffer possibly owned by another system.
|
||||
// When NextReturnsSafeSlice returns false, the result from Next() should be copied
|
||||
// before it is modified (i.e., it is immutable).
|
||||
NextReturnsSafeSlice() bool
|
||||
// ReadUInt32 reads uint32 with LittleEndian order
|
||||
ReadUInt32() (uint32, error)
|
||||
// ReadUInt16 reads uint16 with LittleEndian order
|
||||
@@ -42,6 +47,25 @@ type ByteBuffer struct {
|
||||
off int
|
||||
}
|
||||
|
||||
// NewByteBuffer creates a new ByteBuffer.
|
||||
func NewByteBuffer(buf []byte) *ByteBuffer {
|
||||
return &ByteBuffer{
|
||||
buf: buf,
|
||||
}
|
||||
}
|
||||
|
||||
var _ io.Reader = (*ByteBuffer)(nil)
|
||||
|
||||
// Read implements io.Reader.
|
||||
func (b *ByteBuffer) Read(p []byte) (int, error) {
|
||||
data, err := b.Next(len(p))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
copy(p, data)
|
||||
return len(data), nil
|
||||
}
|
||||
|
||||
// Next returns a slice containing the next n bytes from the reader
|
||||
// If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned
|
||||
func (b *ByteBuffer) Next(n int) ([]byte, error) {
|
||||
@@ -57,6 +81,12 @@ func (b *ByteBuffer) Next(n int) ([]byte, error) {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// NextReturnsSafeSlice returns false since ByteBuffer might hold
|
||||
// an array owned by some other system.
|
||||
func (b *ByteBuffer) NextReturnsSafeSlice() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ReadUInt32 reads uint32 with LittleEndian order
|
||||
func (b *ByteBuffer) ReadUInt32() (uint32, error) {
|
||||
if len(b.buf)-b.off < 4 {
|
||||
@@ -109,26 +139,45 @@ func (b *ByteBuffer) Reset(buf []byte) {
|
||||
type ByteInputAdapter struct {
|
||||
r io.Reader
|
||||
readBytes int
|
||||
buf [4]byte
|
||||
}
|
||||
|
||||
var _ io.Reader = (*ByteInputAdapter)(nil)
|
||||
|
||||
// Read implements io.Reader.
|
||||
func (b *ByteInputAdapter) Read(buf []byte) (int, error) {
|
||||
m, err := io.ReadAtLeast(b.r, buf, len(buf))
|
||||
b.readBytes += m
|
||||
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// Next returns a slice containing the next n bytes from the buffer,
|
||||
// advancing the buffer as if the bytes had been returned by Read.
|
||||
func (b *ByteInputAdapter) Next(n int) ([]byte, error) {
|
||||
buf := make([]byte, n)
|
||||
m, err := io.ReadAtLeast(b.r, buf, n)
|
||||
b.readBytes += m
|
||||
_, err := b.Read(buf)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// NextReturnsSafeSlice returns true since ByteInputAdapter always returns a slice
|
||||
// allocated with make([]byte, ...)
|
||||
func (b *ByteInputAdapter) NextReturnsSafeSlice() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// ReadUInt32 reads uint32 with LittleEndian order
|
||||
func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
|
||||
buf, err := b.Next(4)
|
||||
|
||||
buf := b.buf[:4]
|
||||
_, err := b.Read(buf)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
@@ -138,8 +187,8 @@ func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
|
||||
|
||||
// ReadUInt16 reads uint16 with LittleEndian order
|
||||
func (b *ByteInputAdapter) ReadUInt16() (uint16, error) {
|
||||
buf, err := b.Next(2)
|
||||
|
||||
buf := b.buf[:2]
|
||||
_, err := b.Read(buf)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
+235
-32
@@ -13,6 +13,7 @@ import (
|
||||
"strconv"
|
||||
|
||||
"github.com/RoaringBitmap/roaring/internal"
|
||||
"github.com/bits-and-blooms/bitset"
|
||||
)
|
||||
|
||||
// Bitmap represents a compressed bitmap where you can add integers.
|
||||
@@ -53,17 +54,186 @@ func (rb *Bitmap) ToBytes() ([]byte, error) {
|
||||
return rb.highlowcontainer.toBytes()
|
||||
}
|
||||
|
||||
const wordSize = uint64(64)
|
||||
const log2WordSize = uint64(6)
|
||||
const capacity = ^uint64(0)
|
||||
const bitmapContainerSize = (1 << 16) / 64 // bitmap size in words
|
||||
|
||||
// DenseSize returns the size of the bitmap when stored as a dense bitmap.
|
||||
func (rb *Bitmap) DenseSize() uint64 {
|
||||
if rb.highlowcontainer.size() == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
maximum := 1 + uint64(rb.Maximum())
|
||||
if maximum > (capacity - wordSize + 1) {
|
||||
return uint64(capacity >> log2WordSize)
|
||||
}
|
||||
|
||||
return uint64((maximum + (wordSize - 1)) >> log2WordSize)
|
||||
}
|
||||
|
||||
// ToDense returns a slice of uint64s representing the bitmap as a dense bitmap.
|
||||
// Useful to convert a roaring bitmap to a format that can be used by other libraries
|
||||
// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
|
||||
func (rb *Bitmap) ToDense() []uint64 {
|
||||
sz := rb.DenseSize()
|
||||
if sz == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
bitmap := make([]uint64, sz)
|
||||
rb.WriteDenseTo(bitmap)
|
||||
return bitmap
|
||||
}
|
||||
|
||||
// FromDense creates a bitmap from a slice of uint64s representing the bitmap as a dense bitmap.
|
||||
// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
|
||||
// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
|
||||
//
|
||||
// This function will not create any run containers, only array and bitmap containers. It's up to
|
||||
// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
|
||||
//
|
||||
// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
|
||||
// This is useful when the bitmap is going to be modified after this function returns or if it's
|
||||
// undesirable to hold references to large bitmaps which the GC would not be able to collect.
|
||||
// One copy can still happen even when doCopy is false if the bitmap length is not divisible
|
||||
// by bitmapContainerSize.
|
||||
//
|
||||
// See also FromBitSet.
|
||||
func FromDense(bitmap []uint64, doCopy bool) *Bitmap {
|
||||
sz := (len(bitmap) + bitmapContainerSize - 1) / bitmapContainerSize // round up
|
||||
rb := &Bitmap{
|
||||
highlowcontainer: roaringArray{
|
||||
containers: make([]container, 0, sz),
|
||||
keys: make([]uint16, 0, sz),
|
||||
needCopyOnWrite: make([]bool, 0, sz),
|
||||
},
|
||||
}
|
||||
rb.FromDense(bitmap, doCopy)
|
||||
return rb
|
||||
}
|
||||
|
||||
// FromDense unmarshalls from a slice of uint64s representing the bitmap as a dense bitmap.
|
||||
// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
|
||||
// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
|
||||
// Callers are responsible for ensuring that the bitmap is empty before calling this function.
|
||||
//
|
||||
// This function will not create any run containers, only array and bitmap containers. It is up to
|
||||
// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
|
||||
//
|
||||
// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
|
||||
// This is useful when the bitmap is going to be modified after this function returns or if it's
|
||||
// undesirable to hold references to large bitmaps which the GC would not be able to collect.
|
||||
// One copy can still happen even when doCopy is false if the bitmap length is not divisible
|
||||
// by bitmapContainerSize.
|
||||
//
|
||||
// See FromBitSet.
|
||||
func (rb *Bitmap) FromDense(bitmap []uint64, doCopy bool) {
|
||||
if len(bitmap) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
var k uint16
|
||||
const size = bitmapContainerSize
|
||||
|
||||
for len(bitmap) > 0 {
|
||||
hi := size
|
||||
if len(bitmap) < size {
|
||||
hi = len(bitmap)
|
||||
}
|
||||
|
||||
words := bitmap[:hi]
|
||||
count := int(popcntSlice(words))
|
||||
|
||||
switch {
|
||||
case count > arrayDefaultMaxSize:
|
||||
c := &bitmapContainer{cardinality: count, bitmap: words}
|
||||
cow := true
|
||||
|
||||
if doCopy || len(words) < size {
|
||||
c.bitmap = make([]uint64, size)
|
||||
copy(c.bitmap, words)
|
||||
cow = false
|
||||
}
|
||||
|
||||
rb.highlowcontainer.appendContainer(k, c, cow)
|
||||
|
||||
case count > 0:
|
||||
c := &arrayContainer{content: make([]uint16, count)}
|
||||
var pos, base int
|
||||
for _, w := range words {
|
||||
for w != 0 {
|
||||
t := w & -w
|
||||
c.content[pos] = uint16(base + int(popcount(t-1)))
|
||||
pos++
|
||||
w ^= t
|
||||
}
|
||||
base += 64
|
||||
}
|
||||
rb.highlowcontainer.appendContainer(k, c, false)
|
||||
}
|
||||
|
||||
bitmap = bitmap[hi:]
|
||||
k++
|
||||
}
|
||||
}
|
||||
|
||||
// WriteDenseTo writes to a slice of uint64s representing the bitmap as a dense bitmap.
|
||||
// Callers are responsible for allocating enough space in the bitmap using DenseSize.
|
||||
// Useful to convert a roaring bitmap to a format that can be used by other libraries
|
||||
// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
|
||||
func (rb *Bitmap) WriteDenseTo(bitmap []uint64) {
|
||||
for i, ct := range rb.highlowcontainer.containers {
|
||||
hb := uint32(rb.highlowcontainer.keys[i]) << 16
|
||||
|
||||
switch c := ct.(type) {
|
||||
case *arrayContainer:
|
||||
for _, x := range c.content {
|
||||
n := int(hb | uint32(x))
|
||||
bitmap[n>>log2WordSize] |= uint64(1) << uint(x%64)
|
||||
}
|
||||
|
||||
case *bitmapContainer:
|
||||
copy(bitmap[int(hb)>>log2WordSize:], c.bitmap)
|
||||
|
||||
case *runContainer16:
|
||||
for j := range c.iv {
|
||||
start := uint32(c.iv[j].start)
|
||||
end := start + uint32(c.iv[j].length) + 1
|
||||
lo := int(hb|start) >> log2WordSize
|
||||
hi := int(hb|(end-1)) >> log2WordSize
|
||||
|
||||
if lo == hi {
|
||||
bitmap[lo] |= (^uint64(0) << uint(start%64)) &
|
||||
(^uint64(0) >> (uint(-end) % 64))
|
||||
continue
|
||||
}
|
||||
|
||||
bitmap[lo] |= ^uint64(0) << uint(start%64)
|
||||
for n := lo + 1; n < hi; n++ {
|
||||
bitmap[n] = ^uint64(0)
|
||||
}
|
||||
bitmap[hi] |= ^uint64(0) >> (uint(-end) % 64)
|
||||
}
|
||||
default:
|
||||
panic("unsupported container type")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checksum computes a hash (currently FNV-1a) for a bitmap that is suitable for
|
||||
// using bitmaps as elements in hash sets or as keys in hash maps, as well as
|
||||
// generally quicker comparisons.
|
||||
// The implementation is biased towards efficiency in little endian machines, so
|
||||
// expect some extra CPU cycles and memory to be used if your machine is big endian.
|
||||
// Likewise, don't use this to verify integrity unless you're certain you'll load
|
||||
// the bitmap on a machine with the same endianness used to create it.
|
||||
// Likewise, do not use this to verify integrity unless you are certain you will load
|
||||
// the bitmap on a machine with the same endianness used to create it. (Thankfully
|
||||
// very few people use big endian machines these days.)
|
||||
func (rb *Bitmap) Checksum() uint64 {
|
||||
const (
|
||||
offset = 14695981039346656037
|
||||
prime = 1099511628211
|
||||
prime = 1099511628211
|
||||
)
|
||||
|
||||
var bytes []byte
|
||||
@@ -106,6 +276,20 @@ func (rb *Bitmap) Checksum() uint64 {
|
||||
return hash
|
||||
}
|
||||
|
||||
// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
|
||||
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
|
||||
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
|
||||
// The containers in the resulting bitmap are immutable containers tied to the provided byte array and they rely on
|
||||
// copy-on-write which means that modifying them creates copies. Thus FromUnsafeBytes is more likely to be appropriate for read-only use cases,
|
||||
// when the resulting bitmap can be considered immutable.
|
||||
//
|
||||
// See also the FromBuffer function.
|
||||
// See https://github.com/RoaringBitmap/roaring/pull/395 for more details.
|
||||
func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
|
||||
stream := internal.NewByteBuffer(data)
|
||||
return rb.ReadFrom(stream)
|
||||
}
|
||||
|
||||
// ReadFrom reads a serialized version of this bitmap from stream.
|
||||
// The format is compatible with other RoaringBitmap
|
||||
// implementations (Java, C) and is documented here:
|
||||
@@ -114,12 +298,18 @@ func (rb *Bitmap) Checksum() uint64 {
|
||||
// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
|
||||
// It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly.
|
||||
func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
|
||||
stream := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
|
||||
stream.Reset(reader)
|
||||
stream, ok := reader.(internal.ByteInput)
|
||||
if !ok {
|
||||
byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
|
||||
byteInputAdapter.Reset(reader)
|
||||
stream = byteInputAdapter
|
||||
}
|
||||
|
||||
p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...)
|
||||
internal.ByteInputAdapterPool.Put(stream)
|
||||
|
||||
if !ok {
|
||||
internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -139,12 +329,17 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
|
||||
// You should *not* change the copy-on-write status of the resulting
|
||||
// bitmaps (SetCopyOnWrite).
|
||||
//
|
||||
// Thus FromBuffer is more likely to be appropriate for read-only use cases,
|
||||
// when the resulting bitmap can be considered immutable.
|
||||
//
|
||||
// If buf becomes unavailable, then a bitmap created with
|
||||
// FromBuffer would be effectively broken. Furthermore, any
|
||||
// bitmap derived from this bitmap (e.g., via Or, And) might
|
||||
// also be broken. Thus, before making buf unavailable, you should
|
||||
// call CloneCopyOnWriteContainers on all such bitmaps.
|
||||
//
|
||||
// See also the FromUnsafeBytes function which can have better performance
|
||||
// in some cases.
|
||||
func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) {
|
||||
stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer)
|
||||
stream.Reset(buf)
|
||||
@@ -194,6 +389,16 @@ func (rb *Bitmap) Clear() {
|
||||
rb.highlowcontainer.clear()
|
||||
}
|
||||
|
||||
// ToBitSet copies the content of the RoaringBitmap into a bitset.BitSet instance
|
||||
func (rb *Bitmap) ToBitSet() *bitset.BitSet {
|
||||
return bitset.From(rb.ToDense())
|
||||
}
|
||||
|
||||
// FromBitSet creates a new RoaringBitmap from a bitset.BitSet instance
|
||||
func FromBitSet(bitset *bitset.BitSet) *Bitmap {
|
||||
return FromDense(bitset.Bytes(), false)
|
||||
}
|
||||
|
||||
// ToArray creates a new slice containing all of the integers stored in the Bitmap in sorted order
|
||||
func (rb *Bitmap) ToArray() []uint32 {
|
||||
array := make([]uint32, rb.GetCardinality())
|
||||
@@ -233,7 +438,7 @@ func BoundSerializedSizeInBytes(cardinality uint64, universeSize uint64) uint64
|
||||
contnbr := (universeSize + uint64(65535)) / uint64(65536)
|
||||
if contnbr > cardinality {
|
||||
contnbr = cardinality
|
||||
// we can't have more containers than we have values
|
||||
// we cannot have more containers than we have values
|
||||
}
|
||||
headermax := 8*contnbr + 4
|
||||
if 4 > (contnbr+7)/8 {
|
||||
@@ -276,9 +481,9 @@ type intIterator struct {
|
||||
// This way, instead of making up-to 64k allocations per full iteration
|
||||
// we get a single allocation and simply reinitialize the appropriate
|
||||
// iterator and point to it in the generic `iter` member on each key bound.
|
||||
shortIter shortIterator
|
||||
runIter runIterator16
|
||||
bitmapIter bitmapContainerShortIterator
|
||||
shortIter shortIterator
|
||||
runIter runIterator16
|
||||
bitmapIter bitmapContainerShortIterator
|
||||
}
|
||||
|
||||
// HasNext returns true if there are more integers to iterate over
|
||||
@@ -341,14 +546,13 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) {
|
||||
// IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
|
||||
type IntIterator = intIterator
|
||||
|
||||
|
||||
// Initialize configures the existing iterator so that it can iterate through the values of
|
||||
// the provided bitmap.
|
||||
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
|
||||
func (p *intIterator) Initialize(a *Bitmap) {
|
||||
p.pos = 0
|
||||
p.highlowcontainer = &a.highlowcontainer
|
||||
p.init()
|
||||
func (ii *intIterator) Initialize(a *Bitmap) {
|
||||
ii.pos = 0
|
||||
ii.highlowcontainer = &a.highlowcontainer
|
||||
ii.init()
|
||||
}
|
||||
|
||||
type intReverseIterator struct {
|
||||
@@ -357,9 +561,9 @@ type intReverseIterator struct {
|
||||
iter shortIterable
|
||||
highlowcontainer *roaringArray
|
||||
|
||||
shortIter reverseIterator
|
||||
runIter runReverseIterator16
|
||||
bitmapIter reverseBitmapContainerShortIterator
|
||||
shortIter reverseIterator
|
||||
runIter runReverseIterator16
|
||||
bitmapIter reverseBitmapContainerShortIterator
|
||||
}
|
||||
|
||||
// HasNext returns true if there are more integers to iterate over
|
||||
@@ -414,10 +618,10 @@ type IntReverseIterator = intReverseIterator
|
||||
// Initialize configures the existing iterator so that it can iterate through the values of
|
||||
// the provided bitmap.
|
||||
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
|
||||
func (p *intReverseIterator) Initialize(a *Bitmap) {
|
||||
p.highlowcontainer = &a.highlowcontainer
|
||||
p.pos = a.highlowcontainer.size() - 1
|
||||
p.init()
|
||||
func (ii *intReverseIterator) Initialize(a *Bitmap) {
|
||||
ii.highlowcontainer = &a.highlowcontainer
|
||||
ii.pos = a.highlowcontainer.size() - 1
|
||||
ii.init()
|
||||
}
|
||||
|
||||
// ManyIntIterable allows you to iterate over the values in a Bitmap
|
||||
@@ -434,9 +638,9 @@ type manyIntIterator struct {
|
||||
iter manyIterable
|
||||
highlowcontainer *roaringArray
|
||||
|
||||
shortIter shortIterator
|
||||
runIter runIterator16
|
||||
bitmapIter bitmapContainerManyIterator
|
||||
shortIter shortIterator
|
||||
runIter runIterator16
|
||||
bitmapIter bitmapContainerManyIterator
|
||||
}
|
||||
|
||||
func (ii *manyIntIterator) init() {
|
||||
@@ -495,17 +699,16 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int {
|
||||
return n
|
||||
}
|
||||
|
||||
|
||||
// ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
|
||||
type ManyIntIterator = manyIntIterator
|
||||
|
||||
// Initialize configures the existing iterator so that it can iterate through the values of
|
||||
// the provided bitmap.
|
||||
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
|
||||
func (p *manyIntIterator) Initialize(a *Bitmap) {
|
||||
p.pos = 0
|
||||
p.highlowcontainer = &a.highlowcontainer
|
||||
p.init()
|
||||
func (ii *manyIntIterator) Initialize(a *Bitmap) {
|
||||
ii.pos = 0
|
||||
ii.highlowcontainer = &a.highlowcontainer
|
||||
ii.init()
|
||||
}
|
||||
|
||||
// String creates a string representation of the Bitmap
|
||||
@@ -569,7 +772,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) {
|
||||
// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order;
|
||||
// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove).
|
||||
func (rb *Bitmap) Iterator() IntPeekable {
|
||||
p := new(intIterator)
|
||||
p := new(intIterator)
|
||||
p.Initialize(rb)
|
||||
return p
|
||||
}
|
||||
@@ -847,7 +1050,7 @@ func (rb *Bitmap) Select(x uint32) (uint32, error) {
|
||||
return uint32(key)<<16 + uint32(c.selectInt(uint16(remaining))), nil
|
||||
}
|
||||
}
|
||||
return 0, fmt.Errorf("can't find %dth integer in a bitmap with only %d items", x, rb.GetCardinality())
|
||||
return 0, fmt.Errorf("cannot find %dth integer in a bitmap with only %d items", x, rb.GetCardinality())
|
||||
}
|
||||
|
||||
// And computes the intersection between two bitmaps and stores the result in the current bitmap
|
||||
|
||||
-1
@@ -33,7 +33,6 @@ help:
|
||||
all: help
|
||||
test:
|
||||
go test
|
||||
go test -race -run TestConcurrent*
|
||||
# Format the source code
|
||||
format:
|
||||
@find ./ -type f -name "*.go" -exec gofmt -w {} \;
|
||||
|
||||
+131
-70
@@ -2,6 +2,7 @@ package roaring64
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"math/bits"
|
||||
"runtime"
|
||||
"sync"
|
||||
@@ -24,8 +25,8 @@ const (
|
||||
// It depends upon the bitmap libraries. It is not thread safe, so
|
||||
// upstream concurrency guards must be provided.
|
||||
type BSI struct {
|
||||
bA []*Bitmap
|
||||
eBM *Bitmap // Existence BitMap
|
||||
bA []Bitmap
|
||||
eBM Bitmap // Existence BitMap
|
||||
MaxValue int64
|
||||
MinValue int64
|
||||
runOptimized bool
|
||||
@@ -39,11 +40,8 @@ func NewBSI(maxValue int64, minValue int64) *BSI {
|
||||
if bits.Len64(uint64(maxValue)) > bitsz {
|
||||
bitsz = bits.Len64(uint64(maxValue))
|
||||
}
|
||||
ba := make([]*Bitmap, bitsz)
|
||||
for i := 0; i < len(ba); i++ {
|
||||
ba[i] = NewBitmap()
|
||||
}
|
||||
return &BSI{bA: ba, eBM: NewBitmap(), MaxValue: maxValue, MinValue: minValue}
|
||||
ba := make([]Bitmap, bitsz)
|
||||
return &BSI{bA: ba, MaxValue: maxValue, MinValue: minValue}
|
||||
}
|
||||
|
||||
// NewDefaultBSI constructs an auto-sized BSI
|
||||
@@ -67,7 +65,7 @@ func (b *BSI) HasRunCompression() bool {
|
||||
|
||||
// GetExistenceBitmap returns a pointer to the underlying existence bitmap of the BSI
|
||||
func (b *BSI) GetExistenceBitmap() *Bitmap {
|
||||
return b.eBM
|
||||
return &b.eBM
|
||||
}
|
||||
|
||||
// ValueExists tests whether the value exists.
|
||||
@@ -83,54 +81,41 @@ func (b *BSI) GetCardinality() uint64 {
|
||||
|
||||
// BitCount returns the number of bits needed to represent values.
|
||||
func (b *BSI) BitCount() int {
|
||||
|
||||
return len(b.bA)
|
||||
}
|
||||
|
||||
// SetValue sets a value for a given columnID.
|
||||
func (b *BSI) SetValue(columnID uint64, value int64) {
|
||||
|
||||
// If max/min values are set to zero then automatically determine bit array size
|
||||
if b.MaxValue == 0 && b.MinValue == 0 {
|
||||
ba := make([]*Bitmap, bits.Len64(uint64(value)))
|
||||
for i := len(ba) - b.BitCount(); i > 0; i-- {
|
||||
b.bA = append(b.bA, NewBitmap())
|
||||
if b.runOptimized {
|
||||
b.bA[i].RunOptimize()
|
||||
}
|
||||
minBits := bits.Len64(uint64(value))
|
||||
for len(b.bA) < minBits {
|
||||
b.bA = append(b.bA, Bitmap{})
|
||||
}
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for i := 0; i < b.BitCount(); i++ {
|
||||
wg.Add(1)
|
||||
go func(j int) {
|
||||
defer wg.Done()
|
||||
if uint64(value)&(1<<uint64(j)) > 0 {
|
||||
b.bA[j].Add(uint64(columnID))
|
||||
} else {
|
||||
b.bA[j].Remove(uint64(columnID))
|
||||
}
|
||||
}(i)
|
||||
if uint64(value)&(1<<uint64(i)) > 0 {
|
||||
b.bA[i].Add(columnID)
|
||||
} else {
|
||||
b.bA[i].Remove(columnID)
|
||||
}
|
||||
}
|
||||
wg.Wait()
|
||||
b.eBM.Add(uint64(columnID))
|
||||
b.eBM.Add(columnID)
|
||||
}
|
||||
|
||||
// GetValue gets the value at the column ID. Second param will be false for non-existant values.
|
||||
func (b *BSI) GetValue(columnID uint64) (int64, bool) {
|
||||
value := int64(0)
|
||||
exists := b.eBM.Contains(uint64(columnID))
|
||||
// GetValue gets the value at the column ID. Second param will be false for non-existent values.
|
||||
func (b *BSI) GetValue(columnID uint64) (value int64, exists bool) {
|
||||
exists = b.eBM.Contains(columnID)
|
||||
if !exists {
|
||||
return value, exists
|
||||
return
|
||||
}
|
||||
for i := 0; i < b.BitCount(); i++ {
|
||||
if b.bA[i].Contains(uint64(columnID)) {
|
||||
value |= (1 << uint64(i))
|
||||
if b.bA[i].Contains(columnID) {
|
||||
value |= 1 << i
|
||||
}
|
||||
}
|
||||
return int64(value), exists
|
||||
return
|
||||
}
|
||||
|
||||
type action func(t *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup)
|
||||
@@ -261,13 +246,12 @@ type task struct {
|
||||
// For the RANGE parameter the comparison criteria is >= valueOrStart and <= end.
|
||||
// The parallelism parameter indicates the number of CPU threads to be applied for processing. A value
|
||||
// of zero indicates that all available CPU resources will be potentially utilized.
|
||||
//
|
||||
func (b *BSI) CompareValue(parallelism int, op Operation, valueOrStart, end int64,
|
||||
foundSet *Bitmap) *Bitmap {
|
||||
|
||||
comp := &task{bsi: b, op: op, valueOrStart: valueOrStart, end: end}
|
||||
if foundSet == nil {
|
||||
return parallelExecutor(parallelism, comp, compareValue, b.eBM)
|
||||
return parallelExecutor(parallelism, comp, compareValue, &b.eBM)
|
||||
}
|
||||
return parallelExecutor(parallelism, comp, compareValue, foundSet)
|
||||
}
|
||||
@@ -522,7 +506,6 @@ func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg
|
||||
|
||||
// Sum all values contained within the foundSet. As a convenience, the cardinality of the foundSet
|
||||
// is also returned (for calculating the average).
|
||||
//
|
||||
func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
|
||||
|
||||
count = foundSet.GetCardinality()
|
||||
@@ -531,7 +514,7 @@ func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
|
||||
wg.Add(1)
|
||||
go func(j int) {
|
||||
defer wg.Done()
|
||||
atomic.AddInt64(&sum, int64(foundSet.AndCardinality(b.bA[j])<<uint(j)))
|
||||
atomic.AddInt64(&sum, int64(foundSet.AndCardinality(&b.bA[j])<<uint(j)))
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
@@ -540,7 +523,7 @@ func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) {
|
||||
|
||||
// Transpose calls b.IntersectAndTranspose(0, b.eBM)
|
||||
func (b *BSI) Transpose() *Bitmap {
|
||||
return b.IntersectAndTranspose(0, b.eBM)
|
||||
return b.IntersectAndTranspose(0, &b.eBM)
|
||||
}
|
||||
|
||||
// IntersectAndTranspose is a matrix transpose function. Return a bitmap such that the values are represented as column IDs
|
||||
@@ -549,7 +532,6 @@ func (b *BSI) Transpose() *Bitmap {
|
||||
// vectoring one set of integers to another.
|
||||
//
|
||||
// TODO: This implementation is functional but not performant, needs to be re-written perhaps using SIMD SSE2 instructions.
|
||||
//
|
||||
func (b *BSI) IntersectAndTranspose(parallelism int, foundSet *Bitmap) *Bitmap {
|
||||
|
||||
trans := &task{bsi: b}
|
||||
@@ -587,11 +569,9 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
|
||||
|
||||
// Make sure we have enough bit slices
|
||||
for bits > b.BitCount() {
|
||||
newBm := NewBitmap()
|
||||
if b.runOptimized {
|
||||
newBm.RunOptimize()
|
||||
}
|
||||
b.bA = append(b.bA, newBm)
|
||||
bm := Bitmap{}
|
||||
bm.RunOptimize()
|
||||
b.bA = append(b.bA, bm)
|
||||
}
|
||||
|
||||
a := make([][]*Bitmap, bits)
|
||||
@@ -599,9 +579,8 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
|
||||
a[i] = make([]*Bitmap, 0)
|
||||
for _, x := range bsis {
|
||||
if len(x.bA) > i {
|
||||
a[i] = append(a[i], x.bA[i])
|
||||
a[i] = append(a[i], &x.bA[i])
|
||||
} else {
|
||||
a[i] = []*Bitmap{NewBitmap()}
|
||||
if b.runOptimized {
|
||||
a[i][0].RunOptimize()
|
||||
}
|
||||
@@ -612,7 +591,7 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
|
||||
// Consolidate existence bit maps
|
||||
ebms := make([]*Bitmap, len(bsis))
|
||||
for i := range ebms {
|
||||
ebms[i] = bsis[i].eBM
|
||||
ebms[i] = &bsis[i].eBM
|
||||
}
|
||||
|
||||
// First merge all the bit slices from all bsi maps that exist in target
|
||||
@@ -621,17 +600,17 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) {
|
||||
wg.Add(1)
|
||||
go func(j int) {
|
||||
defer wg.Done()
|
||||
x := []*Bitmap{b.bA[j]}
|
||||
x := []*Bitmap{&b.bA[j]}
|
||||
x = append(x, a[j]...)
|
||||
b.bA[j] = ParOr(parallelism, x...)
|
||||
b.bA[j] = *ParOr(parallelism, x...)
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// merge all the EBM maps
|
||||
x := []*Bitmap{b.eBM}
|
||||
x := []*Bitmap{&b.eBM}
|
||||
x = append(x, ebms...)
|
||||
b.eBM = ParOr(parallelism, x...)
|
||||
b.eBM = *ParOr(parallelism, x...)
|
||||
}
|
||||
|
||||
// UnmarshalBinary de-serialize a BSI. The value at bitData[0] is the EBM. Other indices are in least to most
|
||||
@@ -643,7 +622,7 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
|
||||
continue
|
||||
}
|
||||
if b.BitCount() < i {
|
||||
newBm := NewBitmap()
|
||||
newBm := Bitmap{}
|
||||
if b.runOptimized {
|
||||
newBm.RunOptimize()
|
||||
}
|
||||
@@ -659,7 +638,7 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
|
||||
}
|
||||
// First element of bitData is the EBM
|
||||
if bitData[0] == nil {
|
||||
b.eBM = NewBitmap()
|
||||
b.eBM = Bitmap{}
|
||||
if b.runOptimized {
|
||||
b.eBM.RunOptimize()
|
||||
}
|
||||
@@ -674,6 +653,39 @@ func (b *BSI) UnmarshalBinary(bitData [][]byte) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ReadFrom reads a serialized version of this BSI from stream.
|
||||
func (b *BSI) ReadFrom(stream io.Reader) (p int64, err error) {
|
||||
bm, n, err := readBSIContainerFromStream(stream)
|
||||
p += n
|
||||
if err != nil {
|
||||
err = fmt.Errorf("reading existence bitmap: %w", err)
|
||||
return
|
||||
}
|
||||
b.eBM = bm
|
||||
b.bA = b.bA[:0]
|
||||
for {
|
||||
// This forces a new memory location to be allocated and if we're lucky it only escapes if
|
||||
// there's no error.
|
||||
var bm Bitmap
|
||||
bm, n, err = readBSIContainerFromStream(stream)
|
||||
p += n
|
||||
if err == io.EOF {
|
||||
err = nil
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
err = fmt.Errorf("reading bit slice index %v: %w", len(b.bA), err)
|
||||
return
|
||||
}
|
||||
b.bA = append(b.bA, bm)
|
||||
}
|
||||
}
|
||||
|
||||
func readBSIContainerFromStream(r io.Reader) (bm Bitmap, p int64, err error) {
|
||||
p, err = bm.ReadFrom(r)
|
||||
return
|
||||
}
|
||||
|
||||
// MarshalBinary serializes a BSI
|
||||
func (b *BSI) MarshalBinary() ([][]byte, error) {
|
||||
|
||||
@@ -694,6 +706,23 @@ func (b *BSI) MarshalBinary() ([][]byte, error) {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// WriteTo writes a serialized version of this BSI to stream.
|
||||
func (b *BSI) WriteTo(w io.Writer) (n int64, err error) {
|
||||
n1, err := b.eBM.WriteTo(w)
|
||||
n += n1
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for _, bm := range b.bA {
|
||||
n1, err = bm.WriteTo(w)
|
||||
n += n1
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// BatchEqual returns a bitmap containing the column IDs where the values are contained within the list of values provided.
|
||||
func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap {
|
||||
|
||||
@@ -702,7 +731,7 @@ func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap {
|
||||
valMap[values[i]] = struct{}{}
|
||||
}
|
||||
comp := &task{bsi: b, values: valMap}
|
||||
return parallelExecutor(parallelism, comp, batchEqual, b.eBM)
|
||||
return parallelExecutor(parallelism, comp, batchEqual, &b.eBM)
|
||||
}
|
||||
|
||||
func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap,
|
||||
@@ -742,13 +771,13 @@ func (b *BSI) ClearValues(foundSet *Bitmap) {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
ClearBits(foundSet, b.eBM)
|
||||
ClearBits(foundSet, &b.eBM)
|
||||
}()
|
||||
for i := 0; i < b.BitCount(); i++ {
|
||||
wg.Add(1)
|
||||
go func(j int) {
|
||||
defer wg.Done()
|
||||
ClearBits(foundSet, b.bA[j])
|
||||
ClearBits(foundSet, &b.bA[j])
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
@@ -758,19 +787,19 @@ func (b *BSI) ClearValues(foundSet *Bitmap) {
|
||||
func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI {
|
||||
|
||||
newBSI := NewBSI(b.MaxValue, b.MinValue)
|
||||
newBSI.bA = make([]*Bitmap, b.BitCount())
|
||||
newBSI.bA = make([]Bitmap, b.BitCount())
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
newBSI.eBM = b.eBM.Clone()
|
||||
newBSI.eBM = *b.eBM.Clone()
|
||||
newBSI.eBM.And(foundSet)
|
||||
}()
|
||||
for i := 0; i < b.BitCount(); i++ {
|
||||
wg.Add(1)
|
||||
go func(j int) {
|
||||
defer wg.Done()
|
||||
newBSI.bA[j] = b.bA[j].Clone()
|
||||
newBSI.bA[j] = *b.bA[j].Clone()
|
||||
newBSI.bA[j].And(foundSet)
|
||||
}(i)
|
||||
}
|
||||
@@ -780,28 +809,28 @@ func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI {
|
||||
|
||||
// Clone performs a deep copy of BSI contents.
|
||||
func (b *BSI) Clone() *BSI {
|
||||
return b.NewBSIRetainSet(b.eBM)
|
||||
return b.NewBSIRetainSet(&b.eBM)
|
||||
}
|
||||
|
||||
// Add - In-place sum the contents of another BSI with this BSI, column wise.
|
||||
func (b *BSI) Add(other *BSI) {
|
||||
|
||||
b.eBM.Or(other.eBM)
|
||||
b.eBM.Or(&other.eBM)
|
||||
for i := 0; i < len(other.bA); i++ {
|
||||
b.addDigit(other.bA[i], i)
|
||||
b.addDigit(&other.bA[i], i)
|
||||
}
|
||||
}
|
||||
|
||||
func (b *BSI) addDigit(foundSet *Bitmap, i int) {
|
||||
|
||||
if i >= len(b.bA) {
|
||||
b.bA = append(b.bA, NewBitmap())
|
||||
b.bA = append(b.bA, Bitmap{})
|
||||
}
|
||||
carry := And(b.bA[i], foundSet)
|
||||
carry := And(&b.bA[i], foundSet)
|
||||
b.bA[i].Xor(foundSet)
|
||||
if !carry.IsEmpty() {
|
||||
if i+1 >= len(b.bA) {
|
||||
b.bA = append(b.bA, NewBitmap())
|
||||
b.bA = append(b.bA, Bitmap{})
|
||||
}
|
||||
b.addDigit(carry, i+1)
|
||||
}
|
||||
@@ -811,7 +840,6 @@ func (b *BSI) addDigit(foundSet *Bitmap, i int) {
|
||||
// contained within the input BSI. Given that for BSIs, different columnIDs can have the same value. TransposeWithCounts
|
||||
// is useful for situations where there is a one-to-many relationship between the vectored integer sets. The resulting BSI
|
||||
// contains the number of times a particular value appeared in the input BSI.
|
||||
//
|
||||
func (b *BSI) TransposeWithCounts(parallelism int, foundSet, filterSet *Bitmap) *BSI {
|
||||
|
||||
return parallelExecutorBSIResults(parallelism, b, transposeWithCounts, foundSet, filterSet, true)
|
||||
@@ -844,9 +872,42 @@ func transposeWithCounts(input *BSI, filterSet *Bitmap, batch []uint64, resultsC
|
||||
// Increment - In-place increment of values in a BSI. Found set select columns for incrementing.
|
||||
func (b *BSI) Increment(foundSet *Bitmap) {
|
||||
b.addDigit(foundSet, 0)
|
||||
b.eBM.Or(foundSet)
|
||||
}
|
||||
|
||||
// IncrementAll - In-place increment of all values in a BSI.
|
||||
func (b *BSI) IncrementAll() {
|
||||
b.Increment(b.GetExistenceBitmap())
|
||||
}
|
||||
|
||||
// Equals - Check for semantic equality of two BSIs.
|
||||
func (b *BSI) Equals(other *BSI) bool {
|
||||
if !b.eBM.Equals(&other.eBM) {
|
||||
return false
|
||||
}
|
||||
for i := 0; i < len(b.bA) || i < len(other.bA); i++ {
|
||||
if i >= len(b.bA) {
|
||||
if !other.bA[i].IsEmpty() {
|
||||
return false
|
||||
}
|
||||
} else if i >= len(other.bA) {
|
||||
if !b.bA[i].IsEmpty() {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
if !b.bA[i].Equals(&other.bA[i]) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// GetSizeInBytes - the size in bytes of the data structure
|
||||
func (b *BSI) GetSizeInBytes() int {
|
||||
size := b.eBM.GetSizeInBytes()
|
||||
for _, bm := range b.bA {
|
||||
size += bm.GetSizeInBytes()
|
||||
}
|
||||
return int(size)
|
||||
}
|
||||
|
||||
+85
-52
@@ -9,6 +9,7 @@ import (
|
||||
"strconv"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/RoaringBitmap/roaring/internal"
|
||||
)
|
||||
|
||||
const serialCookieNoRunContainer = 12346 // only arrays and bitmaps
|
||||
@@ -61,7 +62,7 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
|
||||
}
|
||||
n += int64(written)
|
||||
pos := 0
|
||||
keyBuf := make([]byte, 4)
|
||||
keyBuf := buf[:4]
|
||||
for pos < rb.highlowcontainer.size() {
|
||||
c := rb.highlowcontainer.getContainerAtIndex(pos)
|
||||
binary.LittleEndian.PutUint32(keyBuf, rb.highlowcontainer.getKeyAtIndex(pos))
|
||||
@@ -80,37 +81,86 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
|
||||
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
|
||||
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
|
||||
func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) {
|
||||
stream := internal.NewByteBuffer(data)
|
||||
sizeBuf := make([]byte, 8)
|
||||
n, err := stream.Read(sizeBuf)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
p += int64(n)
|
||||
size := binary.LittleEndian.Uint64(sizeBuf)
|
||||
|
||||
rb.highlowcontainer.resize(0)
|
||||
if cap(rb.highlowcontainer.keys) >= int(size) {
|
||||
rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.keys = make([]uint32, size)
|
||||
}
|
||||
if cap(rb.highlowcontainer.containers) >= int(size) {
|
||||
rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
|
||||
}
|
||||
if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
|
||||
rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
|
||||
}
|
||||
for i := uint64(0); i < size; i++ {
|
||||
keyBuf, err := stream.Next(4)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err)
|
||||
}
|
||||
p += 4
|
||||
rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf)
|
||||
rb.highlowcontainer.containers[i] = roaring.NewBitmap()
|
||||
n, err := rb.highlowcontainer.containers[i].ReadFrom(stream)
|
||||
if n == 0 || err != nil {
|
||||
return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err)
|
||||
}
|
||||
p += int64(n)
|
||||
}
|
||||
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// ReadFrom reads a serialized version of this bitmap from stream.
|
||||
// The format is compatible with other 64-bit RoaringBitmap
|
||||
// implementations (Java, Go, C++) and it has a specification :
|
||||
// https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations
|
||||
func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) {
|
||||
cookie, r32, p, err := tryReadFromRoaring32(rb, stream)
|
||||
if err != nil {
|
||||
return p, err
|
||||
} else if r32 {
|
||||
return p, nil
|
||||
}
|
||||
// TODO: Add buffer interning as in base roaring package.
|
||||
|
||||
sizeBuf := make([]byte, 4)
|
||||
sizeBuf := make([]byte, 8)
|
||||
var n int
|
||||
n, err = stream.Read(sizeBuf)
|
||||
if n == 0 || err != nil {
|
||||
return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read number of containers: %s", err)
|
||||
n, err = io.ReadFull(stream, sizeBuf)
|
||||
if err != nil {
|
||||
return int64(n), err
|
||||
}
|
||||
p += int64(n)
|
||||
sizeBuf = append(cookie, sizeBuf...)
|
||||
|
||||
size := binary.LittleEndian.Uint64(sizeBuf)
|
||||
rb.highlowcontainer = roaringArray64{}
|
||||
rb.highlowcontainer.keys = make([]uint32, size)
|
||||
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
|
||||
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
|
||||
keyBuf := make([]byte, 4)
|
||||
rb.highlowcontainer.resize(0)
|
||||
if cap(rb.highlowcontainer.keys) >= int(size) {
|
||||
rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.keys = make([]uint32, size)
|
||||
}
|
||||
if cap(rb.highlowcontainer.containers) >= int(size) {
|
||||
rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
|
||||
}
|
||||
if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
|
||||
rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
|
||||
} else {
|
||||
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
|
||||
}
|
||||
keyBuf := sizeBuf[:4]
|
||||
for i := uint64(0); i < size; i++ {
|
||||
n, err = stream.Read(keyBuf)
|
||||
if n == 0 || err != nil {
|
||||
n, err = io.ReadFull(stream, keyBuf)
|
||||
if err != nil {
|
||||
return int64(n), fmt.Errorf("error in bitmap.readFrom: could not read key #%d: %s", i, err)
|
||||
}
|
||||
p += int64(n)
|
||||
@@ -126,30 +176,6 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) {
|
||||
return p, nil
|
||||
}
|
||||
|
||||
func tryReadFromRoaring32(rb *Bitmap, stream io.Reader) (cookie []byte, r32 bool, p int64, err error) {
|
||||
// Verify the first two bytes are a valid MagicNumber.
|
||||
cookie = make([]byte, 4)
|
||||
size, err := stream.Read(cookie)
|
||||
if err != nil {
|
||||
return cookie, false, int64(size), err
|
||||
}
|
||||
fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2]))
|
||||
if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie {
|
||||
bm32 := roaring.NewBitmap()
|
||||
p, err = bm32.ReadFrom(stream, cookie...)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
rb.highlowcontainer = roaringArray64{
|
||||
keys: []uint32{0},
|
||||
containers: []*roaring.Bitmap{bm32},
|
||||
needCopyOnWrite: []bool{false},
|
||||
}
|
||||
return cookie, true, p, nil
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// FromBuffer creates a bitmap from its serialized version stored in buffer
|
||||
// func (rb *Bitmap) FromBuffer(data []byte) (p int64, err error) {
|
||||
//
|
||||
@@ -298,12 +324,8 @@ func (rb *Bitmap) ContainsInt(x int) bool {
|
||||
}
|
||||
|
||||
// Equals returns true if the two bitmaps contain the same integers
|
||||
func (rb *Bitmap) Equals(o interface{}) bool {
|
||||
srb, ok := o.(*Bitmap)
|
||||
if ok {
|
||||
return srb.highlowcontainer.equals(rb.highlowcontainer)
|
||||
}
|
||||
return false
|
||||
func (rb *Bitmap) Equals(srb *Bitmap) bool {
|
||||
return srb.highlowcontainer.equals(rb.highlowcontainer)
|
||||
}
|
||||
|
||||
// Add the integer x to the bitmap
|
||||
@@ -1228,3 +1250,14 @@ func (rb *Bitmap) Stats() roaring.Statistics {
|
||||
func (rb *Bitmap) GetSerializedSizeInBytes() uint64 {
|
||||
return rb.highlowcontainer.serializedSizeInBytes()
|
||||
}
|
||||
|
||||
// Roaring32AsRoaring64 inserts a 32-bit roaring bitmap into
|
||||
// a 64-bit roaring bitmap. No copy is made.
|
||||
func Roaring32AsRoaring64(bm32 *roaring.Bitmap) *Bitmap {
|
||||
rb := NewBitmap()
|
||||
rb.highlowcontainer.resize(0)
|
||||
rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0)
|
||||
rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32)
|
||||
rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false)
|
||||
return rb
|
||||
}
|
||||
|
||||
+10
-5
@@ -1,6 +1,8 @@
|
||||
package roaring64
|
||||
|
||||
import "github.com/RoaringBitmap/roaring"
|
||||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
)
|
||||
|
||||
type roaringArray64 struct {
|
||||
keys []uint32
|
||||
@@ -12,9 +14,10 @@ type roaringArray64 struct {
|
||||
// runOptimize compresses the element containers to minimize space consumed.
|
||||
// Q: how does this interact with copyOnWrite and needCopyOnWrite?
|
||||
// A: since we aren't changing the logical content, just the representation,
|
||||
// we don't bother to check the needCopyOnWrite bits. We replace
|
||||
// (possibly all) elements of ra.containers in-place with space
|
||||
// optimized versions.
|
||||
//
|
||||
// we don't bother to check the needCopyOnWrite bits. We replace
|
||||
// (possibly all) elements of ra.containers in-place with space
|
||||
// optimized versions.
|
||||
func (ra *roaringArray64) runOptimize() {
|
||||
for i := range ra.containers {
|
||||
ra.containers[i].RunOptimize()
|
||||
@@ -39,7 +42,7 @@ func (ra *roaringArray64) appendCopy(sa roaringArray64, startingindex int) {
|
||||
// since there is no copy-on-write, we need to clone the container (this is important)
|
||||
ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].Clone(), copyonwrite)
|
||||
} else {
|
||||
ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], copyonwrite)
|
||||
ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].Clone(), copyonwrite)
|
||||
if !sa.needsCopyOnWrite(startingindex) {
|
||||
sa.setNeedsCopyOnWrite(startingindex)
|
||||
}
|
||||
@@ -121,6 +124,8 @@ func (ra *roaringArray64) removeIndexRange(begin, end int) {
|
||||
|
||||
func (ra *roaringArray64) resize(newsize int) {
|
||||
for k := newsize; k < len(ra.containers); k++ {
|
||||
ra.keys[k] = 0
|
||||
ra.needCopyOnWrite[k] = false
|
||||
ra.containers[k] = nil
|
||||
}
|
||||
|
||||
|
||||
+10
-9
@@ -4,8 +4,9 @@ import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"github.com/RoaringBitmap/roaring/internal"
|
||||
"io"
|
||||
|
||||
"github.com/RoaringBitmap/roaring/internal"
|
||||
)
|
||||
|
||||
type container interface {
|
||||
@@ -112,9 +113,10 @@ func newRoaringArray() *roaringArray {
|
||||
// runOptimize compresses the element containers to minimize space consumed.
|
||||
// Q: how does this interact with copyOnWrite and needCopyOnWrite?
|
||||
// A: since we aren't changing the logical content, just the representation,
|
||||
// we don't bother to check the needCopyOnWrite bits. We replace
|
||||
// (possibly all) elements of ra.containers in-place with space
|
||||
// optimized versions.
|
||||
//
|
||||
// we don't bother to check the needCopyOnWrite bits. We replace
|
||||
// (possibly all) elements of ra.containers in-place with space
|
||||
// optimized versions.
|
||||
func (ra *roaringArray) runOptimize() {
|
||||
for i := range ra.containers {
|
||||
ra.containers[i] = ra.containers[i].toEfficientContainer()
|
||||
@@ -465,9 +467,7 @@ func (ra *roaringArray) serializedSizeInBytes() uint64 {
|
||||
return answer
|
||||
}
|
||||
|
||||
//
|
||||
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
|
||||
//
|
||||
func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
|
||||
hasRun := ra.hasRunCompression()
|
||||
isRunSizeInBytes := 0
|
||||
@@ -544,15 +544,14 @@ func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
//
|
||||
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
|
||||
//
|
||||
func (ra *roaringArray) toBytes() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
_, err := ra.writeTo(&buf)
|
||||
return buf.Bytes(), err
|
||||
}
|
||||
|
||||
// readFrom reads a serialized roaringArray from stream.
|
||||
func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte) (int64, error) {
|
||||
var cookie uint32
|
||||
var err error
|
||||
@@ -567,6 +566,8 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
|
||||
return stream.GetReadBytes(), fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: %s", err)
|
||||
}
|
||||
}
|
||||
// If NextReturnsSafeSlice is false, then willNeedCopyOnWrite should be true
|
||||
willNeedCopyOnWrite := !stream.NextReturnsSafeSlice()
|
||||
|
||||
var size uint32
|
||||
var isRunBitmap []byte
|
||||
@@ -631,7 +632,7 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
|
||||
key := keycard[2*i]
|
||||
card := int(keycard[2*i+1]) + 1
|
||||
ra.keys[i] = key
|
||||
ra.needCopyOnWrite[i] = true
|
||||
ra.needCopyOnWrite[i] = willNeedCopyOnWrite
|
||||
|
||||
if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 {
|
||||
// run container
|
||||
|
||||
+21
-28
@@ -47,6 +47,7 @@ import (
|
||||
// runContainer16 does run-length encoding of sets of
|
||||
// uint16 integers.
|
||||
type runContainer16 struct {
|
||||
// iv is a slice of sorted, non-overlapping, non-adjacent intervals.
|
||||
iv []interval16
|
||||
}
|
||||
|
||||
@@ -253,10 +254,8 @@ func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 {
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// newRunContainer16FromArray populates a new
|
||||
// runContainer16 from the contents of arr.
|
||||
//
|
||||
func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 {
|
||||
// keep this in sync with newRunContainer16FromVals above
|
||||
|
||||
@@ -834,24 +833,23 @@ func (rc *runContainer16) numIntervals() int {
|
||||
// If key is not already present, then whichInterval16 is
|
||||
// set as follows:
|
||||
//
|
||||
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
|
||||
// last interval16 in rc.iv;
|
||||
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
|
||||
// last interval16 in rc.iv;
|
||||
//
|
||||
// b) whichInterval16 == -1 if key is before our first
|
||||
// interval16 in rc.iv;
|
||||
// b) whichInterval16 == -1 if key is before our first
|
||||
// interval16 in rc.iv;
|
||||
//
|
||||
// c) whichInterval16 is set to the minimum index of rc.iv
|
||||
// which comes strictly before the key;
|
||||
// so rc.iv[whichInterval16].last < key,
|
||||
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
|
||||
// (Note that whichInterval16+1 won't exist when
|
||||
// whichInterval16 is the last interval.)
|
||||
// c) whichInterval16 is set to the minimum index of rc.iv
|
||||
// which comes strictly before the key;
|
||||
// so rc.iv[whichInterval16].last < key,
|
||||
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
|
||||
// (Note that whichInterval16+1 won't exist when
|
||||
// whichInterval16 is the last interval.)
|
||||
//
|
||||
// runContainer16.search always returns whichInterval16 < len(rc.iv).
|
||||
//
|
||||
// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there is
|
||||
// no upper bound.
|
||||
//
|
||||
func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
|
||||
n := int(len(rc.iv))
|
||||
if n == 0 {
|
||||
@@ -937,21 +935,20 @@ func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (w
|
||||
// If key is not already present, then whichInterval16 is
|
||||
// set as follows:
|
||||
//
|
||||
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
|
||||
// last interval16 in rc.iv;
|
||||
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
|
||||
// last interval16 in rc.iv;
|
||||
//
|
||||
// b) whichInterval16 == -1 if key is before our first
|
||||
// interval16 in rc.iv;
|
||||
// b) whichInterval16 == -1 if key is before our first
|
||||
// interval16 in rc.iv;
|
||||
//
|
||||
// c) whichInterval16 is set to the minimum index of rc.iv
|
||||
// which comes strictly before the key;
|
||||
// so rc.iv[whichInterval16].last < key,
|
||||
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
|
||||
// (Note that whichInterval16+1 won't exist when
|
||||
// whichInterval16 is the last interval.)
|
||||
// c) whichInterval16 is set to the minimum index of rc.iv
|
||||
// which comes strictly before the key;
|
||||
// so rc.iv[whichInterval16].last < key,
|
||||
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
|
||||
// (Note that whichInterval16+1 won't exist when
|
||||
// whichInterval16 is the last interval.)
|
||||
//
|
||||
// runContainer16.search always returns whichInterval16 < len(rc.iv).
|
||||
//
|
||||
func (rc *runContainer16) search(key int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
|
||||
return rc.searchRange(key, 0, 0)
|
||||
}
|
||||
@@ -994,7 +991,6 @@ func newRunContainer16() *runContainer16 {
|
||||
|
||||
// newRunContainer16CopyIv creates a run container, initializing
|
||||
// with a copy of the supplied iv slice.
|
||||
//
|
||||
func newRunContainer16CopyIv(iv []interval16) *runContainer16 {
|
||||
rc := &runContainer16{
|
||||
iv: make([]interval16, len(iv)),
|
||||
@@ -1011,7 +1007,6 @@ func (rc *runContainer16) Clone() *runContainer16 {
|
||||
// newRunContainer16TakeOwnership returns a new runContainer16
|
||||
// backed by the provided iv slice, which we will
|
||||
// assume exclusive control over from now on.
|
||||
//
|
||||
func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 {
|
||||
rc := &runContainer16{
|
||||
iv: iv,
|
||||
@@ -2006,7 +2001,6 @@ func (rc *runContainer16) not(firstOfRange, endx int) container {
|
||||
// Current routine is correct but
|
||||
// makes 2 more passes through the arrays than should be
|
||||
// strictly necessary. Measure both ways though--this may not matter.
|
||||
//
|
||||
func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 {
|
||||
|
||||
if firstOfRange > endx {
|
||||
@@ -2329,7 +2323,6 @@ func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16,
|
||||
// the backing array, and then you write
|
||||
// the answer at the beginning. What this
|
||||
// trick does is minimize memory allocations.
|
||||
//
|
||||
func (rc *runContainer16) lazyIOR(a container) container {
|
||||
// not lazy at the moment
|
||||
return rc.ior(a)
|
||||
|
||||
-1
@@ -7,7 +7,6 @@ import (
|
||||
|
||||
// writeTo for runContainer16 follows this
|
||||
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
|
||||
//
|
||||
func (b *runContainer16) writeTo(stream io.Writer) (int, error) {
|
||||
buf := make([]byte, 2+4*len(b.iv))
|
||||
binary.LittleEndian.PutUint16(buf[0:], uint16(len(b.iv)))
|
||||
|
||||
+52
-42
@@ -79,12 +79,12 @@ func (bc *bitmapContainer) asLittleEndianByteSlice() []byte {
|
||||
|
||||
// Deserialization code follows
|
||||
|
||||
////
|
||||
// //
|
||||
// These methods (byteSliceAsUint16Slice,...) do not make copies,
|
||||
// they are pointer-based (unsafe). The caller is responsible to
|
||||
// ensure that the input slice does not get garbage collected, deleted
|
||||
// or modified while you hold the returned slice.
|
||||
////
|
||||
// //
|
||||
func byteSliceAsUint16Slice(slice []byte) (result []uint16) { // here we create a new slice holder
|
||||
if len(slice)%2 != 0 {
|
||||
panic("Slice size should be divisible by 2")
|
||||
@@ -295,7 +295,6 @@ func byteSliceAsBoolSlice(slice []byte) (result []bool) {
|
||||
// bitmap derived from this bitmap (e.g., via Or, And) might
|
||||
// also be broken. Thus, before making buf unavailable, you should
|
||||
// call CloneCopyOnWriteContainers on all such bitmaps.
|
||||
//
|
||||
func (rb *Bitmap) FrozenView(buf []byte) error {
|
||||
return rb.highlowcontainer.frozenView(buf)
|
||||
}
|
||||
@@ -313,7 +312,7 @@ func (rb *Bitmap) FrozenView(buf []byte) error {
|
||||
* <typecodes> uint8_t[num_containers]
|
||||
* <header> uint32_t
|
||||
*
|
||||
* <header> is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits)
|
||||
* <header> is a 4-byte value which is a bit union of frozenCookie (15 bits)
|
||||
* and the number of containers (17 bits).
|
||||
*
|
||||
* <counts> stores number of elements for every container.
|
||||
@@ -329,43 +328,50 @@ func (rb *Bitmap) FrozenView(buf []byte) error {
|
||||
* All members have their native alignments during deserialization except <header>,
|
||||
* which is not guaranteed to be aligned by 4 bytes.
|
||||
*/
|
||||
const FROZEN_COOKIE = 13766
|
||||
const frozenCookie = 13766
|
||||
|
||||
var (
|
||||
FrozenBitmapInvalidCookie = errors.New("header does not contain the FROZEN_COOKIE")
|
||||
FrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported")
|
||||
FrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap")
|
||||
FrozenBitmapOverpopulated = errors.New("too many containers")
|
||||
FrozenBitmapUnexpectedData = errors.New("spurious data in input")
|
||||
FrozenBitmapInvalidTypecode = errors.New("unrecognized typecode")
|
||||
FrozenBitmapBufferTooSmall = errors.New("buffer too small")
|
||||
// ErrFrozenBitmapInvalidCookie is returned when the header does not contain the frozenCookie.
|
||||
ErrFrozenBitmapInvalidCookie = errors.New("header does not contain the frozenCookie")
|
||||
// ErrFrozenBitmapBigEndian is returned when the header is big endian.
|
||||
ErrFrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported")
|
||||
// ErrFrozenBitmapIncomplete is returned when the buffer is too small to contain a frozen bitmap.
|
||||
ErrFrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap")
|
||||
// ErrFrozenBitmapOverpopulated is returned when the number of containers is too large.
|
||||
ErrFrozenBitmapOverpopulated = errors.New("too many containers")
|
||||
// ErrFrozenBitmapUnexpectedData is returned when the buffer contains unexpected data.
|
||||
ErrFrozenBitmapUnexpectedData = errors.New("spurious data in input")
|
||||
// ErrFrozenBitmapInvalidTypecode is returned when the typecode is invalid.
|
||||
ErrFrozenBitmapInvalidTypecode = errors.New("unrecognized typecode")
|
||||
// ErrFrozenBitmapBufferTooSmall is returned when the buffer is too small.
|
||||
ErrFrozenBitmapBufferTooSmall = errors.New("buffer too small")
|
||||
)
|
||||
|
||||
func (ra *roaringArray) frozenView(buf []byte) error {
|
||||
if len(buf) < 4 {
|
||||
return FrozenBitmapIncomplete
|
||||
return ErrFrozenBitmapIncomplete
|
||||
}
|
||||
|
||||
headerBE := binary.BigEndian.Uint32(buf[len(buf)-4:])
|
||||
if headerBE&0x7fff == FROZEN_COOKIE {
|
||||
return FrozenBitmapBigEndian
|
||||
if headerBE&0x7fff == frozenCookie {
|
||||
return ErrFrozenBitmapBigEndian
|
||||
}
|
||||
|
||||
header := binary.LittleEndian.Uint32(buf[len(buf)-4:])
|
||||
buf = buf[:len(buf)-4]
|
||||
|
||||
if header&0x7fff != FROZEN_COOKIE {
|
||||
return FrozenBitmapInvalidCookie
|
||||
if header&0x7fff != frozenCookie {
|
||||
return ErrFrozenBitmapInvalidCookie
|
||||
}
|
||||
|
||||
nCont := int(header >> 15)
|
||||
if nCont > (1 << 16) {
|
||||
return FrozenBitmapOverpopulated
|
||||
return ErrFrozenBitmapOverpopulated
|
||||
}
|
||||
|
||||
// 1 byte per type, 2 bytes per key, 2 bytes per count.
|
||||
if len(buf) < 5*nCont {
|
||||
return FrozenBitmapIncomplete
|
||||
return ErrFrozenBitmapIncomplete
|
||||
}
|
||||
|
||||
types := buf[len(buf)-nCont:]
|
||||
@@ -390,12 +396,12 @@ func (ra *roaringArray) frozenView(buf []byte) error {
|
||||
nRun++
|
||||
nRunEl += int(counts[i])
|
||||
default:
|
||||
return FrozenBitmapInvalidTypecode
|
||||
return ErrFrozenBitmapInvalidTypecode
|
||||
}
|
||||
}
|
||||
|
||||
if len(buf) < (1<<13)*nBitmap+4*nRunEl+2*nArrayEl {
|
||||
return FrozenBitmapIncomplete
|
||||
return ErrFrozenBitmapIncomplete
|
||||
}
|
||||
|
||||
bitsetsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBitmap])
|
||||
@@ -408,15 +414,15 @@ func (ra *roaringArray) frozenView(buf []byte) error {
|
||||
buf = buf[2*nArrayEl:]
|
||||
|
||||
if len(buf) != 0 {
|
||||
return FrozenBitmapUnexpectedData
|
||||
return ErrFrozenBitmapUnexpectedData
|
||||
}
|
||||
|
||||
var c container
|
||||
containersSz := int(unsafe.Sizeof(c))*nCont
|
||||
bitsetsSz := int(unsafe.Sizeof(bitmapContainer{}))*nBitmap
|
||||
arraysSz := int(unsafe.Sizeof(arrayContainer{}))*nArray
|
||||
runsSz := int(unsafe.Sizeof(runContainer16{}))*nRun
|
||||
needCOWSz := int(unsafe.Sizeof(true))*nCont
|
||||
containersSz := int(unsafe.Sizeof(c)) * nCont
|
||||
bitsetsSz := int(unsafe.Sizeof(bitmapContainer{})) * nBitmap
|
||||
arraysSz := int(unsafe.Sizeof(arrayContainer{})) * nArray
|
||||
runsSz := int(unsafe.Sizeof(runContainer16{})) * nRun
|
||||
needCOWSz := int(unsafe.Sizeof(true)) * nCont
|
||||
|
||||
bitmapArenaSz := containersSz + bitsetsSz + arraysSz + runsSz + needCOWSz
|
||||
bitmapArena := make([]byte, bitmapArenaSz)
|
||||
@@ -475,9 +481,10 @@ func (ra *roaringArray) frozenView(buf []byte) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (bm *Bitmap) GetFrozenSizeInBytes() uint64 {
|
||||
// GetFrozenSizeInBytes returns the size in bytes of the frozen bitmap.
|
||||
func (rb *Bitmap) GetFrozenSizeInBytes() uint64 {
|
||||
nBits, nArrayEl, nRunEl := uint64(0), uint64(0), uint64(0)
|
||||
for _, c := range bm.highlowcontainer.containers {
|
||||
for _, c := range rb.highlowcontainer.containers {
|
||||
switch v := c.(type) {
|
||||
case *bitmapContainer:
|
||||
nBits++
|
||||
@@ -487,19 +494,21 @@ func (bm *Bitmap) GetFrozenSizeInBytes() uint64 {
|
||||
nRunEl += uint64(len(v.iv))
|
||||
}
|
||||
}
|
||||
return 4 + 5*uint64(len(bm.highlowcontainer.containers)) +
|
||||
return 4 + 5*uint64(len(rb.highlowcontainer.containers)) +
|
||||
(nBits << 13) + 2*nArrayEl + 4*nRunEl
|
||||
}
|
||||
|
||||
func (bm *Bitmap) Freeze() ([]byte, error) {
|
||||
sz := bm.GetFrozenSizeInBytes()
|
||||
// Freeze serializes the bitmap in CRoaring's frozen format.
|
||||
func (rb *Bitmap) Freeze() ([]byte, error) {
|
||||
sz := rb.GetFrozenSizeInBytes()
|
||||
buf := make([]byte, sz)
|
||||
_, err := bm.FreezeTo(buf)
|
||||
_, err := rb.FreezeTo(buf)
|
||||
return buf, err
|
||||
}
|
||||
|
||||
func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
|
||||
containers := bm.highlowcontainer.containers
|
||||
// FreezeTo serializes the bitmap in CRoaring's frozen format into buf.
|
||||
func (rb *Bitmap) FreezeTo(buf []byte) (int, error) {
|
||||
containers := rb.highlowcontainer.containers
|
||||
nCont := len(containers)
|
||||
|
||||
nBits, nArrayEl, nRunEl := 0, 0, 0
|
||||
@@ -516,7 +525,7 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
|
||||
|
||||
serialSize := 4 + 5*nCont + (1<<13)*nBits + 4*nRunEl + 2*nArrayEl
|
||||
if len(buf) < serialSize {
|
||||
return 0, FrozenBitmapBufferTooSmall
|
||||
return 0, ErrFrozenBitmapBufferTooSmall
|
||||
}
|
||||
|
||||
bitsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBits])
|
||||
@@ -537,10 +546,10 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
|
||||
types := buf[:nCont]
|
||||
buf = buf[nCont:]
|
||||
|
||||
header := uint32(FROZEN_COOKIE | (nCont << 15))
|
||||
header := uint32(frozenCookie | (nCont << 15))
|
||||
binary.LittleEndian.PutUint32(buf[:4], header)
|
||||
|
||||
copy(keys, bm.highlowcontainer.keys[:])
|
||||
copy(keys, rb.highlowcontainer.keys[:])
|
||||
|
||||
for i, c := range containers {
|
||||
switch v := c.(type) {
|
||||
@@ -567,11 +576,12 @@ func (bm *Bitmap) FreezeTo(buf []byte) (int, error) {
|
||||
return serialSize, nil
|
||||
}
|
||||
|
||||
func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
|
||||
// WriteFrozenTo serializes the bitmap in CRoaring's frozen format to wr.
|
||||
func (rb *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
|
||||
// FIXME: this is a naive version that iterates 4 times through the
|
||||
// containers and allocates 3*len(containers) bytes; it's quite likely
|
||||
// it can be done more efficiently.
|
||||
containers := bm.highlowcontainer.containers
|
||||
containers := rb.highlowcontainer.containers
|
||||
written := 0
|
||||
|
||||
for _, c := range containers {
|
||||
@@ -610,7 +620,7 @@ func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
|
||||
}
|
||||
}
|
||||
|
||||
n, err := wr.Write(uint16SliceAsByteSlice(bm.highlowcontainer.keys))
|
||||
n, err := wr.Write(uint16SliceAsByteSlice(rb.highlowcontainer.keys))
|
||||
written += n
|
||||
if err != nil {
|
||||
return written, err
|
||||
@@ -642,7 +652,7 @@ func (bm *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) {
|
||||
return written, err
|
||||
}
|
||||
|
||||
header := uint32(FROZEN_COOKIE | (len(containers) << 15))
|
||||
header := uint32(frozenCookie | (len(containers) << 15))
|
||||
if err := binary.Write(wr, binary.LittleEndian, header); err != nil {
|
||||
return written, err
|
||||
}
|
||||
|
||||
+62
-3
@@ -7,6 +7,15 @@
|
||||
[](https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc)
|
||||
|
||||
|
||||
This library is part of the [awesome go collection](https://github.com/avelino/awesome-go). It is used in production by several important systems:
|
||||
|
||||
* [beego](https://github.com/beego/beego)
|
||||
* [CubeFS](https://github.com/cubefs/cubefs)
|
||||
* [Amazon EKS Distro](https://github.com/aws/eks-distro)
|
||||
* [sourcegraph](https://github.com/sourcegraph/sourcegraph)
|
||||
* [torrent](https://github.com/anacrolix/torrent)
|
||||
|
||||
|
||||
## Description
|
||||
|
||||
Package bitset implements bitsets, a mapping between non-negative integers and boolean values.
|
||||
@@ -60,19 +69,69 @@ func main() {
|
||||
}
|
||||
```
|
||||
|
||||
As an alternative to BitSets, one should check out the 'big' package, which provides a (less set-theoretical) view of bitsets.
|
||||
|
||||
Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc
|
||||
|
||||
## Serialization
|
||||
|
||||
|
||||
You may serialize a bitset safely and portably to a stream
|
||||
of bytes as follows:
|
||||
```Go
|
||||
const length = 9585
|
||||
const oneEvery = 97
|
||||
bs := bitset.New(length)
|
||||
// Add some bits
|
||||
for i := uint(0); i < length; i += oneEvery {
|
||||
bs = bs.Set(i)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
n, err := bs.WriteTo(&buf)
|
||||
if err != nil {
|
||||
// failure
|
||||
}
|
||||
// Here n == buf.Len()
|
||||
```
|
||||
You can later deserialize the result as follows:
|
||||
|
||||
```Go
|
||||
// Read back from buf
|
||||
bs = bitset.New(0)
|
||||
n, err = bs.ReadFrom(&buf)
|
||||
if err != nil {
|
||||
// error
|
||||
}
|
||||
// n is the number of bytes read
|
||||
```
|
||||
|
||||
The `ReadFrom` function attempts to read the data into the existing
|
||||
BitSet instance, to minimize memory allocations.
|
||||
|
||||
|
||||
*Performance tip*:
|
||||
When reading and writing to a file or a network connection, you may get better performance by
|
||||
wrapping your streams with `bufio` instances.
|
||||
|
||||
E.g.,
|
||||
```Go
|
||||
f, err := os.Create("myfile")
|
||||
w := bufio.NewWriter(f)
|
||||
```
|
||||
```Go
|
||||
f, err := os.Open("myfile")
|
||||
r := bufio.NewReader(f)
|
||||
```
|
||||
|
||||
## Memory Usage
|
||||
|
||||
The memory usage of a bitset using N bits is at least N/8 bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
|
||||
The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring).
|
||||
|
||||
## Implementation Note
|
||||
|
||||
Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, which might be removed.
|
||||
|
||||
It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `unit64`). If so, the version will be bumped.
|
||||
It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped.
|
||||
|
||||
## Installation
|
||||
|
||||
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
# Security Policy
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
You can privately report a vulnerability by email to daniel@lemire.me (current maintainer).
|
||||
+317
-134
@@ -33,12 +33,10 @@ Example use:
|
||||
|
||||
As an alternative to BitSets, one should check out the 'big' package,
|
||||
which provides a (less set-theoretical) view of bitsets.
|
||||
|
||||
*/
|
||||
package bitset
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
@@ -52,6 +50,9 @@ import (
|
||||
// the wordSize of a bit set
|
||||
const wordSize = uint(64)
|
||||
|
||||
// the wordSize of a bit set in bytes
|
||||
const wordBytes = wordSize / 8
|
||||
|
||||
// log2WordSize is lg(wordSize)
|
||||
const log2WordSize = uint(6)
|
||||
|
||||
@@ -87,9 +88,20 @@ func (b *BitSet) safeSet() []uint64 {
|
||||
return b.set
|
||||
}
|
||||
|
||||
// SetBitsetFrom fills the bitset with an array of integers without creating a new BitSet instance
|
||||
func (b *BitSet) SetBitsetFrom(buf []uint64) {
|
||||
b.length = uint(len(buf)) * 64
|
||||
b.set = buf
|
||||
}
|
||||
|
||||
// From is a constructor used to create a BitSet from an array of integers
|
||||
func From(buf []uint64) *BitSet {
|
||||
return &BitSet{uint(len(buf)) * 64, buf}
|
||||
return FromWithLength(uint(len(buf))*64, buf)
|
||||
}
|
||||
|
||||
// FromWithLength constructs from an array of integers and length.
|
||||
func FromWithLength(len uint, set []uint64) *BitSet {
|
||||
return &BitSet{len, set}
|
||||
}
|
||||
|
||||
// Bytes returns the bitset as array of integers
|
||||
@@ -105,6 +117,17 @@ func wordsNeeded(i uint) int {
|
||||
return int((i + (wordSize - 1)) >> log2WordSize)
|
||||
}
|
||||
|
||||
// wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity.
|
||||
// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap).
|
||||
func wordsNeededUnbound(i uint) int {
|
||||
return int((i + (wordSize - 1)) >> log2WordSize)
|
||||
}
|
||||
|
||||
// wordsIndex calculates the index of words in a `uint64`
|
||||
func wordsIndex(i uint) uint {
|
||||
return i & (wordSize - 1)
|
||||
}
|
||||
|
||||
// New creates a new BitSet with a hint that length bits will be required
|
||||
func New(length uint) (bset *BitSet) {
|
||||
defer func() {
|
||||
@@ -135,24 +158,22 @@ func (b *BitSet) Len() uint {
|
||||
return b.length
|
||||
}
|
||||
|
||||
// extendSetMaybe adds additional words to incorporate new bits if needed
|
||||
func (b *BitSet) extendSetMaybe(i uint) {
|
||||
if i >= b.length { // if we need more bits, make 'em
|
||||
if i >= Cap() {
|
||||
panic("You are exceeding the capacity")
|
||||
}
|
||||
nsize := wordsNeeded(i + 1)
|
||||
if b.set == nil {
|
||||
b.set = make([]uint64, nsize)
|
||||
} else if cap(b.set) >= nsize {
|
||||
b.set = b.set[:nsize] // fast resize
|
||||
} else if len(b.set) < nsize {
|
||||
newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
|
||||
copy(newset, b.set)
|
||||
b.set = newset
|
||||
}
|
||||
b.length = i + 1
|
||||
// extendSet adds additional words to incorporate new bits if needed
|
||||
func (b *BitSet) extendSet(i uint) {
|
||||
if i >= Cap() {
|
||||
panic("You are exceeding the capacity")
|
||||
}
|
||||
nsize := wordsNeeded(i + 1)
|
||||
if b.set == nil {
|
||||
b.set = make([]uint64, nsize)
|
||||
} else if cap(b.set) >= nsize {
|
||||
b.set = b.set[:nsize] // fast resize
|
||||
} else if len(b.set) < nsize {
|
||||
newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x
|
||||
copy(newset, b.set)
|
||||
b.set = newset
|
||||
}
|
||||
b.length = i + 1
|
||||
}
|
||||
|
||||
// Test whether bit i is set.
|
||||
@@ -160,7 +181,7 @@ func (b *BitSet) Test(i uint) bool {
|
||||
if i >= b.length {
|
||||
return false
|
||||
}
|
||||
return b.set[i>>log2WordSize]&(1<<(i&(wordSize-1))) != 0
|
||||
return b.set[i>>log2WordSize]&(1<<wordsIndex(i)) != 0
|
||||
}
|
||||
|
||||
// Set bit i to 1, the capacity of the bitset is automatically
|
||||
@@ -170,8 +191,10 @@ func (b *BitSet) Test(i uint) bool {
|
||||
// may lead to a memory shortage and a panic: the caller is responsible
|
||||
// for providing sensible parameters in line with their memory capacity.
|
||||
func (b *BitSet) Set(i uint) *BitSet {
|
||||
b.extendSetMaybe(i)
|
||||
b.set[i>>log2WordSize] |= 1 << (i & (wordSize - 1))
|
||||
if i >= b.length { // if we need more bits, make 'em
|
||||
b.extendSet(i)
|
||||
}
|
||||
b.set[i>>log2WordSize] |= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -180,7 +203,7 @@ func (b *BitSet) Clear(i uint) *BitSet {
|
||||
if i >= b.length {
|
||||
return b
|
||||
}
|
||||
b.set[i>>log2WordSize] &^= 1 << (i & (wordSize - 1))
|
||||
b.set[i>>log2WordSize] &^= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -205,7 +228,7 @@ func (b *BitSet) Flip(i uint) *BitSet {
|
||||
if i >= b.length {
|
||||
return b.Set(i)
|
||||
}
|
||||
b.set[i>>log2WordSize] ^= 1 << (i & (wordSize - 1))
|
||||
b.set[i>>log2WordSize] ^= 1 << wordsIndex(i)
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -218,15 +241,23 @@ func (b *BitSet) FlipRange(start, end uint) *BitSet {
|
||||
if start >= end {
|
||||
return b
|
||||
}
|
||||
|
||||
b.extendSetMaybe(end - 1)
|
||||
if end-1 >= b.length { // if we need more bits, make 'em
|
||||
b.extendSet(end - 1)
|
||||
}
|
||||
var startWord uint = start >> log2WordSize
|
||||
var endWord uint = end >> log2WordSize
|
||||
b.set[startWord] ^= ^(^uint64(0) << (start & (wordSize - 1)))
|
||||
for i := startWord; i < endWord; i++ {
|
||||
b.set[i] = ^b.set[i]
|
||||
b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start))
|
||||
if endWord > 0 {
|
||||
// bounds check elimination
|
||||
data := b.set
|
||||
_ = data[endWord-1]
|
||||
for i := startWord; i < endWord; i++ {
|
||||
data[i] = ^data[i]
|
||||
}
|
||||
}
|
||||
if end&(wordSize-1) != 0 {
|
||||
b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end)
|
||||
}
|
||||
b.set[endWord] ^= ^uint64(0) >> (-end & (wordSize - 1))
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -254,9 +285,10 @@ func (b *BitSet) Shrink(lastbitindex uint) *BitSet {
|
||||
copy(shrunk, b.set[:idx])
|
||||
b.set = shrunk
|
||||
b.length = length
|
||||
if length < 64 {
|
||||
b.set[idx-1] &= (allBits >> (uint64(64) - uint64(length&(wordSize-1))))
|
||||
}
|
||||
lastWordUsedBits := length % 64
|
||||
if lastWordUsedBits != 0 {
|
||||
b.set[idx-1] &= allBits >> uint64(64-wordsIndex(lastWordUsedBits))
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
@@ -285,7 +317,7 @@ func (b *BitSet) Compact() *BitSet {
|
||||
// this method could be extremely slow and in some cases might cause the entire BitSet
|
||||
// to be recopied.
|
||||
func (b *BitSet) InsertAt(idx uint) *BitSet {
|
||||
insertAtElement := (idx >> log2WordSize)
|
||||
insertAtElement := idx >> log2WordSize
|
||||
|
||||
// if length of set is a multiple of wordSize we need to allocate more space first
|
||||
if b.isLenExactMultiple() {
|
||||
@@ -304,13 +336,13 @@ func (b *BitSet) InsertAt(idx uint) *BitSet {
|
||||
|
||||
// generate a mask to extract the data that we need to shift left
|
||||
// within the element where we insert a bit
|
||||
dataMask := ^(uint64(1)<<uint64(idx&(wordSize-1)) - 1)
|
||||
dataMask := uint64(1)<<uint64(wordsIndex(idx)) - 1
|
||||
|
||||
// extract that data that we'll shift
|
||||
data := b.set[i] & dataMask
|
||||
data := b.set[i] & (^dataMask)
|
||||
|
||||
// set the positions of the data mask to 0 in the element where we insert
|
||||
b.set[i] &= ^dataMask
|
||||
b.set[i] &= dataMask
|
||||
|
||||
// shift data mask to the left and insert its data to the slice element
|
||||
b.set[i] |= data << 1
|
||||
@@ -358,7 +390,7 @@ func (b *BitSet) DeleteAt(i uint) *BitSet {
|
||||
|
||||
// generate a mask for the data that needs to be shifted right
|
||||
// within that slice element that gets modified
|
||||
dataMask := ^((uint64(1) << (i & (wordSize - 1))) - 1)
|
||||
dataMask := ^((uint64(1) << wordsIndex(i)) - 1)
|
||||
|
||||
// extract the data that we'll shift right from the slice element
|
||||
data := b.set[deleteAtElement] & dataMask
|
||||
@@ -396,16 +428,20 @@ func (b *BitSet) NextSet(i uint) (uint, bool) {
|
||||
return 0, false
|
||||
}
|
||||
w := b.set[x]
|
||||
w = w >> (i & (wordSize - 1))
|
||||
w = w >> wordsIndex(i)
|
||||
if w != 0 {
|
||||
return i + trailingZeroes64(w), true
|
||||
}
|
||||
x = x + 1
|
||||
x++
|
||||
// bounds check elimination in the loop
|
||||
if x < 0 {
|
||||
return 0, false
|
||||
}
|
||||
for x < len(b.set) {
|
||||
if b.set[x] != 0 {
|
||||
return uint(x)*wordSize + trailingZeroes64(b.set[x]), true
|
||||
}
|
||||
x = x + 1
|
||||
x++
|
||||
|
||||
}
|
||||
return 0, false
|
||||
@@ -415,21 +451,20 @@ func (b *BitSet) NextSet(i uint) (uint, bool) {
|
||||
// including possibly the current index and up to cap(buffer).
|
||||
// If the returned slice has len zero, then no more set bits were found
|
||||
//
|
||||
// buffer := make([]uint, 256) // this should be reused
|
||||
// j := uint(0)
|
||||
// j, buffer = bitmap.NextSetMany(j, buffer)
|
||||
// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
|
||||
// for k := range buffer {
|
||||
// do something with buffer[k]
|
||||
// }
|
||||
// j += 1
|
||||
// }
|
||||
//
|
||||
// buffer := make([]uint, 256) // this should be reused
|
||||
// j := uint(0)
|
||||
// j, buffer = bitmap.NextSetMany(j, buffer)
|
||||
// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) {
|
||||
// for k := range buffer {
|
||||
// do something with buffer[k]
|
||||
// }
|
||||
// j += 1
|
||||
// }
|
||||
//
|
||||
// It is possible to retrieve all set bits as follows:
|
||||
//
|
||||
// indices := make([]uint, bitmap.Count())
|
||||
// bitmap.NextSetMany(0, indices)
|
||||
// indices := make([]uint, bitmap.Count())
|
||||
// bitmap.NextSetMany(0, indices)
|
||||
//
|
||||
// However if bitmap.Count() is large, it might be preferable to
|
||||
// use several calls to NextSetMany, for performance reasons.
|
||||
@@ -440,7 +475,7 @@ func (b *BitSet) NextSetMany(i uint, buffer []uint) (uint, []uint) {
|
||||
if x >= len(b.set) || capacity == 0 {
|
||||
return 0, myanswer[:0]
|
||||
}
|
||||
skip := i & (wordSize - 1)
|
||||
skip := wordsIndex(i)
|
||||
word := b.set[x] >> skip
|
||||
myanswer = myanswer[:capacity]
|
||||
size := int(0)
|
||||
@@ -483,17 +518,23 @@ func (b *BitSet) NextClear(i uint) (uint, bool) {
|
||||
return 0, false
|
||||
}
|
||||
w := b.set[x]
|
||||
w = w >> (i & (wordSize - 1))
|
||||
wA := allBits >> (i & (wordSize - 1))
|
||||
w = w >> wordsIndex(i)
|
||||
wA := allBits >> wordsIndex(i)
|
||||
index := i + trailingZeroes64(^w)
|
||||
if w != wA && index < b.length {
|
||||
return index, true
|
||||
}
|
||||
x++
|
||||
// bounds check elimination in the loop
|
||||
if x < 0 {
|
||||
return 0, false
|
||||
}
|
||||
for x < len(b.set) {
|
||||
index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
|
||||
if b.set[x] != allBits && index < b.length {
|
||||
return index, true
|
||||
if b.set[x] != allBits {
|
||||
index = uint(x)*wordSize + trailingZeroes64(^b.set[x])
|
||||
if index < b.length {
|
||||
return index, true
|
||||
}
|
||||
}
|
||||
x++
|
||||
}
|
||||
@@ -512,7 +553,7 @@ func (b *BitSet) ClearAll() *BitSet {
|
||||
|
||||
// wordCount returns the number of words used in a bit set
|
||||
func (b *BitSet) wordCount() int {
|
||||
return len(b.set)
|
||||
return wordsNeededUnbound(b.length)
|
||||
}
|
||||
|
||||
// Clone this BitSet
|
||||
@@ -524,9 +565,10 @@ func (b *BitSet) Clone() *BitSet {
|
||||
return c
|
||||
}
|
||||
|
||||
// Copy into a destination BitSet
|
||||
// Returning the size of the destination BitSet
|
||||
// like array copy
|
||||
// Copy into a destination BitSet using the Go array copy semantics:
|
||||
// the number of bits copied is the minimum of the number of bits in the current
|
||||
// BitSet (Len()) and the destination BitSet.
|
||||
// We return the number of bits copied in the destination BitSet.
|
||||
func (b *BitSet) Copy(c *BitSet) (count uint) {
|
||||
if c == nil {
|
||||
return
|
||||
@@ -538,9 +580,33 @@ func (b *BitSet) Copy(c *BitSet) (count uint) {
|
||||
if b.length < c.length {
|
||||
count = b.length
|
||||
}
|
||||
// Cleaning the last word is needed to keep the invariant that other functions, such as Count, require
|
||||
// that any bits in the last word that would exceed the length of the bitmask are set to 0.
|
||||
c.cleanLastWord()
|
||||
return
|
||||
}
|
||||
|
||||
// CopyFull copies into a destination BitSet such that the destination is
|
||||
// identical to the source after the operation, allocating memory if necessary.
|
||||
func (b *BitSet) CopyFull(c *BitSet) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.length = b.length
|
||||
if len(b.set) == 0 {
|
||||
if c.set != nil {
|
||||
c.set = c.set[:0]
|
||||
}
|
||||
} else {
|
||||
if cap(c.set) < len(b.set) {
|
||||
c.set = make([]uint64, len(b.set))
|
||||
} else {
|
||||
c.set = c.set[:len(b.set)]
|
||||
}
|
||||
copy(c.set, b.set)
|
||||
}
|
||||
}
|
||||
|
||||
// Count (number of set bits).
|
||||
// Also known as "popcount" or "population count".
|
||||
func (b *BitSet) Count() uint {
|
||||
@@ -563,10 +629,15 @@ func (b *BitSet) Equal(c *BitSet) bool {
|
||||
if b.length == 0 { // if they have both length == 0, then could have nil set
|
||||
return true
|
||||
}
|
||||
// testing for equality should not transform the bitset (no call to safeSet)
|
||||
|
||||
for p, v := range b.set {
|
||||
if c.set[p] != v {
|
||||
wn := b.wordCount()
|
||||
// bounds check elimination
|
||||
if wn <= 0 {
|
||||
return true
|
||||
}
|
||||
_ = b.set[wn-1]
|
||||
_ = c.set[wn-1]
|
||||
for p := 0; p < wn; p++ {
|
||||
if c.set[p] != b.set[p] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
@@ -585,9 +656,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
result = b.Clone() // clone b (in case b is bigger than compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
result.set[i] = b.set[i] &^ compare.set[i]
|
||||
@@ -599,9 +670,9 @@ func (b *BitSet) Difference(compare *BitSet) (result *BitSet) {
|
||||
func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
cnt := uint64(0)
|
||||
cnt += popcntMaskSlice(b.set[:l], compare.set[:l])
|
||||
@@ -614,12 +685,19 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint {
|
||||
func (b *BitSet) InPlaceDifference(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if l <= 0 {
|
||||
return
|
||||
}
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] &^= compare.set[i]
|
||||
data[i] &^= cmpData[i]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -662,18 +740,29 @@ func (b *BitSet) IntersectionCardinality(compare *BitSet) uint {
|
||||
func (b *BitSet) InPlaceIntersection(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] &= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] &= cmpData[i]
|
||||
}
|
||||
}
|
||||
for i := l; i < len(b.set); i++ {
|
||||
b.set[i] = 0
|
||||
if l >= 0 {
|
||||
for i := l; i < len(b.set); i++ {
|
||||
b.set[i] = 0
|
||||
}
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -708,15 +797,22 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint {
|
||||
func (b *BitSet) InPlaceUnion(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length > 0 && compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] |= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] |= cmpData[i]
|
||||
}
|
||||
}
|
||||
if len(compare.set) > l {
|
||||
for i := l; i < len(compare.set); i++ {
|
||||
@@ -756,15 +852,21 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint {
|
||||
func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
|
||||
panicIfNull(b)
|
||||
panicIfNull(compare)
|
||||
l := int(compare.wordCount())
|
||||
if l > int(b.wordCount()) {
|
||||
l = int(b.wordCount())
|
||||
l := compare.wordCount()
|
||||
if l > b.wordCount() {
|
||||
l = b.wordCount()
|
||||
}
|
||||
if compare.length > 0 {
|
||||
b.extendSetMaybe(compare.length - 1)
|
||||
if compare.length > 0 && compare.length-1 >= b.length {
|
||||
b.extendSet(compare.length - 1)
|
||||
}
|
||||
for i := 0; i < l; i++ {
|
||||
b.set[i] ^= compare.set[i]
|
||||
if l > 0 {
|
||||
// bounds check elimination
|
||||
data, cmpData := b.set, compare.set
|
||||
_ = data[l-1]
|
||||
_ = cmpData[l-1]
|
||||
for i := 0; i < l; i++ {
|
||||
data[i] ^= cmpData[i]
|
||||
}
|
||||
}
|
||||
if len(compare.set) > l {
|
||||
for i := l; i < len(compare.set); i++ {
|
||||
@@ -775,17 +877,17 @@ func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) {
|
||||
|
||||
// Is the length an exact multiple of word sizes?
|
||||
func (b *BitSet) isLenExactMultiple() bool {
|
||||
return b.length%wordSize == 0
|
||||
return wordsIndex(b.length) == 0
|
||||
}
|
||||
|
||||
// Clean last word by setting unused bits to 0
|
||||
func (b *BitSet) cleanLastWord() {
|
||||
if !b.isLenExactMultiple() {
|
||||
b.set[len(b.set)-1] &= allBits >> (wordSize - b.length%wordSize)
|
||||
b.set[len(b.set)-1] &= allBits >> (wordSize - wordsIndex(b.length))
|
||||
}
|
||||
}
|
||||
|
||||
// Complement computes the (local) complement of a biset (up to length bits)
|
||||
// Complement computes the (local) complement of a bitset (up to length bits)
|
||||
func (b *BitSet) Complement() (result *BitSet) {
|
||||
panicIfNull(b)
|
||||
result = New(b.length)
|
||||
@@ -813,7 +915,6 @@ func (b *BitSet) None() bool {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
return true
|
||||
}
|
||||
@@ -826,12 +927,16 @@ func (b *BitSet) Any() bool {
|
||||
|
||||
// IsSuperSet returns true if this is a superset of the other set
|
||||
func (b *BitSet) IsSuperSet(other *BitSet) bool {
|
||||
for i, e := other.NextSet(0); e; i, e = other.NextSet(i + 1) {
|
||||
if !b.Test(i) {
|
||||
l := other.wordCount()
|
||||
if b.wordCount() < l {
|
||||
l = b.wordCount()
|
||||
}
|
||||
for i, word := range other.set[:l] {
|
||||
if b.set[i]&word != word {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
return popcntSlice(other.set[l:]) == 0
|
||||
}
|
||||
|
||||
// IsStrictSuperSet returns true if this is a strict superset of the other set
|
||||
@@ -852,78 +957,156 @@ func (b *BitSet) DumpAsBits() string {
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
// BinaryStorageSize returns the binary storage requirements
|
||||
// BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes.
|
||||
func (b *BitSet) BinaryStorageSize() int {
|
||||
return binary.Size(uint64(0)) + binary.Size(b.set)
|
||||
return int(wordBytes + wordBytes*uint(b.wordCount()))
|
||||
}
|
||||
|
||||
// WriteTo writes a BitSet to a stream
|
||||
func readUint64Array(reader io.Reader, data []uint64) error {
|
||||
length := len(data)
|
||||
bufferSize := 128
|
||||
buffer := make([]byte, bufferSize*int(wordBytes))
|
||||
for i := 0; i < length; i += bufferSize {
|
||||
end := i + bufferSize
|
||||
if end > length {
|
||||
end = length
|
||||
buffer = buffer[:wordBytes*uint(end-i)]
|
||||
}
|
||||
chunk := data[i:end]
|
||||
if _, err := io.ReadFull(reader, buffer); err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range chunk {
|
||||
chunk[i] = uint64(binaryOrder.Uint64(buffer[8*i:]))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeUint64Array(writer io.Writer, data []uint64) error {
|
||||
bufferSize := 128
|
||||
buffer := make([]byte, bufferSize*int(wordBytes))
|
||||
for i := 0; i < len(data); i += bufferSize {
|
||||
end := i + bufferSize
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
buffer = buffer[:wordBytes*uint(end-i)]
|
||||
}
|
||||
chunk := data[i:end]
|
||||
for i, x := range chunk {
|
||||
binaryOrder.PutUint64(buffer[8*i:], x)
|
||||
}
|
||||
_, err := writer.Write(buffer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteTo writes a BitSet to a stream. The format is:
|
||||
// 1. uint64 length
|
||||
// 2. []uint64 set
|
||||
// Upon success, the number of bytes written is returned.
|
||||
//
|
||||
// Performance: if this function is used to write to a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Writer.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Create("myfile")
|
||||
// w := bufio.NewWriter(f)
|
||||
func (b *BitSet) WriteTo(stream io.Writer) (int64, error) {
|
||||
length := uint64(b.length)
|
||||
|
||||
// Write length
|
||||
err := binary.Write(stream, binaryOrder, length)
|
||||
err := binary.Write(stream, binaryOrder, &length)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
// Upon failure, we do not guarantee that we
|
||||
// return the number of bytes written.
|
||||
return int64(0), err
|
||||
}
|
||||
|
||||
// Write set
|
||||
err = binary.Write(stream, binaryOrder, b.set)
|
||||
return int64(b.BinaryStorageSize()), err
|
||||
err = writeUint64Array(stream, b.set[:b.wordCount()])
|
||||
if err != nil {
|
||||
// Upon failure, we do not guarantee that we
|
||||
// return the number of bytes written.
|
||||
return int64(wordBytes), err
|
||||
}
|
||||
return int64(b.BinaryStorageSize()), nil
|
||||
}
|
||||
|
||||
// ReadFrom reads a BitSet from a stream written using WriteTo
|
||||
// The format is:
|
||||
// 1. uint64 length
|
||||
// 2. []uint64 set
|
||||
// Upon success, the number of bytes read is returned.
|
||||
// If the current BitSet is not large enough to hold the data,
|
||||
// it is extended. In case of error, the BitSet is either
|
||||
// left unchanged or made empty if the error occurs too late
|
||||
// to preserve the content.
|
||||
//
|
||||
// Performance: if this function is used to read from a disk or network
|
||||
// connection, it might be beneficial to wrap the stream in a bufio.Reader.
|
||||
// E.g.,
|
||||
//
|
||||
// f, err := os.Open("myfile")
|
||||
// r := bufio.NewReader(f)
|
||||
func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) {
|
||||
var length uint64
|
||||
|
||||
// Read length first
|
||||
err := binary.Read(stream, binaryOrder, &length)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
err = io.ErrUnexpectedEOF
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
newset := New(uint(length))
|
||||
newlength := uint(length)
|
||||
|
||||
if uint64(newset.length) != length {
|
||||
if uint64(newlength) != length {
|
||||
return 0, errors.New("unmarshalling error: type mismatch")
|
||||
}
|
||||
nWords := wordsNeeded(uint(newlength))
|
||||
if cap(b.set) >= nWords {
|
||||
b.set = b.set[:nWords]
|
||||
} else {
|
||||
b.set = make([]uint64, nWords)
|
||||
}
|
||||
|
||||
// Read remaining bytes as set
|
||||
err = binary.Read(stream, binaryOrder, newset.set)
|
||||
b.length = newlength
|
||||
|
||||
err = readUint64Array(stream, b.set)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
err = io.ErrUnexpectedEOF
|
||||
}
|
||||
// We do not want to leave the BitSet partially filled as
|
||||
// it is error prone.
|
||||
b.set = b.set[:0]
|
||||
b.length = 0
|
||||
return 0, err
|
||||
}
|
||||
|
||||
*b = *newset
|
||||
return int64(b.BinaryStorageSize()), nil
|
||||
}
|
||||
|
||||
// MarshalBinary encodes a BitSet into a binary form and returns the result.
|
||||
func (b *BitSet) MarshalBinary() ([]byte, error) {
|
||||
var buf bytes.Buffer
|
||||
writer := bufio.NewWriter(&buf)
|
||||
|
||||
_, err := b.WriteTo(writer)
|
||||
_, err := b.WriteTo(&buf)
|
||||
if err != nil {
|
||||
return []byte{}, err
|
||||
}
|
||||
|
||||
err = writer.Flush()
|
||||
|
||||
return buf.Bytes(), err
|
||||
}
|
||||
|
||||
// UnmarshalBinary decodes the binary form generated by MarshalBinary.
|
||||
func (b *BitSet) UnmarshalBinary(data []byte) error {
|
||||
buf := bytes.NewReader(data)
|
||||
reader := bufio.NewReader(buf)
|
||||
|
||||
_, err := b.ReadFrom(reader)
|
||||
|
||||
_, err := b.ReadFrom(buf)
|
||||
return err
|
||||
}
|
||||
|
||||
// MarshalJSON marshals a BitSet as a JSON structure
|
||||
func (b *BitSet) MarshalJSON() ([]byte, error) {
|
||||
func (b BitSet) MarshalJSON() ([]byte, error) {
|
||||
buffer := bytes.NewBuffer(make([]byte, 0, b.BinaryStorageSize()))
|
||||
_, err := b.WriteTo(buffer)
|
||||
if err != nil {
|
||||
|
||||
+17
@@ -1,3 +1,4 @@
|
||||
//go:build go1.9
|
||||
// +build go1.9
|
||||
|
||||
package bitset
|
||||
@@ -14,6 +15,10 @@ func popcntSlice(s []uint64) uint64 {
|
||||
|
||||
func popcntMaskSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] &^ m[i])
|
||||
}
|
||||
@@ -22,6 +27,10 @@ func popcntMaskSlice(s, m []uint64) uint64 {
|
||||
|
||||
func popcntAndSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] & m[i])
|
||||
}
|
||||
@@ -30,6 +39,10 @@ func popcntAndSlice(s, m []uint64) uint64 {
|
||||
|
||||
func popcntOrSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] | m[i])
|
||||
}
|
||||
@@ -38,6 +51,10 @@ func popcntOrSlice(s, m []uint64) uint64 {
|
||||
|
||||
func popcntXorSlice(s, m []uint64) uint64 {
|
||||
var cnt int
|
||||
// this explicit check eliminates a bounds check in the loop
|
||||
if len(m) < len(s) {
|
||||
panic("mask slice is too short")
|
||||
}
|
||||
for i := range s {
|
||||
cnt += bits.OnesCount64(s[i] ^ m[i])
|
||||
}
|
||||
|
||||
+2
-2
@@ -1,5 +1,5 @@
|
||||
// +build !go1.9
|
||||
// +build amd64,!appengine
|
||||
//go:build !go1.9 && amd64 && !appengine
|
||||
// +build !go1.9,amd64,!appengine
|
||||
|
||||
package bitset
|
||||
|
||||
|
||||
+1
@@ -1,3 +1,4 @@
|
||||
//go:build !go1.9 && (!amd64 || appengine)
|
||||
// +build !go1.9
|
||||
// +build !amd64 appengine
|
||||
|
||||
|
||||
+1
@@ -1,3 +1,4 @@
|
||||
//go:build !go1.9
|
||||
// +build !go1.9
|
||||
|
||||
package bitset
|
||||
|
||||
+1
@@ -1,3 +1,4 @@
|
||||
//go:build go1.9
|
||||
// +build go1.9
|
||||
|
||||
package bitset
|
||||
|
||||
+4
-4
@@ -9,21 +9,21 @@
|
||||
[](https://sourcegraph.com/github.com/blevesearch/bleve?badge)
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
A modern text indexing library in go
|
||||
A modern indexing library in Go
|
||||
|
||||
## Features
|
||||
|
||||
* Index any go data structure (including JSON)
|
||||
* Intelligent defaults backed up by powerful configuration
|
||||
* Supported field types:
|
||||
* Text, Numeric, Datetime, Boolean
|
||||
* `text`, `number`, `datetime`, `boolean`, `geopoint`, `geoshape`, `IP`, `vector`
|
||||
* Supported query types:
|
||||
* Term, Phrase, Match, Match Phrase, Prefix, Fuzzy
|
||||
* Conjunction, Disjunction, Boolean (must/should/must_not)
|
||||
* Conjunction, Disjunction, Boolean (`must`/`should`/`must_not`)
|
||||
* Term Range, Numeric Range, Date Range
|
||||
* [Geo Spatial](https://github.com/blevesearch/bleve/blob/master/geo/README.md)
|
||||
* Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/)
|
||||
* [Vector Search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
|
||||
* Approximate k-nearest neighbors over [vectors](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
|
||||
* [tf-idf](https://en.wikipedia.org/wiki/Tf-idf) Scoring
|
||||
* Query time boosting
|
||||
* Search result match highlighting with document fragments
|
||||
|
||||
Generated
Vendored
+52
@@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2023 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package microseconds
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "unix_micro"
|
||||
|
||||
type DateTimeParser struct {
|
||||
}
|
||||
|
||||
var minBound int64 = math.MinInt64 / 1000
|
||||
var maxBound int64 = math.MaxInt64 / 1000
|
||||
|
||||
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
|
||||
// unix timestamp is microseconds since UNIX epoch
|
||||
timestamp, err := strconv.ParseInt(input, 10, 64)
|
||||
if err != nil {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampString
|
||||
}
|
||||
if timestamp < minBound || timestamp > maxBound {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampRange
|
||||
}
|
||||
return time.UnixMicro(timestamp), Name, nil
|
||||
}
|
||||
|
||||
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||
return &DateTimeParser{}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
|
||||
}
|
||||
Generated
Vendored
+52
@@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2023 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package milliseconds
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "unix_milli"
|
||||
|
||||
type DateTimeParser struct {
|
||||
}
|
||||
|
||||
var minBound int64 = math.MinInt64 / 1000000
|
||||
var maxBound int64 = math.MaxInt64 / 1000000
|
||||
|
||||
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
|
||||
// unix timestamp is milliseconds since UNIX epoch
|
||||
timestamp, err := strconv.ParseInt(input, 10, 64)
|
||||
if err != nil {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampString
|
||||
}
|
||||
if timestamp < minBound || timestamp > maxBound {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampRange
|
||||
}
|
||||
return time.UnixMilli(timestamp), Name, nil
|
||||
}
|
||||
|
||||
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||
return &DateTimeParser{}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
|
||||
}
|
||||
Generated
Vendored
+52
@@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2023 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package nanoseconds
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "unix_nano"
|
||||
|
||||
type DateTimeParser struct {
|
||||
}
|
||||
|
||||
var minBound int64 = math.MinInt64
|
||||
var maxBound int64 = math.MaxInt64
|
||||
|
||||
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
|
||||
// unix timestamp is nanoseconds since UNIX epoch
|
||||
timestamp, err := strconv.ParseInt(input, 10, 64)
|
||||
if err != nil {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampString
|
||||
}
|
||||
if timestamp < minBound || timestamp > maxBound {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampRange
|
||||
}
|
||||
return time.Unix(0, timestamp), Name, nil
|
||||
}
|
||||
|
||||
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||
return &DateTimeParser{}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
|
||||
}
|
||||
Generated
Vendored
+52
@@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package seconds
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "unix_sec"
|
||||
|
||||
type DateTimeParser struct {
|
||||
}
|
||||
|
||||
var minBound int64 = math.MinInt64 / 1000000000
|
||||
var maxBound int64 = math.MaxInt64 / 1000000000
|
||||
|
||||
func (p *DateTimeParser) ParseDateTime(input string) (time.Time, string, error) {
|
||||
// unix timestamp is seconds since UNIX epoch
|
||||
timestamp, err := strconv.ParseInt(input, 10, 64)
|
||||
if err != nil {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampString
|
||||
}
|
||||
if timestamp < minBound || timestamp > maxBound {
|
||||
return time.Time{}, "", analysis.ErrInvalidTimestampRange
|
||||
}
|
||||
return time.Unix(timestamp, 0), Name, nil
|
||||
}
|
||||
|
||||
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
|
||||
return &DateTimeParser{}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
|
||||
}
|
||||
+6
-1
@@ -43,10 +43,15 @@ type BooleanField struct {
|
||||
}
|
||||
|
||||
func (b *BooleanField) Size() int {
|
||||
var freqSize int
|
||||
if b.frequencies != nil {
|
||||
freqSize = b.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeBooleanField + size.SizeOfPtr +
|
||||
len(b.name) +
|
||||
len(b.arrayPositions)*size.SizeOfUint64 +
|
||||
len(b.value)
|
||||
len(b.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (b *BooleanField) Name() string {
|
||||
|
||||
+5
-2
@@ -68,13 +68,16 @@ func (c *CompositeField) Size() int {
|
||||
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
|
||||
len(c.name)
|
||||
|
||||
for k, _ := range c.includedFields {
|
||||
for k := range c.includedFields {
|
||||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||
}
|
||||
|
||||
for k, _ := range c.excludedFields {
|
||||
for k := range c.excludedFields {
|
||||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||
}
|
||||
if c.compositeFrequencies != nil {
|
||||
sizeInBytes += c.compositeFrequencies.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
+7
-1
@@ -53,9 +53,15 @@ type DateTimeField struct {
|
||||
}
|
||||
|
||||
func (n *DateTimeField) Size() int {
|
||||
var freqSize int
|
||||
if n.frequencies != nil {
|
||||
freqSize = n.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeDateTimeField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfUint64
|
||||
len(n.arrayPositions)*size.SizeOfUint64 +
|
||||
len(n.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (n *DateTimeField) Name() string {
|
||||
|
||||
+7
-1
@@ -47,9 +47,15 @@ type GeoPointField struct {
|
||||
}
|
||||
|
||||
func (n *GeoPointField) Size() int {
|
||||
var freqSize int
|
||||
if n.frequencies != nil {
|
||||
freqSize = n.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeGeoPointField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfUint64
|
||||
len(n.arrayPositions)*size.SizeOfUint64 +
|
||||
len(n.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (n *GeoPointField) Name() string {
|
||||
|
||||
+8
-1
@@ -48,9 +48,16 @@ type GeoShapeField struct {
|
||||
}
|
||||
|
||||
func (n *GeoShapeField) Size() int {
|
||||
var freqSize int
|
||||
if n.frequencies != nil {
|
||||
freqSize = n.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeGeoShapeField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfUint64
|
||||
len(n.arrayPositions)*size.SizeOfUint64 +
|
||||
len(n.encodedValue) +
|
||||
len(n.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (n *GeoShapeField) Name() string {
|
||||
|
||||
+6
-1
@@ -44,10 +44,15 @@ type IPField struct {
|
||||
}
|
||||
|
||||
func (b *IPField) Size() int {
|
||||
var freqSize int
|
||||
if b.frequencies != nil {
|
||||
freqSize = b.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeIPField + size.SizeOfPtr +
|
||||
len(b.name) +
|
||||
len(b.arrayPositions)*size.SizeOfUint64 +
|
||||
len(b.value)
|
||||
len(b.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (b *IPField) Name() string {
|
||||
|
||||
+7
-1
@@ -46,9 +46,15 @@ type NumericField struct {
|
||||
}
|
||||
|
||||
func (n *NumericField) Size() int {
|
||||
var freqSize int
|
||||
if n.frequencies != nil {
|
||||
freqSize = n.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeNumericField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfPtr
|
||||
len(n.arrayPositions)*size.SizeOfUint64 +
|
||||
len(n.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (n *NumericField) Name() string {
|
||||
|
||||
+6
-1
@@ -44,10 +44,15 @@ type TextField struct {
|
||||
}
|
||||
|
||||
func (t *TextField) Size() int {
|
||||
var freqSize int
|
||||
if t.frequencies != nil {
|
||||
freqSize = t.frequencies.Size()
|
||||
}
|
||||
return reflectStaticSizeTextField + size.SizeOfPtr +
|
||||
len(t.name) +
|
||||
len(t.arrayPositions)*size.SizeOfUint64 +
|
||||
len(t.value)
|
||||
len(t.value) +
|
||||
freqSize
|
||||
}
|
||||
|
||||
func (t *TextField) Name() string {
|
||||
|
||||
+2
@@ -47,6 +47,8 @@ type VectorField struct {
|
||||
func (n *VectorField) Size() int {
|
||||
return reflectStaticSizeVectorField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.similarity) +
|
||||
len(n.vectorIndexOptimizedFor) +
|
||||
int(numBytesFloat32s(n.value))
|
||||
}
|
||||
|
||||
|
||||
+163
@@ -0,0 +1,163 @@
|
||||
// Copyright (c) 2024 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build vectors
|
||||
// +build vectors
|
||||
|
||||
package document
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/size"
|
||||
"github.com/blevesearch/bleve/v2/util"
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
)
|
||||
|
||||
var reflectStaticSizeVectorBase64Field int
|
||||
|
||||
func init() {
|
||||
var f VectorBase64Field
|
||||
reflectStaticSizeVectorBase64Field = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
type VectorBase64Field struct {
|
||||
vectorField *VectorField
|
||||
base64Encoding string
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Size() int {
|
||||
var vecFieldSize int
|
||||
if n.vectorField != nil {
|
||||
vecFieldSize = n.vectorField.Size()
|
||||
}
|
||||
return reflectStaticSizeVectorBase64Field + size.SizeOfPtr +
|
||||
len(n.base64Encoding) +
|
||||
vecFieldSize
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Name() string {
|
||||
return n.vectorField.Name()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) ArrayPositions() []uint64 {
|
||||
return n.vectorField.ArrayPositions()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Options() index.FieldIndexingOptions {
|
||||
return n.vectorField.Options()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) NumPlainTextBytes() uint64 {
|
||||
return n.vectorField.NumPlainTextBytes()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) AnalyzedLength() int {
|
||||
return n.vectorField.AnalyzedLength()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) EncodedFieldType() byte {
|
||||
return 'e'
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies {
|
||||
return n.vectorField.AnalyzedTokenFrequencies()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Analyze() {
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Value() []byte {
|
||||
return n.vectorField.Value()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) GoString() string {
|
||||
return fmt.Sprintf("&document.vectorFieldBase64Field{Name:%s, Options: %s, "+
|
||||
"Value: %+v}", n.vectorField.Name(), n.vectorField.Options(), n.vectorField.Value())
|
||||
}
|
||||
|
||||
// For the sake of not polluting the API, we are keeping arrayPositions as a
|
||||
// parameter, but it is not used.
|
||||
func NewVectorBase64Field(name string, arrayPositions []uint64, vectorBase64 string,
|
||||
dims int, similarity, vectorIndexOptimizedFor string) (*VectorBase64Field, error) {
|
||||
|
||||
decodedVector, err := DecodeVector(vectorBase64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &VectorBase64Field{
|
||||
vectorField: NewVectorFieldWithIndexingOptions(name, arrayPositions,
|
||||
decodedVector, dims, similarity,
|
||||
vectorIndexOptimizedFor, DefaultVectorIndexingOptions),
|
||||
|
||||
base64Encoding: vectorBase64,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// This function takes a base64 encoded string and decodes it into
|
||||
// a vector.
|
||||
func DecodeVector(encodedValue string) ([]float32, error) {
|
||||
// We first decode the encoded string into a byte array.
|
||||
decodedString, err := base64.StdEncoding.DecodeString(encodedValue)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// The array is expected to be divisible by 4 because each float32
|
||||
// should occupy 4 bytes
|
||||
if len(decodedString)%size.SizeOfFloat32 != 0 {
|
||||
return nil, fmt.Errorf("decoded byte array not divisible by %d", size.SizeOfFloat32)
|
||||
}
|
||||
dims := int(len(decodedString) / size.SizeOfFloat32)
|
||||
|
||||
if dims <= 0 {
|
||||
return nil, fmt.Errorf("unable to decode encoded vector")
|
||||
}
|
||||
|
||||
decodedVector := make([]float32, dims)
|
||||
|
||||
// We iterate through the array 4 bytes at a time and convert each of
|
||||
// them to a float32 value by reading them in a little endian notation
|
||||
for i := 0; i < dims; i++ {
|
||||
bytes := decodedString[i*size.SizeOfFloat32 : (i+1)*size.SizeOfFloat32]
|
||||
entry := math.Float32frombits(binary.LittleEndian.Uint32(bytes))
|
||||
if !util.IsValidFloat32(float64(entry)) {
|
||||
return nil, fmt.Errorf("invalid float32 value: %f", entry)
|
||||
}
|
||||
decodedVector[i] = entry
|
||||
}
|
||||
|
||||
return decodedVector, nil
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Vector() []float32 {
|
||||
return n.vectorField.Vector()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Dims() int {
|
||||
return n.vectorField.Dims()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) Similarity() string {
|
||||
return n.vectorField.Similarity()
|
||||
}
|
||||
|
||||
func (n *VectorBase64Field) IndexOptimizedFor() string {
|
||||
return n.vectorField.IndexOptimizedFor()
|
||||
}
|
||||
+1
-1
@@ -218,7 +218,7 @@ A term search for term T in field F will look something like this:
|
||||
}
|
||||
```
|
||||
|
||||
The searchResultPostings will be a new implementation of the TermFieldReader inteface.
|
||||
The searchResultPostings will be a new implementation of the TermFieldReader interface.
|
||||
|
||||
As a reminder this interface is:
|
||||
|
||||
|
||||
+6
-1
@@ -22,7 +22,8 @@ var RegistryAsyncErrorCallbacks = map[string]func(error, string){}
|
||||
|
||||
// RegistryEventCallbacks should be treated as read-only after
|
||||
// process init()'ialization.
|
||||
var RegistryEventCallbacks = map[string]func(Event){}
|
||||
// In the event of not having a callback, these return true.
|
||||
var RegistryEventCallbacks = map[string]func(Event) bool{}
|
||||
|
||||
// Event represents the information provided in an OnEvent() callback.
|
||||
type Event struct {
|
||||
@@ -62,3 +63,7 @@ var EventKindMergeTaskIntroductionStart = EventKind(7)
|
||||
// EventKindMergeTaskIntroduction is fired when the merger has completed
|
||||
// the introduction of merged segment from a single merge task.
|
||||
var EventKindMergeTaskIntroduction = EventKind(8)
|
||||
|
||||
// EventKindPreMergeCheck is fired before the merge begins to check if
|
||||
// the caller should proceed with the merge.
|
||||
var EventKindPreMergeCheck = EventKind(9)
|
||||
|
||||
+11
@@ -72,6 +72,17 @@ OUTER:
|
||||
ctrlMsg = ctrlMsgDflt
|
||||
}
|
||||
if ctrlMsg != nil {
|
||||
continueMerge := s.fireEvent(EventKindPreMergeCheck, 0)
|
||||
// The default, if there's no handler, is to continue the merge.
|
||||
if !continueMerge {
|
||||
// If it's decided that this merge can't take place now,
|
||||
// begin the merge process all over again.
|
||||
// Retry instead of blocking/waiting here since a long wait
|
||||
// can result in more segments introduced i.e. s.root will
|
||||
// be updated.
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
// lets get started
|
||||
|
||||
+36
-4
@@ -37,6 +37,11 @@ type Segment interface {
|
||||
// Size of the live data of the segment; i.e., FullSize() minus
|
||||
// any logical deletions.
|
||||
LiveSize() int64
|
||||
|
||||
HasVector() bool
|
||||
|
||||
// Size of the persisted segment file.
|
||||
FileSize() int64
|
||||
}
|
||||
|
||||
// Plan() will functionally compute a merge plan. A segment will be
|
||||
@@ -76,6 +81,11 @@ type MergePlanOptions struct {
|
||||
// planner’s predicted sizes.
|
||||
MaxSegmentSize int64
|
||||
|
||||
// Max size (in bytes) of the persisted segment file that contains the
|
||||
// vectors. This is used to prevent merging of segments that
|
||||
// contain vectors that are too large.
|
||||
MaxSegmentFileSize int64
|
||||
|
||||
// The growth factor for each tier in a staircase of idealized
|
||||
// segments computed by CalcBudget().
|
||||
TierGrowth float64
|
||||
@@ -128,6 +138,7 @@ var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limi
|
||||
var DefaultMergePlanOptions = MergePlanOptions{
|
||||
MaxSegmentsPerTier: 10,
|
||||
MaxSegmentSize: 5000000,
|
||||
MaxSegmentFileSize: 4000000000, // 4GB
|
||||
TierGrowth: 10.0,
|
||||
SegmentsPerMergeTask: 10,
|
||||
FloorSegmentSize: 2000,
|
||||
@@ -139,6 +150,7 @@ var DefaultMergePlanOptions = MergePlanOptions{
|
||||
var SingleSegmentMergePlanOptions = MergePlanOptions{
|
||||
MaxSegmentsPerTier: 1,
|
||||
MaxSegmentSize: 1 << 30,
|
||||
MaxSegmentFileSize: 1 << 40,
|
||||
TierGrowth: 1.0,
|
||||
SegmentsPerMergeTask: 10,
|
||||
FloorSegmentSize: 1 << 30,
|
||||
@@ -170,8 +182,17 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
|
||||
minLiveSize = segment.LiveSize()
|
||||
}
|
||||
|
||||
isEligible := segment.LiveSize() < o.MaxSegmentSize/2
|
||||
// An eligible segment (based on #documents) may be too large
|
||||
// and thus need a stricter check based on the file size.
|
||||
// This is particularly important for segments that contain
|
||||
// vectors.
|
||||
if isEligible && segment.HasVector() && o.MaxSegmentFileSize > 0 {
|
||||
isEligible = segment.FileSize() < o.MaxSegmentFileSize/2
|
||||
}
|
||||
|
||||
// Only small-enough segments are eligible.
|
||||
if segment.LiveSize() < o.MaxSegmentSize/2 {
|
||||
if isEligible {
|
||||
eligibles = append(eligibles, segment)
|
||||
eligiblesLiveSize += segment.LiveSize()
|
||||
}
|
||||
@@ -215,14 +236,25 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
|
||||
for startIdx := 0; startIdx < len(eligibles); startIdx++ {
|
||||
var roster []Segment
|
||||
var rosterLiveSize int64
|
||||
var rosterFileSize int64 // useful for segments with vectors
|
||||
|
||||
for idx := startIdx; idx < len(eligibles) && len(roster) < o.SegmentsPerMergeTask; idx++ {
|
||||
eligible := eligibles[idx]
|
||||
|
||||
if rosterLiveSize+eligible.LiveSize() < o.MaxSegmentSize {
|
||||
roster = append(roster, eligible)
|
||||
rosterLiveSize += eligible.LiveSize()
|
||||
if rosterLiveSize+eligible.LiveSize() >= o.MaxSegmentSize {
|
||||
continue
|
||||
}
|
||||
|
||||
if eligible.HasVector() {
|
||||
efs := eligible.FileSize()
|
||||
if rosterFileSize+efs >= o.MaxSegmentFileSize {
|
||||
continue
|
||||
}
|
||||
rosterFileSize += efs
|
||||
}
|
||||
|
||||
roster = append(roster, eligible)
|
||||
rosterLiveSize += eligible.LiveSize()
|
||||
}
|
||||
|
||||
if len(roster) > 0 {
|
||||
|
||||
+2
-2
@@ -77,7 +77,7 @@ func (o *OptimizeVR) Finish() error {
|
||||
wg.Done()
|
||||
}()
|
||||
for field, vrs := range o.vrs {
|
||||
vecIndex, err := segment.InterpretVectorIndex(field)
|
||||
vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted)
|
||||
if err != nil {
|
||||
errorsM.Lock()
|
||||
errors = append(errors, err)
|
||||
@@ -91,7 +91,7 @@ func (o *OptimizeVR) Finish() error {
|
||||
for _, vr := range vrs {
|
||||
// for each VR, populate postings list and iterators
|
||||
// by passing the obtained vector index and getting similar vectors.
|
||||
pl, err := vecIndex.Search(vr.vector, vr.k, origSeg.deleted)
|
||||
pl, err := vecIndex.Search(vr.vector, vr.k, vr.searchParams)
|
||||
if err != nil {
|
||||
errorsM.Lock()
|
||||
errors = append(errors, err)
|
||||
|
||||
+15
-5
@@ -549,11 +549,14 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string,
|
||||
val := make([]byte, 8)
|
||||
bytesWritten := atomic.LoadUint64(&snapshot.parent.stats.TotBytesWrittenAtIndexTime)
|
||||
binary.LittleEndian.PutUint64(val, bytesWritten)
|
||||
internalBucket.Put(TotBytesWrittenKey, val)
|
||||
err = internalBucket.Put(TotBytesWrittenKey, val)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var filenames []string
|
||||
newSegmentPaths := make(map[uint64]string)
|
||||
filenames := make([]string, 0, len(snapshot.segment))
|
||||
newSegmentPaths := make(map[uint64]string, len(snapshot.segment))
|
||||
|
||||
// first ensure that each segment in this snapshot has been persisted
|
||||
for _, segmentSnapshot := range snapshot.segment {
|
||||
@@ -826,6 +829,10 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||
for k, _ := c.First(); k != nil; k, _ = c.Next() {
|
||||
if k[0] == boltInternalKey[0] {
|
||||
internalBucket := snapshot.Bucket(k)
|
||||
if internalBucket == nil {
|
||||
_ = rv.DecRef()
|
||||
return nil, fmt.Errorf("internal bucket missing")
|
||||
}
|
||||
err := internalBucket.ForEach(func(key []byte, val []byte) error {
|
||||
copiedVal := append([]byte(nil), val...)
|
||||
rv.internal[string(key)] = copiedVal
|
||||
@@ -982,7 +989,7 @@ func getTimeSeriesSnapshots(maxDataPoints int, interval time.Duration,
|
||||
return ptr, rv
|
||||
}
|
||||
|
||||
// getProtectedEpochs aims to fetch the epochs keep based on a timestamp basis.
|
||||
// getProtectedSnapshots aims to fetch the epochs to keep, selected on a timestamp basis.
|
||||
// It tries to get NumSnapshotsToKeep snapshots, each of which are separated
|
||||
// by a time duration of RollbackSamplingInterval.
|
||||
func getProtectedSnapshots(rollbackSamplingInterval time.Duration,
|
||||
@@ -1133,7 +1140,7 @@ func (s *Scorch) removeOldZapFiles() error {
|
||||
for _, f := range files {
|
||||
fname := f.Name()
|
||||
if filepath.Ext(fname) == ".zap" {
|
||||
if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] {
|
||||
if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] && (s.copyScheduled[fname] <= 0) {
|
||||
err := os.Remove(s.path + string(os.PathSeparator) + fname)
|
||||
if err != nil {
|
||||
log.Printf("got err removing file: %s, err: %v", fname, err)
|
||||
@@ -1198,6 +1205,9 @@ func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) {
|
||||
}
|
||||
|
||||
snapshot := snapshots.Bucket(sk)
|
||||
if snapshot == nil {
|
||||
continue
|
||||
}
|
||||
metaBucket := snapshot.Bucket(boltMetaDataKey)
|
||||
if metaBucket == nil {
|
||||
continue
|
||||
|
||||
+4
@@ -89,6 +89,10 @@ func RollbackPoints(path string) ([]*RollbackPoint, error) {
|
||||
for j, _ := c2.First(); j != nil; j, _ = c2.Next() {
|
||||
if j[0] == boltInternalKey[0] {
|
||||
internalBucket := snapshot.Bucket(j)
|
||||
if internalBucket == nil {
|
||||
err = fmt.Errorf("internal bucket missing")
|
||||
break
|
||||
}
|
||||
err = internalBucket.ForEach(func(key []byte, val []byte) error {
|
||||
copiedVal := append([]byte(nil), val...)
|
||||
meta[string(key)] = copiedVal
|
||||
|
||||
+49
-3
@@ -18,6 +18,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
@@ -49,6 +50,7 @@ type Scorch struct {
|
||||
unsafeBatch bool
|
||||
|
||||
rootLock sync.RWMutex
|
||||
|
||||
root *IndexSnapshot // holds 1 ref-count on the root
|
||||
rootPersisted []chan error // closed when root is persisted
|
||||
persistedCallbacks []index.BatchCallback
|
||||
@@ -56,6 +58,12 @@ type Scorch struct {
|
||||
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
|
||||
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
|
||||
|
||||
// keeps track of segments scheduled for online copy/backup operation. Each segment's filename maps to
|
||||
// the count of copy schedules. Segments with non-zero counts are protected from removal by the cleanup
|
||||
// operation. Counts decrement upon successful copy, allowing removal of segments with zero or absent counts.
|
||||
// must be accessed within the rootLock as it is accessed by the asynchronous cleanup routine.
|
||||
copyScheduled map[string]int
|
||||
|
||||
numSnapshotsToKeep int
|
||||
rollbackRetentionFactor float64
|
||||
checkPoints []*snapshotMetaData
|
||||
@@ -69,7 +77,7 @@ type Scorch struct {
|
||||
rootBolt *bolt.DB
|
||||
asyncTasks sync.WaitGroup
|
||||
|
||||
onEvent func(event Event)
|
||||
onEvent func(event Event) bool
|
||||
onAsyncError func(err error, path string)
|
||||
|
||||
forceMergeRequestCh chan *mergerCtrl
|
||||
@@ -112,6 +120,7 @@ func NewScorch(storeName string,
|
||||
ineligibleForRemoval: map[string]bool{},
|
||||
forceMergeRequestCh: make(chan *mergerCtrl, 1),
|
||||
segPlugin: defaultSegmentPlugin,
|
||||
copyScheduled: map[string]int{},
|
||||
}
|
||||
|
||||
forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config)
|
||||
@@ -175,12 +184,14 @@ func (s *Scorch) NumEventsBlocking() uint64 {
|
||||
return eventsStarted - eventsCompleted
|
||||
}
|
||||
|
||||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
|
||||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) bool {
|
||||
res := true
|
||||
if s.onEvent != nil {
|
||||
atomic.AddUint64(&s.stats.TotEventTriggerStarted, 1)
|
||||
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
||||
res = s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
||||
atomic.AddUint64(&s.stats.TotEventTriggerCompleted, 1)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (s *Scorch) fireAsyncError(err error) {
|
||||
@@ -822,6 +833,10 @@ func (fs *fieldStats) Aggregate(stats segment.FieldStats) {
|
||||
|
||||
// Returns the stats map
|
||||
func (fs *fieldStats) Fetch() map[string]map[string]uint64 {
|
||||
if fs == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return fs.statMap
|
||||
}
|
||||
|
||||
@@ -832,3 +847,34 @@ func newFieldStats() *fieldStats {
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// CopyReader returns a low-level accessor for index data, ensuring persisted segments
|
||||
// remain on disk for backup, preventing race conditions with the persister/merger cleanup.
|
||||
// Close the reader after backup to allow segment removal by the persister/merger.
|
||||
func (s *Scorch) CopyReader() index.CopyReader {
|
||||
s.rootLock.Lock()
|
||||
rv := s.root
|
||||
if rv != nil {
|
||||
rv.AddRef()
|
||||
var fileName string
|
||||
// schedule a backup for all the segments from the root. Note that the
|
||||
// both the unpersisted and persisted segments are scheduled for backup.
|
||||
// because during the backup, the unpersisted segments may get persisted and
|
||||
// hence we need to protect both the unpersisted and persisted segments from removal
|
||||
// by the cleanup routine during the online backup
|
||||
for _, seg := range rv.segment {
|
||||
if perSeg, ok := seg.segment.(segment.PersistedSegment); ok {
|
||||
// segment is persisted
|
||||
fileName = filepath.Base(perSeg.Path())
|
||||
} else {
|
||||
// segment is not persisted
|
||||
// the name of the segment file that is generated if the
|
||||
// the segment is persisted in the future.
|
||||
fileName = zapFileName(seg.id)
|
||||
}
|
||||
rv.parent.copyScheduled[fileName]++
|
||||
}
|
||||
}
|
||||
s.rootLock.Unlock()
|
||||
return rv
|
||||
}
|
||||
|
||||
+23
@@ -905,3 +905,26 @@ func (is *IndexSnapshot) GetSpatialAnalyzerPlugin(typ string) (
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (is *IndexSnapshot) CloseCopyReader() error {
|
||||
// first unmark the segments that were marked for backup by this index snapshot
|
||||
is.parent.rootLock.Lock()
|
||||
for _, seg := range is.segment {
|
||||
var fileName string
|
||||
if perSeg, ok := seg.segment.(segment.PersistedSegment); ok {
|
||||
// segment is persisted
|
||||
fileName = filepath.Base(perSeg.Path())
|
||||
} else {
|
||||
// segment is not persisted
|
||||
// the name of the segment file that is generated if the
|
||||
// the segment is persisted in the future.
|
||||
fileName = zapFileName(seg.id)
|
||||
}
|
||||
if is.parent.copyScheduled[fileName]--; is.parent.copyScheduled[fileName] <= 0 {
|
||||
delete(is.parent.copyScheduled, fileName)
|
||||
}
|
||||
}
|
||||
is.parent.rootLock.Unlock()
|
||||
// close the index snapshot normally
|
||||
return is.Close()
|
||||
}
|
||||
|
||||
+7
-2
@@ -20,6 +20,7 @@ package scorch
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
@@ -48,11 +49,15 @@ type IndexSnapshotVectorReader struct {
|
||||
currPosting segment_api.VecPosting
|
||||
currID index.IndexInternalID
|
||||
ctx context.Context
|
||||
|
||||
searchParams json.RawMessage
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotVectorReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr +
|
||||
len(i.vector) + len(i.field) + len(i.currID)
|
||||
len(i.vector)*size.SizeOfFloat32 +
|
||||
len(i.field) +
|
||||
len(i.currID)
|
||||
|
||||
for _, entry := range i.postings {
|
||||
sizeInBytes += entry.Size()
|
||||
@@ -103,7 +108,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
|
||||
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {
|
||||
|
||||
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
|
||||
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k)
|
||||
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
+26
@@ -16,6 +16,7 @@ package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
@@ -66,6 +67,31 @@ func (s *SegmentSnapshot) LiveSize() int64 {
|
||||
return int64(s.Count())
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) HasVector() bool {
|
||||
// number of vectors, for each vector field in the segment
|
||||
numVecs := s.stats.Fetch()["num_vectors"]
|
||||
return len(numVecs) > 0
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) FileSize() int64 {
|
||||
ps, ok := s.segment.(segment.PersistedSegment)
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
|
||||
path := ps.Path()
|
||||
if path == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
fi, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return fi.Size()
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Close() error {
|
||||
return s.segment.Close()
|
||||
}
|
||||
|
||||
+7
-5
@@ -19,20 +19,22 @@ package scorch
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
segment_api "github.com/blevesearch/scorch_segment_api/v2"
|
||||
)
|
||||
|
||||
func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
|
||||
field string, k int64) (
|
||||
field string, k int64, searchParams json.RawMessage) (
|
||||
index.VectorReader, error) {
|
||||
|
||||
rv := &IndexSnapshotVectorReader{
|
||||
vector: vector,
|
||||
field: field,
|
||||
k: k,
|
||||
snapshot: is,
|
||||
vector: vector,
|
||||
field: field,
|
||||
k: k,
|
||||
snapshot: is,
|
||||
searchParams: searchParams,
|
||||
}
|
||||
|
||||
if rv.postings == nil {
|
||||
|
||||
+36
-13
@@ -25,6 +25,10 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds"
|
||||
"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds"
|
||||
"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds"
|
||||
"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds"
|
||||
"github.com/blevesearch/bleve/v2/document"
|
||||
"github.com/blevesearch/bleve/v2/index/scorch"
|
||||
"github.com/blevesearch/bleve/v2/index/upsidedown"
|
||||
@@ -738,10 +742,28 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest,
|
||||
datetime, layout, err := docF.DateTime()
|
||||
if err == nil {
|
||||
if layout == "" {
|
||||
// layout not set probably means it was indexed as a timestamp
|
||||
value = strconv.FormatInt(datetime.UnixNano(), 10)
|
||||
// missing layout means we fallback to
|
||||
// the default layout which is RFC3339
|
||||
value = datetime.Format(time.RFC3339)
|
||||
} else {
|
||||
value = datetime.Format(layout)
|
||||
// the layout here can now either be representative
|
||||
// of an actual datetime layout or a timestamp
|
||||
switch layout {
|
||||
case seconds.Name:
|
||||
value = strconv.FormatInt(datetime.Unix(), 10)
|
||||
case milliseconds.Name:
|
||||
value = strconv.FormatInt(datetime.UnixMilli(), 10)
|
||||
case microseconds.Name:
|
||||
value = strconv.FormatInt(datetime.UnixMicro(), 10)
|
||||
case nanoseconds.Name:
|
||||
value = strconv.FormatInt(datetime.UnixNano(), 10)
|
||||
default:
|
||||
// the layout for formatting the date to a string
|
||||
// is provided by a datetime parser which is not
|
||||
// handling the timestamp case, hence the layout
|
||||
// can be directly used to format the date
|
||||
value = datetime.Format(layout)
|
||||
}
|
||||
}
|
||||
}
|
||||
case index.BooleanField:
|
||||
@@ -1052,22 +1074,23 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) {
|
||||
return ErrorIndexClosed
|
||||
}
|
||||
|
||||
indexReader, err := i.i.Reader()
|
||||
if err != nil {
|
||||
return err
|
||||
copyIndex, ok := i.i.(index.CopyIndex)
|
||||
if !ok {
|
||||
return fmt.Errorf("index implementation does not support copy reader")
|
||||
}
|
||||
|
||||
copyReader := copyIndex.CopyReader()
|
||||
if copyReader == nil {
|
||||
return fmt.Errorf("index's copyReader is nil")
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if cerr := indexReader.Close(); err == nil && cerr != nil {
|
||||
if cerr := copyReader.CloseCopyReader(); err == nil && cerr != nil {
|
||||
err = cerr
|
||||
}
|
||||
}()
|
||||
|
||||
irc, ok := indexReader.(IndexCopyable)
|
||||
if !ok {
|
||||
return fmt.Errorf("index implementation does not support copy")
|
||||
}
|
||||
|
||||
err = irc.CopyTo(d)
|
||||
err = copyReader.CopyTo(d)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error copying index metadata: %v", err)
|
||||
}
|
||||
|
||||
+13
-11
@@ -443,6 +443,8 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
|
||||
fieldMapping.processGeoShape(property, pathString, path, indexes, context)
|
||||
} else if fieldMapping.Type == "geopoint" {
|
||||
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
|
||||
} else if fieldMapping.Type == "vector_base64" {
|
||||
fieldMapping.processVectorBase64(property, pathString, path, indexes, context)
|
||||
} else {
|
||||
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
|
||||
}
|
||||
@@ -532,33 +534,33 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
|
||||
dm.walkDocument(property, path, indexes, context)
|
||||
}
|
||||
case reflect.Map, reflect.Slice:
|
||||
var isPropertyVector bool
|
||||
var isPropertyVectorInitialized bool
|
||||
if subDocMapping != nil {
|
||||
walkDocument := false
|
||||
if subDocMapping != nil && len(subDocMapping.Fields) != 0 {
|
||||
for _, fieldMapping := range subDocMapping.Fields {
|
||||
switch fieldMapping.Type {
|
||||
case "vector":
|
||||
processed := fieldMapping.processVector(property, pathString, path,
|
||||
fieldMapping.processVector(property, pathString, path,
|
||||
indexes, context)
|
||||
if !isPropertyVectorInitialized {
|
||||
isPropertyVector = processed
|
||||
isPropertyVectorInitialized = true
|
||||
} else {
|
||||
isPropertyVector = isPropertyVector && processed
|
||||
}
|
||||
case "geopoint":
|
||||
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
|
||||
walkDocument = true
|
||||
case "IP":
|
||||
ip, ok := property.(net.IP)
|
||||
if ok {
|
||||
fieldMapping.processIP(ip, pathString, path, indexes, context)
|
||||
}
|
||||
walkDocument = true
|
||||
case "geoshape":
|
||||
fieldMapping.processGeoShape(property, pathString, path, indexes, context)
|
||||
walkDocument = true
|
||||
default:
|
||||
walkDocument = true
|
||||
}
|
||||
}
|
||||
} else {
|
||||
walkDocument = true
|
||||
}
|
||||
if !isPropertyVector {
|
||||
if walkDocument {
|
||||
dm.walkDocument(property, path, indexes, context)
|
||||
}
|
||||
case reflect.Ptr:
|
||||
|
||||
+1
-1
@@ -102,7 +102,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
|
||||
return rv
|
||||
}
|
||||
|
||||
// NewKeyworFieldMapping returns a default field mapping for text with analyzer "keyword".
|
||||
// NewKeywordFieldMapping returns a default field mapping for text with analyzer "keyword".
|
||||
func NewKeywordFieldMapping() *FieldMapping {
|
||||
return &FieldMapping{
|
||||
Type: "text",
|
||||
|
||||
+6
-14
@@ -437,24 +437,16 @@ func (im *IndexMappingImpl) FieldAnalyzer(field string) string {
|
||||
func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping {
|
||||
if im.TypeMapping != nil {
|
||||
for _, v := range im.TypeMapping {
|
||||
for field, property := range v.Properties {
|
||||
for _, v1 := range property.Fields {
|
||||
if field == path {
|
||||
// Return field mapping if the name matches the path param.
|
||||
return *v1
|
||||
}
|
||||
}
|
||||
fm := v.fieldDescribedByPath(path)
|
||||
if fm != nil {
|
||||
return *fm
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for field, property := range im.DefaultMapping.Properties {
|
||||
for _, v1 := range property.Fields {
|
||||
if field == path {
|
||||
// Return field mapping if the name matches the path param.
|
||||
return *v1
|
||||
}
|
||||
}
|
||||
fm := im.DefaultMapping.fieldDescribedByPath(path)
|
||||
if fm != nil {
|
||||
return *fm
|
||||
}
|
||||
|
||||
return FieldMapping{}
|
||||
|
||||
+9
@@ -21,11 +21,20 @@ func NewVectorFieldMapping() *FieldMapping {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NewVectorBase64FieldMapping returns nil unconditionally in this variant.
// NOTE(review): looks like the stub used when vector support is not compiled
// in — confirm against the file's build constraints.
func NewVectorBase64FieldMapping() *FieldMapping {
|
||||
return nil
|
||||
}
|
||||
|
||||
// processVector ignores every argument and always reports false, so this
// variant never treats the property as a vector.
// NOTE(review): presumably the no-op counterpart of the real vector
// implementation — confirm against the file's build constraints.
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
|
||||
pathString string, path []string, indexes []uint64, context *walkContext) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// processVectorBase64 has an empty body in this variant: the arguments are
// ignored and nothing is added to the document being built.
// NOTE(review): presumably the no-op counterpart of the real vector_base64
// implementation — confirm against the file's build constraints.
func (fm *FieldMapping) processVectorBase64(propertyMightBeVector interface{},
|
||||
pathString string, path []string, indexes []uint64, context *walkContext) {
|
||||
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// document validation functions
|
||||
|
||||
|
||||
+38
-4
@@ -26,10 +26,11 @@ import (
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
)
|
||||
|
||||
// Min and Max allowed dimensions for a vector field
|
||||
const (
|
||||
// Min and Max allowed dimensions for a vector field;
|
||||
// p.s must be set/updated at process init() _only_
|
||||
var (
|
||||
MinVectorDims = 1
|
||||
MaxVectorDims = 2048
|
||||
MaxVectorDims = 4096
|
||||
)
|
||||
|
||||
func NewVectorFieldMapping() *FieldMapping {
|
||||
@@ -43,6 +44,17 @@ func NewVectorFieldMapping() *FieldMapping {
|
||||
}
|
||||
}
|
||||
|
||||
func NewVectorBase64FieldMapping() *FieldMapping {
|
||||
return &FieldMapping{
|
||||
Type: "vector_base64",
|
||||
Store: false,
|
||||
Index: true,
|
||||
IncludeInAll: false,
|
||||
DocValues: false,
|
||||
SkipFreqNorm: true,
|
||||
}
|
||||
}
|
||||
|
||||
// validate and process a flat vector
|
||||
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
|
||||
if vecV.Len() != dims {
|
||||
@@ -140,13 +152,35 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
|
||||
return true
|
||||
}
|
||||
|
||||
func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interface{},
|
||||
pathString string, path []string, indexes []uint64, context *walkContext) {
|
||||
encodedString, ok := propertyMightBeVectorBase64.(string)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
decodedVector, err := document.DecodeVector(encodedString)
|
||||
if err != nil || len(decodedVector) != fm.Dims {
|
||||
return
|
||||
}
|
||||
|
||||
fieldName := getFieldName(pathString, path, fm)
|
||||
options := fm.Options()
|
||||
field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector,
|
||||
fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
|
||||
context.doc.AddField(field)
|
||||
|
||||
// "_all" composite field is not applicable for vector_base64 field
|
||||
context.excludedFromAll = append(context.excludedFromAll, fieldName)
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// document validation functions
|
||||
|
||||
func validateFieldMapping(field *FieldMapping, parentName string,
|
||||
fieldAliasCtx map[string]*FieldMapping) error {
|
||||
switch field.Type {
|
||||
case "vector":
|
||||
case "vector", "vector_base64":
|
||||
return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
|
||||
default: // non-vector field
|
||||
return validateFieldType(field)
|
||||
|
||||
+4
@@ -22,3 +22,7 @@ import "github.com/blevesearch/bleve/v2/mapping"
|
||||
func NewVectorFieldMapping() *mapping.FieldMapping {
|
||||
return mapping.NewVectorFieldMapping()
|
||||
}
|
||||
|
||||
// NewVectorBase64FieldMapping is a convenience wrapper that returns the
// result of mapping.NewVectorBase64FieldMapping unchanged.
func NewVectorBase64FieldMapping() *mapping.FieldMapping {
|
||||
return mapping.NewVectorBase64FieldMapping()
|
||||
}
|
||||
|
||||
+1
-1
@@ -83,7 +83,7 @@ func NewDateRangeStringQuery(start, end string) *query.DateRangeStringQuery {
|
||||
return query.NewDateRangeStringQuery(start, end)
|
||||
}
|
||||
|
||||
// NewDateRangeStringQuery creates a new Query for ranges
|
||||
// NewDateRangeInclusiveStringQuery creates a new Query for ranges
|
||||
// of date values.
|
||||
// Date strings are parsed using the DateTimeParser set using
|
||||
//
|
||||
|
||||
+1
-1
@@ -53,7 +53,7 @@ func NewDateRangeStringQuery(start, end string) *DateRangeStringQuery {
|
||||
return NewDateRangeStringInclusiveQuery(start, end, nil, nil)
|
||||
}
|
||||
|
||||
// NewDateRangeStringQuery creates a new Query for ranges
|
||||
// NewDateRangeStringInclusiveQuery creates a new Query for ranges
|
||||
// of date values.
|
||||
// Date strings are parsed using the DateTimeParser field of the query struct,
|
||||
// which is a custom date time parser defined in the index mapping.
|
||||
|
||||
+9
-1
@@ -19,6 +19,7 @@ package query
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
@@ -32,6 +33,9 @@ type KNNQuery struct {
|
||||
Vector []float32 `json:"vector"`
|
||||
K int64 `json:"k"`
|
||||
BoostVal *Boost `json:"boost,omitempty"`
|
||||
|
||||
// see KNNRequest.Params for description
|
||||
Params json.RawMessage `json:"params"`
|
||||
}
|
||||
|
||||
func NewKNNQuery(vector []float32) *KNNQuery {
|
||||
@@ -59,6 +63,10 @@ func (q *KNNQuery) Boost() float64 {
|
||||
return q.BoostVal.Value()
|
||||
}
|
||||
|
||||
// SetParams stores the raw JSON search parameters on the query. The value is
// kept as-is (no copy, no parsing or validation happens here).
func (q *KNNQuery) SetParams(params json.RawMessage) {
|
||||
q.Params = params
|
||||
}
|
||||
|
||||
func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
|
||||
m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
|
||||
fieldMapping := m.FieldMappingForPath(q.VectorField)
|
||||
@@ -70,5 +78,5 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
|
||||
return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty")
|
||||
}
|
||||
return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField,
|
||||
q.Vector, q.K, q.BoostVal.Value(), similarityMetric)
|
||||
q.Vector, q.K, q.BoostVal.Value(), similarityMetric, q.Params)
|
||||
}
|
||||
|
||||
+1
-1
@@ -65,7 +65,7 @@ type ValidatableQuery interface {
|
||||
Validate() error
|
||||
}
|
||||
|
||||
// ParseQuery deserializes a JSON representation of
|
||||
// ParsePreSearchData deserializes a JSON representation of
|
||||
// a PreSearchData object.
|
||||
func ParsePreSearchData(input []byte) (map[string]interface{}, error) {
|
||||
var rv map[string]interface{}
|
||||
|
||||
+2
-1
@@ -47,7 +47,8 @@ type KNNQueryScorer struct {
|
||||
|
||||
func (s *KNNQueryScorer) Size() int {
|
||||
sizeInBytes := reflectStaticSizeKNNQueryScorer + size.SizeOfPtr +
|
||||
(len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField)
|
||||
(len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField) +
|
||||
len(s.similarityMetric)
|
||||
|
||||
if s.queryWeightExplanation != nil {
|
||||
sizeInBytes += s.queryWeightExplanation.Size()
|
||||
|
||||
+1
-1
@@ -147,7 +147,7 @@ type DocumentMatch struct {
|
||||
Index string `json:"index,omitempty"`
|
||||
ID string `json:"id"`
|
||||
IndexInternalID index.IndexInternalID `json:"-"`
|
||||
Score float64 `json:"score,omitempty"`
|
||||
Score float64 `json:"score"`
|
||||
Expl *Explanation `json:"explanation,omitempty"`
|
||||
Locations FieldTermLocationMap `json:"locations,omitempty"`
|
||||
Fragments FieldFragmentMap `json:"fragments,omitempty"`
|
||||
|
||||
+5
-2
@@ -19,6 +19,7 @@ package searcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
@@ -48,9 +49,11 @@ type KNNSearcher struct {
|
||||
|
||||
func NewKNNSearcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping,
|
||||
options search.SearcherOptions, field string, vector []float32, k int64,
|
||||
boost float64, similarityMetric string) (search.Searcher, error) {
|
||||
boost float64, similarityMetric string, searchParams json.RawMessage) (
|
||||
search.Searcher, error) {
|
||||
|
||||
if vr, ok := i.(index.VectorIndexReader); ok {
|
||||
vectorReader, err := vr.VectorReader(ctx, vector, field, k)
|
||||
vectorReader, err := vr.VectorReader(ctx, vector, field, k, searchParams)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
+30
-4
@@ -23,18 +23,22 @@ import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/document"
|
||||
"github.com/blevesearch/bleve/v2/search"
|
||||
"github.com/blevesearch/bleve/v2/search/collector"
|
||||
"github.com/blevesearch/bleve/v2/search/query"
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
)
|
||||
|
||||
const supportForVectorSearch = true
|
||||
|
||||
type knnOperator string
|
||||
|
||||
// Must be updated only at init
|
||||
var BleveMaxK = int64(10000)
|
||||
|
||||
type SearchRequest struct {
|
||||
ClientContextID string `json:"client_context_id,omitempty"`
|
||||
Query query.Query `json:"query"`
|
||||
Size int `json:"size"`
|
||||
From int `json:"from"`
|
||||
@@ -66,11 +70,23 @@ type SearchRequest struct {
|
||||
sortFunc func(sort.Interface)
|
||||
}
|
||||
|
||||
// Vector takes precedence over vectorBase64 in case both fields are given
|
||||
type KNNRequest struct {
|
||||
Field string `json:"field"`
|
||||
Vector []float32 `json:"vector"`
|
||||
K int64 `json:"k"`
|
||||
Boost *query.Boost `json:"boost,omitempty"`
|
||||
Field string `json:"field"`
|
||||
Vector []float32 `json:"vector"`
|
||||
VectorBase64 string `json:"vector_base64"`
|
||||
K int64 `json:"k"`
|
||||
Boost *query.Boost `json:"boost,omitempty"`
|
||||
|
||||
// Search parameters for the field's vector index part of the segment.
|
||||
// Value of it depends on the field's backing vector index implementation.
|
||||
//
|
||||
// For Faiss IVF index, supported search params are:
|
||||
// - ivf_nprobe_pct : int // percentage of total clusters to search
|
||||
// - ivf_max_codes_pct : float // percentage of total vectors to visit to do a query (across all clusters)
|
||||
//
|
||||
// Consult go-faiss to know all supported search params
|
||||
Params json.RawMessage `json:"params"`
|
||||
}
|
||||
|
||||
func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) {
|
||||
@@ -208,6 +224,7 @@ func createKNNQuery(req *SearchRequest) (query.Query, []int64, int64, error) {
|
||||
knnQuery.SetFieldVal(knn.Field)
|
||||
knnQuery.SetK(knn.K)
|
||||
knnQuery.SetBoost(knn.Boost.Value())
|
||||
knnQuery.SetParams(knn.Params)
|
||||
subQueries = append(subQueries, knnQuery)
|
||||
kArray = append(kArray, knn.K)
|
||||
sumOfK += knn.K
|
||||
@@ -230,6 +247,15 @@ func validateKNN(req *SearchRequest) error {
|
||||
if q == nil {
|
||||
return fmt.Errorf("knn query cannot be nil")
|
||||
}
|
||||
if len(q.Vector) == 0 && q.VectorBase64 != "" {
|
||||
// consider vector_base64 only if vector is not provided
|
||||
decodedVector, err := document.DecodeVector(q.VectorBase64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
q.Vector = decodedVector
|
||||
}
|
||||
if q.K <= 0 || len(q.Vector) == 0 {
|
||||
return fmt.Errorf("k must be greater than 0 and vector must be non-empty")
|
||||
}
|
||||
|
||||
+2
@@ -28,6 +28,8 @@ import (
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
)
|
||||
|
||||
const supportForVectorSearch = false
|
||||
|
||||
// A SearchRequest describes all the parameters
|
||||
// needed to search the index.
|
||||
// Query is required.
|
||||
|
||||
+5
-1
@@ -48,7 +48,7 @@ func ExtractNumericValFloat32(v interface{}) (float32, bool) {
|
||||
switch {
|
||||
case val.CanFloat():
|
||||
floatVal := val.Float()
|
||||
if floatVal > math.MaxFloat32 {
|
||||
if !IsValidFloat32(floatVal) {
|
||||
return 0, false
|
||||
}
|
||||
return float32(floatVal), true
|
||||
@@ -60,3 +60,7 @@ func ExtractNumericValFloat32(v interface{}) (float32, bool) {
|
||||
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// IsValidFloat32 reports whether val is a finite number whose magnitude fits
// in a float32, i.e. it is not NaN, not ±Inf and lies within
// [-math.MaxFloat32, math.MaxFloat32].
func IsValidFloat32(val float64) bool {
	// Bound the magnitude, not just the positive side: a large negative value
	// such as -1e300 would otherwise pass and overflow to -Inf once converted
	// to float32.
	return !math.IsNaN(val) && !math.IsInf(val, 0) && math.Abs(val) <= math.MaxFloat32
}
|
||||
|
||||
+18
@@ -48,6 +48,15 @@ type Index interface {
|
||||
StatsMap() map[string]interface{}
|
||||
}
|
||||
|
||||
// CopyIndex is an Index that additionally supports being copied to a new
|
||||
// location online. Call CopyReader to obtain the reader that drives the copy.
|
||||
type CopyIndex interface {
|
||||
Index
|
||||
// CopyReader returns a copy reader for the online copy/backup operation.
|
||||
// It handles the necessary bookkeeping and is used instead of the regular IndexReader.
|
||||
CopyReader() CopyReader
|
||||
}
|
||||
|
||||
type IndexReader interface {
|
||||
TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)
|
||||
|
||||
@@ -79,6 +88,15 @@ type IndexReader interface {
|
||||
Close() error
|
||||
}
|
||||
|
||||
// CopyReader is an extended index reader for backup or online copy operations,
// replacing the regular index reader. It embeds IndexReader, so the regular
// read methods remain available on it.
|
||||
type CopyReader interface {
|
||||
IndexReader
|
||||
// CopyTo performs an online copy or backup of the index to the directory d.
|
||||
CopyTo(d Directory) error
|
||||
// CloseCopyReader must be used instead of Close() to close the copy reader.
|
||||
CloseCopyReader() error
|
||||
}
|
||||
|
||||
// IndexReaderRegexp exposes FieldDictRegexp, which takes a field name and a
// regular expression string and returns a FieldDict or an error.
// NOTE(review): presumably the dictionary is limited to terms matching regex — confirm with an implementation.
type IndexReaderRegexp interface {
|
||||
FieldDictRegexp(field string, regex string) (FieldDict, error)
|
||||
}
|
||||
|
||||
+7
-4
@@ -51,19 +51,22 @@ var SupportedSimilarityMetrics = map[string]struct{}{
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
const (
|
||||
IndexOptimizedForRecall = "recall"
|
||||
IndexOptimizedForLatency = "latency"
|
||||
IndexOptimizedForRecall = "recall"
|
||||
IndexOptimizedForLatency = "latency"
|
||||
IndexOptimizedForMemoryEfficient = "memory-efficient"
|
||||
)
|
||||
|
||||
const DefaultIndexOptimization = IndexOptimizedForRecall
|
||||
|
||||
var SupportedVectorIndexOptimizations = map[string]int{
|
||||
IndexOptimizedForRecall: 0,
|
||||
IndexOptimizedForLatency: 1,
|
||||
IndexOptimizedForRecall: 0,
|
||||
IndexOptimizedForLatency: 1,
|
||||
IndexOptimizedForMemoryEfficient: 2,
|
||||
}
|
||||
|
||||
// VectorIndexOptimizationsReverseLookup is the reverse of SupportedVectorIndexOptimizations: int -> string
|
||||
var VectorIndexOptimizationsReverseLookup = map[int]string{
|
||||
0: IndexOptimizedForRecall,
|
||||
1: IndexOptimizedForLatency,
|
||||
2: IndexOptimizedForMemoryEfficient,
|
||||
}
|
||||
|
||||
+2
-1
@@ -19,6 +19,7 @@ package index
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"reflect"
|
||||
)
|
||||
|
||||
@@ -47,7 +48,7 @@ type VectorReader interface {
|
||||
}
|
||||
|
||||
type VectorIndexReader interface {
|
||||
VectorReader(ctx context.Context, vector []float32, field string, k int64) (
|
||||
VectorReader(ctx context.Context, vector []float32, field string, k int64, searchParams json.RawMessage) (
|
||||
VectorReader, error)
|
||||
}
|
||||
|
||||
|
||||
-6
@@ -6,7 +6,6 @@ package faiss
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
@@ -16,9 +15,6 @@ type ParameterSpace struct {
|
||||
|
||||
// NewParameterSpace creates a new ParameterSpace.
|
||||
func NewParameterSpace() (*ParameterSpace, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
var ps *C.FaissParameterSpace
|
||||
if c := C.faiss_ParameterSpace_new(&ps); c != 0 {
|
||||
return nil, getLastError()
|
||||
@@ -28,12 +24,10 @@ func NewParameterSpace() (*ParameterSpace, error) {
|
||||
|
||||
// SetIndexParameter sets one of the parameters.
|
||||
func (p *ParameterSpace) SetIndexParameter(idx Index, name string, val float64) error {
|
||||
runtime.LockOSThread()
|
||||
cname := C.CString(name)
|
||||
|
||||
defer func() {
|
||||
C.free(unsafe.Pointer(cname))
|
||||
runtime.UnlockOSThread()
|
||||
}()
|
||||
|
||||
c := C.faiss_ParameterSpace_set_index_parameter(
|
||||
|
||||
+45
-63
@@ -12,8 +12,8 @@ package faiss
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
@@ -49,7 +49,7 @@ type Index interface {
|
||||
// corresponding distances.
|
||||
Search(x []float32, k int64) (distances []float32, labels []int64, err error)
|
||||
|
||||
SearchWithoutIDs(x []float32, k int64, exclude []int64) (distances []float32,
|
||||
SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) (distances []float32,
|
||||
labels []int64, err error)
|
||||
|
||||
Reconstruct(key int64) ([]float32, error)
|
||||
@@ -108,9 +108,6 @@ func (idx *faissIndex) MetricType() int {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) Train(x []float32) error {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
n := len(x) / idx.D()
|
||||
if c := C.faiss_Index_train(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 {
|
||||
return getLastError()
|
||||
@@ -119,9 +116,6 @@ func (idx *faissIndex) Train(x []float32) error {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) Add(x []float32) error {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
n := len(x) / idx.D()
|
||||
if c := C.faiss_Index_add(idx.idx, C.idx_t(n), (*C.float)(&x[0])); c != 0 {
|
||||
return getLastError()
|
||||
@@ -130,9 +124,6 @@ func (idx *faissIndex) Add(x []float32) error {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
n := len(x) / idx.D()
|
||||
if c := C.faiss_Index_add_with_ids(
|
||||
idx.idx,
|
||||
@@ -148,8 +139,6 @@ func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error {
|
||||
func (idx *faissIndex) Search(x []float32, k int64) (
|
||||
distances []float32, labels []int64, err error,
|
||||
) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
n := len(x) / idx.D()
|
||||
distances = make([]float32, int64(n)*k)
|
||||
@@ -168,52 +157,35 @@ func (idx *faissIndex) Search(x []float32, k int64) (
|
||||
return
|
||||
}
|
||||
|
||||
func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64) (
|
||||
func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) (
|
||||
distances []float32, labels []int64, err error,
|
||||
) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if len(exclude) <= 0 {
|
||||
if params == nil && len(exclude) == 0 {
|
||||
return idx.Search(x, k)
|
||||
}
|
||||
|
||||
excludeSelector, err := NewIDSelectorNot(exclude)
|
||||
var selector *C.FaissIDSelector
|
||||
if len(exclude) > 0 {
|
||||
excludeSelector, err := NewIDSelectorNot(exclude)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
selector = excludeSelector.sel
|
||||
defer excludeSelector.Delete()
|
||||
}
|
||||
|
||||
searchParams, err := NewSearchParams(idx, params, selector)
|
||||
defer searchParams.Delete()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
var sp *C.FaissSearchParameters
|
||||
C.faiss_SearchParameters_new(&sp, (*C.FaissIDSelector)(excludeSelector.sel))
|
||||
ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
|
||||
if ivfPtr != nil {
|
||||
sp = C.faiss_SearchParametersIVF_cast(sp)
|
||||
C.faiss_SearchParametersIVF_new_with_sel(&sp, (*C.FaissIDSelector)(excludeSelector.sel))
|
||||
}
|
||||
distances, labels, err = idx.searchWithParams(x, k, searchParams.sp)
|
||||
|
||||
n := len(x) / idx.D()
|
||||
distances = make([]float32, int64(n)*k)
|
||||
labels = make([]int64, int64(n)*k)
|
||||
|
||||
if c := C.faiss_Index_search_with_params(
|
||||
idx.idx,
|
||||
C.idx_t(n),
|
||||
(*C.float)(&x[0]),
|
||||
C.idx_t(k), sp,
|
||||
(*C.float)(&distances[0]),
|
||||
(*C.idx_t)(&labels[0]),
|
||||
); c != 0 {
|
||||
err = getLastError()
|
||||
}
|
||||
excludeSelector.Delete()
|
||||
C.faiss_SearchParameters_free(sp)
|
||||
return
|
||||
}
|
||||
|
||||
func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
rv := make([]float32, idx.D())
|
||||
if c := C.faiss_Index_reconstruct(
|
||||
idx.idx,
|
||||
@@ -227,9 +199,6 @@ func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) ReconstructBatch(keys []int64, recons []float32) ([]float32, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
var err error
|
||||
n := int64(len(keys))
|
||||
if c := C.faiss_Index_reconstruct_batch(
|
||||
@@ -252,9 +221,6 @@ func (i *IndexImpl) MergeFrom(other Index, add_id int64) error {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
otherIdx, ok := other.(*faissIndex)
|
||||
if !ok {
|
||||
return fmt.Errorf("merge api not supported")
|
||||
@@ -274,9 +240,6 @@ func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) {
|
||||
func (idx *faissIndex) RangeSearch(x []float32, radius float32) (
|
||||
*RangeSearchResult, error,
|
||||
) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
n := len(x) / idx.D()
|
||||
var rsr *C.FaissRangeSearchResult
|
||||
if c := C.faiss_RangeSearchResult_new(&rsr, C.idx_t(n)); c != 0 {
|
||||
@@ -295,9 +258,6 @@ func (idx *faissIndex) RangeSearch(x []float32, radius float32) (
|
||||
}
|
||||
|
||||
func (idx *faissIndex) Reset() error {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if c := C.faiss_Index_reset(idx.idx); c != 0 {
|
||||
return getLastError()
|
||||
}
|
||||
@@ -305,9 +265,6 @@ func (idx *faissIndex) Reset() error {
|
||||
}
|
||||
|
||||
func (idx *faissIndex) RemoveIDs(sel *IDSelector) (int, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
var nRemoved C.size_t
|
||||
if c := C.faiss_Index_remove_ids(idx.idx, sel.sel, &nRemoved); c != 0 {
|
||||
return 0, getLastError()
|
||||
@@ -319,6 +276,30 @@ func (idx *faissIndex) Close() {
|
||||
C.faiss_Index_free(idx.idx)
|
||||
}
|
||||
|
||||
func (idx *faissIndex) searchWithParams(x []float32, k int64, searchParams *C.FaissSearchParameters) (
|
||||
distances []float32, labels []int64, err error,
|
||||
) {
|
||||
n := len(x) / idx.D()
|
||||
distances = make([]float32, int64(n)*k)
|
||||
labels = make([]int64, int64(n)*k)
|
||||
|
||||
if c := C.faiss_Index_search_with_params(
|
||||
idx.idx,
|
||||
C.idx_t(n),
|
||||
(*C.float)(&x[0]),
|
||||
C.idx_t(k),
|
||||
searchParams,
|
||||
(*C.float)(&distances[0]),
|
||||
(*C.idx_t)(&labels[0]),
|
||||
); c != 0 {
|
||||
err = getLastError()
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// RangeSearchResult is the result of a range search.
|
||||
type RangeSearchResult struct {
|
||||
rsr *C.FaissRangeSearchResult
|
||||
@@ -364,9 +345,6 @@ type IndexImpl struct {
|
||||
// IndexFactory builds a composite index.
|
||||
// description is a comma-separated list of components.
|
||||
func IndexFactory(d int, description string, metric int) (*IndexImpl, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
cdesc := C.CString(description)
|
||||
defer C.free(unsafe.Pointer(cdesc))
|
||||
var idx faissIndex
|
||||
@@ -376,3 +354,7 @@ func IndexFactory(d int, description string, metric int) (*IndexImpl, error) {
|
||||
}
|
||||
return &IndexImpl{&idx}, nil
|
||||
}
|
||||
|
||||
// SetOMPThreads forwards n to faiss_set_omp_threads.
// NOTE(review): presumably this sets the number of OpenMP threads faiss uses
// process-wide — confirm against the faiss C API.
func SetOMPThreads(n uint) {
|
||||
C.faiss_set_omp_threads(C.uint(n))
|
||||
}
|
||||
|
||||
+10
-13
@@ -8,7 +8,6 @@ package faiss
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
@@ -23,11 +22,8 @@ func WriteIndex(idx Index, filename string) error {
|
||||
}
|
||||
|
||||
func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
// the values to be returned by the faiss APIs
|
||||
tempBuf := (*C.uchar)(C.malloc(C.size_t(0)))
|
||||
tempBuf := (*C.uchar)(nil)
|
||||
bufSize := C.size_t(0)
|
||||
|
||||
if c := C.faiss_write_index_buf(
|
||||
@@ -35,7 +31,7 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
|
||||
&bufSize,
|
||||
&tempBuf,
|
||||
); c != 0 {
|
||||
C.free(unsafe.Pointer(tempBuf))
|
||||
C.faiss_free_buf(&tempBuf)
|
||||
return nil, getLastError()
|
||||
}
|
||||
|
||||
@@ -72,9 +68,11 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
|
||||
// cheaper.
|
||||
copy(rv, val)
|
||||
|
||||
// safe to free the c memory allocated while serializing the index;
|
||||
// safe to free the c memory allocated (tempBuf) while serializing the index (must be done
|
||||
// within C runtime for it was allocated there);
|
||||
// rv is from go runtime - so different address space altogether
|
||||
C.free(unsafe.Pointer(tempBuf))
|
||||
C.faiss_free_buf(&tempBuf)
|
||||
|
||||
// p.s: no need to free "val" since the underlying memory is same as tempBuf (deferred free)
|
||||
val = nil
|
||||
|
||||
@@ -82,9 +80,6 @@ func WriteIndexIntoBuffer(idx Index) ([]byte, error) {
|
||||
}
|
||||
|
||||
func ReadIndexFromBuffer(buf []byte, ioflags int) (*IndexImpl, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
ptr := (*C.uchar)(unsafe.Pointer(&buf[0]))
|
||||
size := C.size_t(len(buf))
|
||||
|
||||
@@ -107,8 +102,10 @@ func ReadIndexFromBuffer(buf []byte, ioflags int) (*IndexImpl, error) {
|
||||
}
|
||||
|
||||
const (
|
||||
IOFlagMmap = C.FAISS_IO_FLAG_MMAP
|
||||
IOFlagReadOnly = C.FAISS_IO_FLAG_READ_ONLY
|
||||
IOFlagMmap = C.FAISS_IO_FLAG_MMAP
|
||||
IOFlagReadOnly = C.FAISS_IO_FLAG_READ_ONLY
|
||||
IOFlagReadMmap = C.FAISS_IO_FLAG_READ_MMAP | C.FAISS_IO_FLAG_ONDISK_IVF
|
||||
IOFlagSkipPrefetch = C.FAISS_IO_FLAG_SKIP_PREFETCH
|
||||
)
|
||||
|
||||
// ReadIndex reads an index from a file.
|
||||
|
||||
+1
-6
@@ -10,12 +10,9 @@ package faiss
|
||||
import "C"
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
func (idx *IndexImpl) SetDirectMap(mapType int) (err error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr())
|
||||
if ivfPtr == nil {
|
||||
@@ -31,8 +28,6 @@ func (idx *IndexImpl) SetDirectMap(mapType int) (err error) {
|
||||
}
|
||||
|
||||
func (idx *IndexImpl) GetSubIndex() (*IndexImpl, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
ptr := C.faiss_IndexIDMap2_cast(idx.cPtr())
|
||||
if ptr == nil {
|
||||
@@ -54,5 +49,5 @@ func (idx *IndexImpl) SetNProbe(nprobe int32) {
|
||||
if ivfPtr == nil {
|
||||
return
|
||||
}
|
||||
C.faiss_IndexIVF_set_nprobe(ivfPtr, C.ulong(nprobe))
|
||||
C.faiss_IndexIVF_set_nprobe(ivfPtr, C.size_t(nprobe))
|
||||
}
|
||||
|
||||
+99
@@ -0,0 +1,99 @@
|
||||
package faiss
|
||||
|
||||
/*
|
||||
#include <faiss/c_api/Index_c.h>
|
||||
#include <faiss/c_api/IndexIVF_c.h>
|
||||
#include <faiss/c_api/impl/AuxIndexStructures_c.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
type SearchParams struct {
|
||||
sp *C.FaissSearchParameters
|
||||
}
|
||||
|
||||
// Delete frees the memory associated with s.
|
||||
func (s *SearchParams) Delete() {
|
||||
if s == nil || s.sp == nil {
|
||||
return
|
||||
}
|
||||
C.faiss_SearchParameters_free(s.sp)
|
||||
}
|
||||
|
||||
// searchParamsIVF is the JSON shape of the search parameters accepted for
// IVF indexes; both values are percentages.
type searchParamsIVF struct {
	NprobePct   float32 `json:"ivf_nprobe_pct,omitempty"`
	MaxCodesPct float32 `json:"ivf_max_codes_pct,omitempty"`
}

// Validate checks that both percentages lie in [0, 100] and returns an error
// naming the first offending parameter (ivf_nprobe_pct is checked first).
func (s *searchParamsIVF) Validate() error {
	outOfRange := func(pct float32) bool { return pct < 0 || pct > 100 }

	switch {
	case outOfRange(s.NprobePct):
		return fmt.Errorf("invalid IVF search params, ivf_nprobe_pct:%v, "+
			"should be in range [0, 100]", s.NprobePct)
	case outOfRange(s.MaxCodesPct):
		return fmt.Errorf("invalid IVF search params, ivf_max_codes_pct:%v, "+
			"should be in range [0, 100]", s.MaxCodesPct)
	}

	return nil
}
|
||||
|
||||
// Always return a valid SearchParams object,
|
||||
// thus caller must clean up the object
|
||||
// by invoking Delete() method, even if an error is returned.
|
||||
func NewSearchParams(idx Index, params json.RawMessage, sel *C.FaissIDSelector,
|
||||
) (*SearchParams, error) {
|
||||
rv := &SearchParams{}
|
||||
if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 {
|
||||
return rv, fmt.Errorf("failed to create faiss search params")
|
||||
}
|
||||
|
||||
// # check if the index is IVF and set the search params
|
||||
if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx != nil {
|
||||
rv.sp = C.faiss_SearchParametersIVF_cast(rv.sp)
|
||||
if len(params) == 0 {
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
var ivfParams searchParamsIVF
|
||||
if err := json.Unmarshal(params, &ivfParams); err != nil {
|
||||
return rv, fmt.Errorf("failed to unmarshal IVF search params, "+
|
||||
"err:%v", err)
|
||||
}
|
||||
if err := ivfParams.Validate(); err != nil {
|
||||
return rv, err
|
||||
}
|
||||
|
||||
var nprobe, maxCodes int
|
||||
|
||||
if ivfParams.NprobePct > 0 {
|
||||
nlist := float32(C.faiss_IndexIVF_nlist(ivfIdx))
|
||||
nprobe = int(nlist * (ivfParams.NprobePct / 100))
|
||||
} else {
|
||||
// It's important to set nprobe to the value decided at the time of
|
||||
// index creation. Otherwise, nprobe will be set to the default
|
||||
// value of 1.
|
||||
nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx))
|
||||
}
|
||||
|
||||
if ivfParams.MaxCodesPct > 0 {
|
||||
nvecs := C.faiss_Index_ntotal(idx.cPtr())
|
||||
maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100))
|
||||
} // else, maxCodes will be set to the default value of 0, which means no limit
|
||||
|
||||
if c := C.faiss_SearchParametersIVF_new_with(
|
||||
&rv.sp,
|
||||
sel,
|
||||
C.size_t(nprobe),
|
||||
C.size_t(maxCodes),
|
||||
); c != 0 {
|
||||
return rv, fmt.Errorf("failed to create faiss IVF search params")
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
+31
-17
@@ -4,18 +4,42 @@ package faiss
|
||||
#include <faiss/c_api/impl/AuxIndexStructures_c.h>
|
||||
*/
|
||||
import "C"
|
||||
import "runtime"
|
||||
|
||||
// IDSelector represents a set of IDs to remove.
|
||||
type IDSelector struct {
|
||||
sel *C.FaissIDSelector
|
||||
}
|
||||
|
||||
// Delete frees the memory associated with s.
|
||||
func (s *IDSelector) Delete() {
|
||||
if s == nil || s.sel == nil {
|
||||
return
|
||||
}
|
||||
|
||||
C.faiss_IDSelector_free(s.sel)
|
||||
}
|
||||
|
||||
type IDSelectorBatch struct {
|
||||
sel *C.FaissIDSelector
|
||||
batchSel *C.FaissIDSelector
|
||||
}
|
||||
|
||||
// Delete frees the memory associated with s.
|
||||
func (s *IDSelectorBatch) Delete() {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if s.sel != nil {
|
||||
C.faiss_IDSelector_free(s.sel)
|
||||
}
|
||||
if s.batchSel != nil {
|
||||
C.faiss_IDSelector_free(s.batchSel)
|
||||
}
|
||||
}
|
||||
|
||||
// NewIDSelectorRange creates a selector that removes IDs on [imin, imax).
|
||||
func NewIDSelectorRange(imin, imax int64) (*IDSelector, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
var sel *C.FaissIDSelectorRange
|
||||
c := C.faiss_IDSelectorRange_new(&sel, C.idx_t(imin), C.idx_t(imax))
|
||||
if c != 0 {
|
||||
@@ -26,9 +50,6 @@ func NewIDSelectorRange(imin, imax int64) (*IDSelector, error) {
|
||||
|
||||
// NewIDSelectorBatch creates a new batch selector.
|
||||
func NewIDSelectorBatch(indices []int64) (*IDSelector, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
var sel *C.FaissIDSelectorBatch
|
||||
if c := C.faiss_IDSelectorBatch_new(
|
||||
&sel,
|
||||
@@ -42,10 +63,7 @@ func NewIDSelectorBatch(indices []int64) (*IDSelector, error) {
|
||||
|
||||
// NewIDSelectorNot creates a new Not selector, wrapped around a
|
||||
// batch selector, with the IDs in 'exclude'.
|
||||
func NewIDSelectorNot(exclude []int64) (*IDSelector, error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
func NewIDSelectorNot(exclude []int64) (*IDSelectorBatch, error) {
|
||||
batchSelector, err := NewIDSelectorBatch(exclude)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -56,12 +74,8 @@ func NewIDSelectorNot(exclude []int64) (*IDSelector, error) {
|
||||
&sel,
|
||||
batchSelector.sel,
|
||||
); c != 0 {
|
||||
batchSelector.Delete()
|
||||
return nil, getLastError()
|
||||
}
|
||||
return &IDSelector{(*C.FaissIDSelector)(sel)}, nil
|
||||
}
|
||||
|
||||
// Delete frees the memory associated with s.
|
||||
func (s *IDSelector) Delete() {
|
||||
C.faiss_IDSelector_free(s.sel)
|
||||
return &IDSelectorBatch{sel: (*C.FaissIDSelector)(sel), batchSel: batchSelector.sel}, nil
|
||||
}
|
||||
|
||||
+5
-2
@@ -18,6 +18,8 @@
|
||||
package segment
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
)
|
||||
|
||||
@@ -55,14 +57,15 @@ type VecPostingsIterator interface {
|
||||
}
|
||||
|
||||
type VectorIndex interface {
|
||||
Search(qVector []float32, k int64, except *roaring.Bitmap) (VecPostingsList, error)
|
||||
// @params: Search params for backing vector index (like IVF, HNSW, etc.)
|
||||
Search(qVector []float32, k int64, params json.RawMessage) (VecPostingsList, error)
|
||||
Close()
|
||||
Size() uint64
|
||||
}
|
||||
|
||||
type VectorSegment interface {
|
||||
Segment
|
||||
InterpretVectorIndex(field string) (VectorIndex, error)
|
||||
InterpretVectorIndex(field string, except *roaring.Bitmap) (VectorIndex, error)
|
||||
}
|
||||
|
||||
type VecPosting interface {
|
||||
|
||||
+1
-1
@@ -166,7 +166,6 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32,
|
||||
memCRC: memCRC,
|
||||
chunkMode: chunkMode,
|
||||
fieldsMap: fieldsMap,
|
||||
fieldsInv: fieldsInv,
|
||||
numDocs: numDocs,
|
||||
storedIndexOffset: storedIndexOffset,
|
||||
fieldsIndexOffset: sectionsIndexOffset,
|
||||
@@ -175,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32,
|
||||
docValueOffset: 0, // docValueOffsets identified automatically by the section
|
||||
dictLocs: dictLocs,
|
||||
fieldFSTs: make(map[uint16]*vellum.FST),
|
||||
vecIndexCache: newVectorIndexCache(),
|
||||
}
|
||||
sb.updateSize()
|
||||
|
||||
|
||||
+299
@@ -0,0 +1,299 @@
|
||||
// Copyright (c) 2024 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build vectors
|
||||
// +build vectors
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
faiss "github.com/blevesearch/go-faiss"
|
||||
)
|
||||
|
||||
func newVectorIndexCache() *vectorIndexCache {
|
||||
return &vectorIndexCache{
|
||||
cache: make(map[uint16]*cacheEntry),
|
||||
closeCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
type vectorIndexCache struct {
|
||||
closeCh chan struct{}
|
||||
m sync.RWMutex
|
||||
cache map[uint16]*cacheEntry
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) Clear() {
|
||||
vc.m.Lock()
|
||||
close(vc.closeCh)
|
||||
|
||||
// forcing a close on all indexes to avoid memory leaks.
|
||||
for _, entry := range vc.cache {
|
||||
entry.close()
|
||||
}
|
||||
vc.cache = nil
|
||||
vc.m.Unlock()
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) loadOrCreate(fieldID uint16, mem []byte, except *roaring.Bitmap) (
|
||||
index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, err error) {
|
||||
var found bool
|
||||
index, vecDocIDMap, vecIDsToExclude, found = vc.loadFromCache(fieldID, except)
|
||||
if !found {
|
||||
index, vecDocIDMap, vecIDsToExclude, err = vc.createAndCache(fieldID, mem, except)
|
||||
}
|
||||
return index, vecDocIDMap, vecIDsToExclude, err
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) loadFromCache(fieldID uint16, except *roaring.Bitmap) (
|
||||
index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, found bool) {
|
||||
vc.m.RLock()
|
||||
defer vc.m.RUnlock()
|
||||
|
||||
entry, ok := vc.cache[fieldID]
|
||||
if !ok {
|
||||
return nil, nil, nil, false
|
||||
}
|
||||
|
||||
index, vecDocIDMap = entry.load()
|
||||
vecIDsToExclude = getVecIDsToExclude(vecDocIDMap, except)
|
||||
|
||||
return index, vecDocIDMap, vecIDsToExclude, true
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) createAndCache(fieldID uint16, mem []byte, except *roaring.Bitmap) (
|
||||
index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, vecIDsToExclude []int64, err error) {
|
||||
vc.m.Lock()
|
||||
defer vc.m.Unlock()
|
||||
|
||||
// when there are multiple threads trying to build the index, guard redundant
|
||||
// index creation by doing a double check and return if already created and
|
||||
// cached.
|
||||
entry, ok := vc.cache[fieldID]
|
||||
if ok {
|
||||
index, vecDocIDMap = entry.load()
|
||||
vecIDsToExclude = getVecIDsToExclude(vecDocIDMap, except)
|
||||
return index, vecDocIDMap, vecIDsToExclude, nil
|
||||
}
|
||||
|
||||
// if the cache doesn't have entry, construct the vector to doc id map and the
|
||||
// vector index out of the mem bytes and update the cache under lock.
|
||||
pos := 0
|
||||
numVecs, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
|
||||
vecDocIDMap = make(map[int64]uint32, numVecs)
|
||||
isExceptNotEmpty := except != nil && !except.IsEmpty()
|
||||
for i := 0; i < int(numVecs); i++ {
|
||||
vecID, n := binary.Varint(mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
docID, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
|
||||
docIDUint32 := uint32(docID)
|
||||
if isExceptNotEmpty && except.Contains(docIDUint32) {
|
||||
vecIDsToExclude = append(vecIDsToExclude, vecID)
|
||||
continue
|
||||
}
|
||||
vecDocIDMap[vecID] = docIDUint32
|
||||
}
|
||||
|
||||
indexSize, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
|
||||
index, err = faiss.ReadIndexFromBuffer(mem[pos:pos+int(indexSize)], faissIOFlags)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
vc.insertLOCKED(fieldID, index, vecDocIDMap)
|
||||
return index, vecDocIDMap, vecIDsToExclude, nil
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) insertLOCKED(fieldIDPlus1 uint16,
|
||||
index *faiss.IndexImpl, vecDocIDMap map[int64]uint32) {
|
||||
// the first time we've hit the cache, try to spawn a monitoring routine
|
||||
// which will reconcile the moving averages for all the fields being hit
|
||||
if len(vc.cache) == 0 {
|
||||
go vc.monitor()
|
||||
}
|
||||
|
||||
_, ok := vc.cache[fieldIDPlus1]
|
||||
if !ok {
|
||||
// initializing the alpha with 0.4 essentially means that we are favoring
|
||||
// the history a little bit more relative to the current sample value.
|
||||
// this makes the average to be kept above the threshold value for a
|
||||
// longer time and thereby the index to be resident in the cache
|
||||
// for longer time.
|
||||
vc.cache[fieldIDPlus1] = createCacheEntry(index, vecDocIDMap, 0.4)
|
||||
}
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) incHit(fieldIDPlus1 uint16) {
|
||||
vc.m.RLock()
|
||||
entry, ok := vc.cache[fieldIDPlus1]
|
||||
if ok {
|
||||
entry.incHit()
|
||||
}
|
||||
vc.m.RUnlock()
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) decRef(fieldIDPlus1 uint16) {
|
||||
vc.m.RLock()
|
||||
entry, ok := vc.cache[fieldIDPlus1]
|
||||
if ok {
|
||||
entry.decRef()
|
||||
}
|
||||
vc.m.RUnlock()
|
||||
}
|
||||
|
||||
func (vc *vectorIndexCache) cleanup() bool {
|
||||
vc.m.Lock()
|
||||
cache := vc.cache
|
||||
|
||||
// for every field reconcile the average with the current sample values
|
||||
for fieldIDPlus1, entry := range cache {
|
||||
sample := atomic.LoadUint64(&entry.tracker.sample)
|
||||
entry.tracker.add(sample)
|
||||
|
||||
refCount := atomic.LoadInt64(&entry.refs)
|
||||
// the comparison threshold as of now is (1 - a). mathematically it
|
||||
// means that there is only 1 query per second on average as per history.
|
||||
// and in the current second, there were no queries performed against
|
||||
// this index.
|
||||
if entry.tracker.avg <= (1-entry.tracker.alpha) && refCount <= 0 {
|
||||
atomic.StoreUint64(&entry.tracker.sample, 0)
|
||||
delete(vc.cache, fieldIDPlus1)
|
||||
entry.close()
|
||||
continue
|
||||
}
|
||||
atomic.StoreUint64(&entry.tracker.sample, 0)
|
||||
}
|
||||
|
||||
rv := len(vc.cache) == 0
|
||||
vc.m.Unlock()
|
||||
return rv
|
||||
}
|
||||
|
||||
// monitorFreq is the tick interval of the monitor goroutine, i.e. how often
// cleanup is run against the cache.
var monitorFreq = 1 * time.Second
|
||||
|
||||
func (vc *vectorIndexCache) monitor() {
|
||||
ticker := time.NewTicker(monitorFreq)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-vc.closeCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
exit := vc.cleanup()
|
||||
if exit {
|
||||
// no entries to be monitored, exit
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// ewma tracks an exponentially weighted moving average of the hits on a cache
// entry.
type ewma struct {
	// alpha is the weight given to a new value; (1 - alpha) goes to the
	// running average.
	alpha float64
	avg   float64
	// sample counts the hits recorded since the last averaging cycle. It is
	// the value fed into the average on the next cycle, which makes avg a
	// measure of hits per cycle.
	sample uint64
}

// add folds val into the average:
//
//	X(t) = a.v + (1 - a).X(t-1)
//
// While the average is still exactly zero, val is taken over as-is.
func (e *ewma) add(val uint64) {
	next := float64(val)
	if e.avg != 0.0 {
		next = e.alpha*float64(val) + (1-e.alpha)*e.avg
	}
	e.avg = next
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
func createCacheEntry(index *faiss.IndexImpl, vecDocIDMap map[int64]uint32, alpha float64) *cacheEntry {
|
||||
return &cacheEntry{
|
||||
index: index,
|
||||
vecDocIDMap: vecDocIDMap,
|
||||
tracker: &ewma{
|
||||
alpha: alpha,
|
||||
sample: 1,
|
||||
},
|
||||
refs: 1,
|
||||
}
|
||||
}
|
||||
|
||||
type cacheEntry struct {
|
||||
tracker *ewma
|
||||
|
||||
// this is used to track the live references to the cache entry,
|
||||
// such that while we do a cleanup() and we see that the avg is below a
|
||||
// threshold we close/cleanup only if the live refs to the cache entry is 0.
|
||||
refs int64
|
||||
|
||||
index *faiss.IndexImpl
|
||||
vecDocIDMap map[int64]uint32
|
||||
}
|
||||
|
||||
func (ce *cacheEntry) incHit() {
|
||||
atomic.AddUint64(&ce.tracker.sample, 1)
|
||||
}
|
||||
|
||||
func (ce *cacheEntry) addRef() {
|
||||
atomic.AddInt64(&ce.refs, 1)
|
||||
}
|
||||
|
||||
func (ce *cacheEntry) decRef() {
|
||||
atomic.AddInt64(&ce.refs, -1)
|
||||
}
|
||||
|
||||
func (ce *cacheEntry) load() (*faiss.IndexImpl, map[int64]uint32) {
|
||||
ce.incHit()
|
||||
ce.addRef()
|
||||
return ce.index, ce.vecDocIDMap
|
||||
}
|
||||
|
||||
func (ce *cacheEntry) close() {
|
||||
go func() {
|
||||
ce.index.Close()
|
||||
ce.index = nil
|
||||
ce.vecDocIDMap = nil
|
||||
}()
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
func getVecIDsToExclude(vecDocIDMap map[int64]uint32, except *roaring.Bitmap) (vecIDsToExclude []int64) {
|
||||
if except != nil && !except.IsEmpty() {
|
||||
for vecID, docID := range vecDocIDMap {
|
||||
if except.Contains(docID) {
|
||||
vecIDsToExclude = append(vecIDsToExclude, vecID)
|
||||
}
|
||||
}
|
||||
}
|
||||
return vecIDsToExclude
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
// Copyright (c) 2024 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build !vectors
|
||||
// +build !vectors
|
||||
|
||||
package zap
|
||||
|
||||
// vectorIndexCache is the placeholder used by builds without the vectors tag;
// it carries no state.
type vectorIndexCache struct{}

// newVectorIndexCache returns nil: there is nothing to cache in this build.
func newVectorIndexCache() *vectorIndexCache {
	var none *vectorIndexCache
	return none
}

// Clear does nothing. It never touches the receiver, so it can be called on
// the nil pointer handed out by newVectorIndexCache.
func (v *vectorIndexCache) Clear() {}
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
// Copyright (c) 2024 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build vectors && !windows
|
||||
// +build vectors,!windows
|
||||
|
||||
package zap
|
||||
|
||||
import faiss "github.com/blevesearch/go-faiss"
|
||||
|
||||
// faissIOFlags are the IO flags handed to faiss.ReadIndexFromBuffer when a
// vector index is loaded. This definition applies to vectors builds on
// non-windows platforms (see the build constraint above).
const faissIOFlags = faiss.IOFlagReadMmap | faiss.IOFlagSkipPrefetch
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
// Copyright (c) 2024 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build vectors && windows
|
||||
// +build vectors,windows
|
||||
|
||||
package zap
|
||||
|
||||
import faiss "github.com/blevesearch/go-faiss"
|
||||
|
||||
// faissIOFlags are the IO flags handed to faiss.ReadIndexFromBuffer when a
// vector index is loaded. This definition applies to vectors builds on
// windows (see the build constraint above).
const faissIOFlags = faiss.IOFlagReadOnly
|
||||
+28
-50
@@ -19,6 +19,7 @@ package zap
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
@@ -266,14 +267,14 @@ func (vpl *VecPostingsIterator) BytesWritten() uint64 {
|
||||
|
||||
// vectorIndexWrapper conforms to scorch_segment_api's VectorIndex interface
|
||||
type vectorIndexWrapper struct {
|
||||
search func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error)
|
||||
search func(qVector []float32, k int64, params json.RawMessage) (segment.VecPostingsList, error)
|
||||
close func()
|
||||
size func() uint64
|
||||
}
|
||||
|
||||
func (i *vectorIndexWrapper) Search(qVector []float32, k int64, except *roaring.Bitmap) (
|
||||
func (i *vectorIndexWrapper) Search(qVector []float32, k int64, params json.RawMessage) (
|
||||
segment.VecPostingsList, error) {
|
||||
return i.search(qVector, k, except)
|
||||
return i.search(qVector, k, params)
|
||||
}
|
||||
|
||||
func (i *vectorIndexWrapper) Close() {
|
||||
@@ -284,21 +285,23 @@ func (i *vectorIndexWrapper) Size() uint64 {
|
||||
return i.size()
|
||||
}
|
||||
|
||||
// InterpretVectorIndex returns closures that will allow the caller to -
|
||||
// (1) SearchVectorIndex - search within an attached vector index
|
||||
// (2) CloseVectorIndex - close attached vector index
|
||||
//
|
||||
// These function pointers may be nil, when InterpretVectorIndex return a non-nil err.
|
||||
// It is on the caller to ensure CloseVectorIndex is invoked (sync or async) after
|
||||
// their business with the attached vector index concludes.
|
||||
func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex, error) {
|
||||
// InterpretVectorIndex returns a construct of closures (vectorIndexWrapper)
|
||||
// that will allow the caller to -
|
||||
// (1) search within an attached vector index
|
||||
// (2) close attached vector index
|
||||
// (3) get the size of the attached vector index
|
||||
func (sb *SegmentBase) InterpretVectorIndex(field string, except *roaring.Bitmap) (
|
||||
segment.VectorIndex, error) {
|
||||
// Params needed for the closures
|
||||
var vecIndex *faiss.IndexImpl
|
||||
vecDocIDMap := make(map[int64]uint32)
|
||||
var vecDocIDMap map[int64]uint32
|
||||
var vectorIDsToExclude []int64
|
||||
var fieldIDPlus1 uint16
|
||||
var vecIndexSize uint64
|
||||
|
||||
var (
|
||||
wrapVecIndex = &vectorIndexWrapper{
|
||||
search: func(qVector []float32, k int64, except *roaring.Bitmap) (segment.VecPostingsList, error) {
|
||||
search: func(qVector []float32, k int64, params json.RawMessage) (segment.VecPostingsList, error) {
|
||||
// 1. returned postings list (of type PostingsList) has two types of information - docNum and its score.
|
||||
// 2. both the values can be represented using roaring bitmaps.
|
||||
// 3. the Iterator (of type PostingsIterator) returned would operate in terms of VecPostings.
|
||||
@@ -315,17 +318,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
var vectorIDsToExclude []int64
|
||||
// iterate through the vector doc ID map and if the doc ID is one to be
|
||||
// deleted, add it to the list
|
||||
for vecID, docID := range vecDocIDMap {
|
||||
if except != nil && except.Contains(docID) {
|
||||
vectorIDsToExclude = append(vectorIDsToExclude, vecID)
|
||||
}
|
||||
}
|
||||
|
||||
scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, vectorIDsToExclude)
|
||||
|
||||
scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, vectorIDsToExclude, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -335,7 +328,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
|
||||
vecID := ids[i]
|
||||
// Checking if it's present in the vecDocIDMap.
|
||||
// If -1 is returned as an ID(insufficient vectors), this will ensure
|
||||
// they it isn't added to the final postings list.
|
||||
// it isn't added to the final postings list.
|
||||
if docID, ok := vecDocIDMap[vecID]; ok {
|
||||
code := getVectorCode(docID, scores[i])
|
||||
rv.postings.Add(uint64(code))
|
||||
@@ -345,22 +338,19 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
|
||||
return rv, nil
|
||||
},
|
||||
close: func() {
|
||||
if vecIndex != nil {
|
||||
vecIndex.Close()
|
||||
}
|
||||
// skipping the closing because the index is cached and it's being
|
||||
// deferred to a later point of time.
|
||||
sb.vecIndexCache.decRef(fieldIDPlus1)
|
||||
},
|
||||
size: func() uint64 {
|
||||
if vecIndex != nil {
|
||||
return vecIndex.Size()
|
||||
}
|
||||
return 0
|
||||
return vecIndexSize
|
||||
},
|
||||
}
|
||||
|
||||
err error
|
||||
)
|
||||
|
||||
fieldIDPlus1 := sb.fieldsMap[field]
|
||||
fieldIDPlus1 = sb.fieldsMap[field]
|
||||
if fieldIDPlus1 <= 0 {
|
||||
return wrapVecIndex, nil
|
||||
}
|
||||
@@ -382,25 +372,13 @@ func (sb *SegmentBase) InterpretVectorIndex(field string) (segment.VectorIndex,
|
||||
pos += n
|
||||
}
|
||||
|
||||
// read the number vectors indexed for this field and load the vector to docID mapping.
|
||||
// todo: cache the vecID to docIDs mapping for a fieldID
|
||||
numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
for i := 0; i < int(numVecs); i++ {
|
||||
vecID, n := binary.Varint(sb.mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
vecDocIDMap[vecID] = uint32(docID)
|
||||
vecIndex, vecDocIDMap, vectorIDsToExclude, err =
|
||||
sb.vecIndexCache.loadOrCreate(fieldIDPlus1, sb.mem[pos:], except)
|
||||
|
||||
if vecIndex != nil {
|
||||
vecIndexSize = vecIndex.Size()
|
||||
}
|
||||
|
||||
// todo: not a good idea to cache the vector index perhaps, since it could be quite huge.
|
||||
indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
|
||||
pos += n
|
||||
indexBytes := sb.mem[pos : pos+int(indexSize)]
|
||||
pos += int(indexSize)
|
||||
|
||||
vecIndex, err = faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
|
||||
return wrapVecIndex, err
|
||||
}
|
||||
|
||||
|
||||
+45
-33
@@ -31,6 +31,8 @@ import (
|
||||
seg "github.com/blevesearch/scorch_segment_api/v2"
|
||||
)
|
||||
|
||||
// defaultFaissOMPThreads is the thread count passed to faiss.SetOMPThreads
// from this package's init function.
const defaultFaissOMPThreads = 1
|
||||
|
||||
func init() {
|
||||
rand.Seed(time.Now().UTC().UnixNano())
|
||||
registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{})
|
||||
@@ -38,6 +40,7 @@ func init() {
|
||||
_, ok := field.(index.VectorField)
|
||||
return ok
|
||||
}
|
||||
faiss.SetOMPThreads(defaultFaissOMPThreads)
|
||||
}
|
||||
|
||||
type faissVectorIndexSection struct {
|
||||
@@ -73,7 +76,7 @@ type vecIndexMeta struct {
|
||||
indexOptimizedFor string
|
||||
}
|
||||
|
||||
// keep in mind with respect to update and delete operations with resepct to vectors
|
||||
// keep in mind with respect to update and delete operations with respect to vectors
|
||||
func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase,
|
||||
drops []*roaring.Bitmap, fieldsInv []string,
|
||||
newDocNumsIn [][]uint64, w *CountHashWriter, closeCh chan struct{}) error {
|
||||
@@ -275,7 +278,7 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
|
||||
indexes []*vecIndexMeta, w *CountHashWriter, closeCh chan struct{}) error {
|
||||
|
||||
vecIndexes := make([]*faiss.IndexImpl, 0, len(sbs))
|
||||
reconsCap := 0
|
||||
var finalVecIDCap, indexDataCap, reconsCap int
|
||||
for segI, segBase := range sbs {
|
||||
// Considering merge operations on vector indexes are expensive, it is
|
||||
// worth including an early exit if the merge is aborted, saving us
|
||||
@@ -286,14 +289,18 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
|
||||
}
|
||||
// read the index bytes. todo: parallelize this
|
||||
indexBytes := segBase.mem[indexes[segI].startOffset : indexes[segI].startOffset+int(indexes[segI].indexSize)]
|
||||
index, err := faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
|
||||
index, err := faiss.ReadIndexFromBuffer(indexBytes, faissIOFlags)
|
||||
if err != nil {
|
||||
freeReconstructedIndexes(vecIndexes)
|
||||
return err
|
||||
}
|
||||
indexReconsLen := len(indexes[segI].vecIds) * index.D()
|
||||
if indexReconsLen > reconsCap {
|
||||
reconsCap = indexReconsLen
|
||||
if len(indexes[segI].vecIds) > 0 {
|
||||
indexReconsLen := len(indexes[segI].vecIds) * index.D()
|
||||
if indexReconsLen > reconsCap {
|
||||
reconsCap = indexReconsLen
|
||||
}
|
||||
indexDataCap += indexReconsLen
|
||||
finalVecIDCap += len(indexes[segI].vecIds)
|
||||
}
|
||||
vecIndexes = append(vecIndexes, index)
|
||||
}
|
||||
@@ -303,13 +310,6 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
|
||||
return nil
|
||||
}
|
||||
|
||||
var mergedIndexBytes []byte
|
||||
|
||||
// capacities for the finalVecIDs and indexData slices
|
||||
// to avoid multiple allocations, via append.
|
||||
finalVecIDCap := len(indexes[0].vecIds) * len(vecIndexes)
|
||||
indexDataCap := finalVecIDCap * vecIndexes[0].D()
|
||||
|
||||
finalVecIDs := make([]int64, 0, finalVecIDCap)
|
||||
// merging of indexes with reconstruction method.
|
||||
// the indexes[i].vecIds has only the valid vecs of this vector
|
||||
@@ -347,25 +347,27 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
|
||||
freeReconstructedIndexes(vecIndexes)
|
||||
return nil
|
||||
}
|
||||
recons = nil
|
||||
|
||||
nvecs := len(finalVecIDs)
|
||||
|
||||
// index type to be created after merge based on the number of vectors in
|
||||
// indexData added into the index.
|
||||
nlist := determineCentroids(nvecs)
|
||||
indexDescription, indexClass := determineIndexToUse(nvecs, nlist)
|
||||
|
||||
// safe to assume that all the indexes are of the same config values, given
|
||||
// that they are extracted from the field mapping info.
|
||||
dims := vecIndexes[0].D()
|
||||
metric := vecIndexes[0].MetricType()
|
||||
indexOptimizedFor := indexes[0].indexOptimizedFor
|
||||
|
||||
// index type to be created after merge based on the number of vectors
|
||||
// in indexData added into the index.
|
||||
nlist := determineCentroids(nvecs)
|
||||
indexDescription, indexClass := determineIndexToUse(nvecs, nlist, indexOptimizedFor)
|
||||
|
||||
// freeing the reconstructed indexes immediately - waiting till the end
|
||||
// to do the same is not needed because the following operations don't need
|
||||
// the reconstructed ones anymore and doing so will hold up memory which can
|
||||
// be detrimental while creating indexes during introduction.
|
||||
freeReconstructedIndexes(vecIndexes)
|
||||
vecIndexes = nil
|
||||
|
||||
faissIndex, err := faiss.IndexFactory(dims, indexDescription, metric)
|
||||
if err != nil {
|
||||
@@ -400,6 +402,9 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase,
|
||||
return err
|
||||
}
|
||||
|
||||
indexData = nil
|
||||
finalVecIDs = nil
|
||||
var mergedIndexBytes []byte
|
||||
mergedIndexBytes, err = faiss.WriteIndexIntoBuffer(faissIndex)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -435,10 +440,7 @@ func determineCentroids(nvecs int) int {
|
||||
var nlist int
|
||||
|
||||
switch {
|
||||
// At 1M vectors, nlist = 4k gave a reasonably high recall with the right nprobe,
|
||||
// whereas 1M/100 = 10000 centroids would increase training time without
|
||||
// corresponding increase in recall
|
||||
case nvecs >= 1000000:
|
||||
case nvecs >= 200000:
|
||||
nlist = int(4 * math.Sqrt(float64(nvecs)))
|
||||
case nvecs >= 1000:
|
||||
// 100 points per cluster is a reasonable default, considering the default
|
||||
@@ -457,7 +459,16 @@ const (
|
||||
|
||||
// Returns a description string for the index and quantizer type
|
||||
// and an index type.
|
||||
func determineIndexToUse(nvecs, nlist int) (string, int) {
|
||||
func determineIndexToUse(nvecs, nlist int, indexOptimizedFor string) (string, int) {
|
||||
if indexOptimizedFor == index.IndexOptimizedForMemoryEfficient {
|
||||
switch {
|
||||
case nvecs >= 1000:
|
||||
return fmt.Sprintf("IVF%d,SQ4", nlist), IndexTypeIVF
|
||||
default:
|
||||
return "IDMap2,Flat", IndexTypeFlat
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case nvecs >= 10000:
|
||||
return fmt.Sprintf("IVF%d,SQ8", nlist), IndexTypeIVF
|
||||
@@ -476,11 +487,11 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
|
||||
for fieldID, content := range vo.vecFieldMap {
|
||||
// calculate the capacity of the vecs and ids slices
|
||||
// to avoid multiple allocations.
|
||||
vecs := make([]float32, 0, uint16(len(content.vecs))*content.dim)
|
||||
vecs := make([]float32, 0, len(content.vecs)*int(content.dim))
|
||||
ids := make([]int64, 0, len(content.vecs))
|
||||
for hash, vecInfo := range content.vecs {
|
||||
vecs = append(vecs, vecInfo.vec...)
|
||||
ids = append(ids, int64(hash))
|
||||
ids = append(ids, hash)
|
||||
}
|
||||
|
||||
var metric = faiss.MetricL2
|
||||
@@ -490,7 +501,8 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
|
||||
|
||||
nvecs := len(ids)
|
||||
nlist := determineCentroids(nvecs)
|
||||
indexDescription, indexClass := determineIndexToUse(nvecs, nlist)
|
||||
indexDescription, indexClass := determineIndexToUse(nvecs, nlist,
|
||||
content.indexOptimizedFor)
|
||||
faissIndex, err := faiss.IndexFactory(int(content.dim), indexDescription, metric)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
@@ -518,12 +530,6 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// serialize the built index into a byte slice
|
||||
buf, err := faiss.WriteIndexIntoBuffer(faissIndex)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
fieldStart := w.Count()
|
||||
// writing out two offset values to indicate that the current field's
|
||||
// vector section doesn't have valid doc value content within it.
|
||||
@@ -557,7 +563,7 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
|
||||
// section would be help avoiding in paging in this data as part of a page
|
||||
// (which is to load a non-cacheable info like index). this could help the
|
||||
// paging costs
|
||||
for vecID, _ := range content.vecs {
|
||||
for vecID := range content.vecs {
|
||||
docID := vo.vecIDMap[vecID].docID
|
||||
// write the vecID
|
||||
n = binary.PutVarint(tempBuf, vecID)
|
||||
@@ -573,6 +579,12 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
|
||||
}
|
||||
}
|
||||
|
||||
// serialize the built index into a byte slice
|
||||
buf, err := faiss.WriteIndexIntoBuffer(faissIndex)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// record the fieldStart value for this section.
|
||||
// write the vecID -> docID mapping
|
||||
// write the index bytes and its length
|
||||
|
||||
+27
-4
@@ -55,6 +55,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) {
|
||||
SegmentBase: SegmentBase{
|
||||
fieldsMap: make(map[string]uint16),
|
||||
fieldFSTs: make(map[uint16]*vellum.FST),
|
||||
vecIndexCache: newVectorIndexCache(),
|
||||
fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)),
|
||||
},
|
||||
f: f,
|
||||
@@ -81,7 +82,6 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) {
|
||||
_ = rv.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
@@ -110,6 +110,9 @@ type SegmentBase struct {
|
||||
|
||||
m sync.Mutex
|
||||
fieldFSTs map[uint16]*vellum.FST
|
||||
|
||||
// this cache comes into play when vectors are supported in builds.
|
||||
vecIndexCache *vectorIndexCache
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) Size() int {
|
||||
@@ -146,7 +149,7 @@ func (sb *SegmentBase) updateSize() {
|
||||
|
||||
func (sb *SegmentBase) AddRef() {}
|
||||
func (sb *SegmentBase) DecRef() (err error) { return nil }
|
||||
func (sb *SegmentBase) Close() (err error) { return nil }
|
||||
func (sb *SegmentBase) Close() (err error) { sb.vecIndexCache.Clear(); return nil }
|
||||
|
||||
// Segment implements a persisted segment.Segment interface, by
|
||||
// embedding an mmap()'ed SegmentBase.
|
||||
@@ -319,13 +322,29 @@ func (s *SegmentBase) loadFieldsNew() error {
|
||||
return s.loadFields()
|
||||
}
|
||||
|
||||
seek := pos + binary.MaxVarintLen64
|
||||
if seek > uint64(len(s.mem)) {
|
||||
// handling a buffer overflow case.
|
||||
// a rare case where the backing buffer is not large enough to be read directly via
|
||||
// a pos+binary.MaxVarintLen64 seek. For example, this can happen when there is only
|
||||
// one field to be indexed in the entire batch of data and while writing out
|
||||
// these fields metadata, you write 1 + 8 bytes whereas the MaxVarintLen64 = 10.
|
||||
seek = uint64(len(s.mem))
|
||||
}
|
||||
|
||||
// read the number of fields
|
||||
numFields, sz := binary.Uvarint(s.mem[pos : pos+binary.MaxVarintLen64])
|
||||
numFields, sz := binary.Uvarint(s.mem[pos:seek])
|
||||
// here, the pos is incremented by the valid number bytes read from the buffer
|
||||
// so in the edge case pointed out above the numFields = 1, the sz = 1 as well.
|
||||
pos += uint64(sz)
|
||||
s.incrementBytesRead(uint64(sz))
|
||||
|
||||
// the following loop will be executed only once in the edge case pointed out above
|
||||
// since there is only one field's offset stored, which occupies 8 bytes.
|
||||
// the pointer then seeks to a position preceding the sectionsIndexOffset, at
|
||||
// which point the responsibility of handling the out-of-bounds cases shifts to
|
||||
// the specific section's parsing logic.
|
||||
var fieldID uint64
|
||||
|
||||
for fieldID < numFields {
|
||||
addr := binary.BigEndian.Uint64(s.mem[pos : pos+8])
|
||||
s.incrementBytesRead(8)
|
||||
@@ -629,6 +648,9 @@ func (s *Segment) Close() (err error) {
|
||||
}
|
||||
|
||||
func (s *Segment) closeActual() (err error) {
|
||||
// clear contents from the vector index cache before un-mmapping
|
||||
s.vecIndexCache.Clear()
|
||||
|
||||
if s.mm != nil {
|
||||
err = s.mm.Unmap()
|
||||
}
|
||||
@@ -640,6 +662,7 @@ func (s *Segment) closeActual() (err error) {
|
||||
err = err2
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
Vendored
+13
-9
@@ -64,7 +64,7 @@ github.com/ProtonMail/go-crypto/openpgp/internal/ecc
|
||||
github.com/ProtonMail/go-crypto/openpgp/internal/encoding
|
||||
github.com/ProtonMail/go-crypto/openpgp/packet
|
||||
github.com/ProtonMail/go-crypto/openpgp/s2k
|
||||
# github.com/RoaringBitmap/roaring v1.2.3
|
||||
# github.com/RoaringBitmap/roaring v1.9.3
|
||||
## explicit; go 1.14
|
||||
github.com/RoaringBitmap/roaring
|
||||
github.com/RoaringBitmap/roaring/internal
|
||||
@@ -157,11 +157,11 @@ github.com/beorn7/perks/quantile
|
||||
# github.com/bitly/go-simplejson v0.5.0
|
||||
## explicit
|
||||
github.com/bitly/go-simplejson
|
||||
# github.com/bits-and-blooms/bitset v1.2.1
|
||||
## explicit; go 1.14
|
||||
# github.com/bits-and-blooms/bitset v1.12.0
|
||||
## explicit; go 1.16
|
||||
github.com/bits-and-blooms/bitset
|
||||
# github.com/blevesearch/bleve/v2 v2.4.0
|
||||
## explicit; go 1.20
|
||||
# github.com/blevesearch/bleve/v2 v2.4.2
|
||||
## explicit; go 1.21
|
||||
github.com/blevesearch/bleve/v2
|
||||
github.com/blevesearch/bleve/v2/analysis
|
||||
github.com/blevesearch/bleve/v2/analysis/analyzer/custom
|
||||
@@ -169,6 +169,10 @@ github.com/blevesearch/bleve/v2/analysis/analyzer/keyword
|
||||
github.com/blevesearch/bleve/v2/analysis/analyzer/standard
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/flexible
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/optional
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds
|
||||
github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds
|
||||
github.com/blevesearch/bleve/v2/analysis/lang/en
|
||||
github.com/blevesearch/bleve/v2/analysis/token/lowercase
|
||||
github.com/blevesearch/bleve/v2/analysis/token/porter
|
||||
@@ -198,14 +202,14 @@ github.com/blevesearch/bleve/v2/search/scorer
|
||||
github.com/blevesearch/bleve/v2/search/searcher
|
||||
github.com/blevesearch/bleve/v2/size
|
||||
github.com/blevesearch/bleve/v2/util
|
||||
# github.com/blevesearch/bleve_index_api v1.1.6
|
||||
# github.com/blevesearch/bleve_index_api v1.1.10
|
||||
## explicit; go 1.20
|
||||
github.com/blevesearch/bleve_index_api
|
||||
# github.com/blevesearch/geo v0.1.20
|
||||
## explicit; go 1.18
|
||||
github.com/blevesearch/geo/geojson
|
||||
github.com/blevesearch/geo/s2
|
||||
# github.com/blevesearch/go-faiss v1.0.13
|
||||
# github.com/blevesearch/go-faiss v1.0.20
|
||||
## explicit; go 1.19
|
||||
github.com/blevesearch/go-faiss
|
||||
# github.com/blevesearch/go-porterstemmer v1.0.3
|
||||
@@ -217,7 +221,7 @@ github.com/blevesearch/gtreap
|
||||
# github.com/blevesearch/mmap-go v1.0.4
|
||||
## explicit; go 1.13
|
||||
github.com/blevesearch/mmap-go
|
||||
# github.com/blevesearch/scorch_segment_api/v2 v2.2.9
|
||||
# github.com/blevesearch/scorch_segment_api/v2 v2.2.15
|
||||
## explicit; go 1.20
|
||||
github.com/blevesearch/scorch_segment_api/v2
|
||||
# github.com/blevesearch/segment v0.9.1
|
||||
@@ -251,7 +255,7 @@ github.com/blevesearch/zapx/v14
|
||||
# github.com/blevesearch/zapx/v15 v15.3.13
|
||||
## explicit; go 1.19
|
||||
github.com/blevesearch/zapx/v15
|
||||
# github.com/blevesearch/zapx/v16 v16.0.12
|
||||
# github.com/blevesearch/zapx/v16 v16.1.5
|
||||
## explicit; go 1.20
|
||||
github.com/blevesearch/zapx/v16
|
||||
# github.com/bluele/gcache v0.0.2
|
||||
|
||||
Reference in New Issue
Block a user