From 0c84ba3ad24412ca73bc748c3f17735442db6647 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 08:41:09 +0000 Subject: [PATCH] build(deps): bump github.com/blevesearch/bleve/v2 from 2.4.4 to 2.5.0 Bumps [github.com/blevesearch/bleve/v2](https://github.com/blevesearch/bleve) from 2.4.4 to 2.5.0. - [Release notes](https://github.com/blevesearch/bleve/releases) - [Commits](https://github.com/blevesearch/bleve/compare/v2.4.4...v2.5.0) --- updated-dependencies: - dependency-name: github.com/blevesearch/bleve/v2 dependency-version: 2.5.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 26 +- go.sum | 52 +- .../github.com/RoaringBitmap/roaring/clz.go | 13 - .../github.com/RoaringBitmap/roaring/ctz.go | 13 - .../RoaringBitmap/roaring/{ => v2}/.gitignore | 0 .../roaring/{ => v2}/.gitmodules | 0 .../RoaringBitmap/roaring/{ => v2}/AUTHORS | 0 .../roaring/{ => v2}/CONTRIBUTORS | 0 .../RoaringBitmap/roaring/{ => v2}/LICENSE | 0 .../roaring/{ => v2}/LICENSE-2.0.txt | 0 .../RoaringBitmap/roaring/{ => v2}/README.md | 24 +- .../roaring/{ => v2}/arraycontainer.go | 239 +- .../roaring/{ => v2}/bitmapcontainer.go | 241 +- .../RoaringBitmap/roaring/v2/clz.go | 19 + .../roaring/{ => v2}/clz_compat.go | 0 .../RoaringBitmap/roaring/v2/ctz.go | 21 + .../roaring/{ => v2}/ctz_compat.go | 0 .../roaring/{ => v2}/fastaggregation.go | 0 .../roaring/{ => v2}/internal/byte_input.go | 0 .../roaring/{ => v2}/internal/pools.go | 0 .../roaring/{ => v2}/manyiterator.go | 0 .../roaring/{ => v2}/parallel.go | 0 .../RoaringBitmap/roaring/{ => v2}/popcnt.go | 0 .../roaring/{ => v2}/popcnt_amd64.s | 0 .../roaring/{ => v2}/popcnt_asm.go | 0 .../roaring/{ => v2}/popcnt_compat.go | 0 .../roaring/{ => v2}/popcnt_generic.go | 0 .../roaring/{ => v2}/popcnt_slices.go | 0 .../roaring/{ => v2}/priorityqueue.go | 0 .../RoaringBitmap/roaring/{ => v2}/roaring.go | 261 +- .../roaring/{ => v2}/roaring64/Makefile | 0 .../roaring/{ => v2}/roaring64/bsi64.go | 363 +- .../{ => v2}/roaring64/fastaggregation64.go | 0 .../roaring/{ => v2}/roaring64/iterables64.go | 2 +- .../roaring/{ => v2}/roaring64/parallel64.go | 9 +- .../roaring/{ => v2}/roaring64/roaring64.go | 34 +- .../{ => v2}/roaring64/roaringarray64.go | 63 +- .../roaring/{ => v2}/roaring64/util.go | 2 +- .../roaring/{ => v2}/roaringarray.go | 89 +- .../roaring/{ => v2}/runcontainer.go | 284 +- .../roaring/{ => v2}/serialization.go | 0 .../roaring/{ => v2}/serialization_generic.go | 0 .../{ => v2}/serialization_littleendian.go | 9 + .../roaring/{ => v2}/serializationfuzz.go | 0 .../RoaringBitmap/roaring/{ => v2}/setutil.go | 261 +- .../roaring/{ => v2}/setutil_arm64.go | 0 .../roaring/{ => v2}/setutil_arm64.s | 0 .../roaring/{ => v2}/setutil_generic.go | 0 .../roaring/{ => v2}/shortiterator.go | 0 .../RoaringBitmap/roaring/{ => v2}/smat.go | 0 .../RoaringBitmap/roaring/{ => v2}/util.go | 12 +- .../bits-and-blooms/bitset/README.md | 26 +- .../bits-and-blooms/bitset/bitset.go | 801 +- .../bits-and-blooms/bitset/bitset_iter.go | 23 + .../bits-and-blooms/bitset/pext.gen.go | 8866 +++++++++++++++++ .../bits-and-blooms/bitset/popcnt.go | 76 +- .../bits-and-blooms/bitset/popcnt_19.go | 62 - .../bits-and-blooms/bitset/popcnt_amd64.go | 68 - .../bits-and-blooms/bitset/popcnt_amd64.s | 104 - .../bits-and-blooms/bitset/popcnt_generic.go | 25 - .../bits-and-blooms/bitset/select.go | 47 + .../bitset/trailing_zeros_18.go | 15 - .../bitset/trailing_zeros_19.go 
| 10 - .../blevesearch/bleve/v2/.travis.yml | 12 +- .../github.com/blevesearch/bleve/v2/README.md | 13 +- .../v2/analysis/analyzer/custom/custom.go | 5 +- .../v2/analysis/analyzer/keyword/keyword.go | 5 +- .../v2/analysis/analyzer/standard/standard.go | 5 +- .../v2/analysis/datetime/flexible/flexible.go | 5 +- .../v2/analysis/datetime/optional/optional.go | 5 +- .../timestamp/microseconds/microseconds.go | 5 +- .../timestamp/milliseconds/milliseconds.go | 5 +- .../timestamp/nanoseconds/nanoseconds.go | 5 +- .../datetime/timestamp/seconds/seconds.go | 5 +- .../bleve/v2/analysis/lang/en/analyzer_en.go | 5 +- .../v2/analysis/lang/en/plural_stemmer.go | 5 +- .../analysis/lang/en/possessive_filter_en.go | 5 +- .../analysis/lang/en/stemmer_en_snowball.go | 5 +- .../v2/analysis/lang/en/stop_filter_en.go | 5 +- .../v2/analysis/lang/en/stop_words_en.go | 5 +- .../v2/analysis/token/lowercase/lowercase.go | 5 +- .../bleve/v2/analysis/token/porter/porter.go | 5 +- .../bleve/v2/analysis/token/stop/stop.go | 5 +- .../v2/analysis/tokenizer/single/single.go | 5 +- .../v2/analysis/tokenizer/unicode/unicode.go | 5 +- .../blevesearch/bleve/v2/analysis/type.go | 9 + vendor/github.com/blevesearch/bleve/v2/doc.go | 2 +- .../blevesearch/bleve/v2/document/document.go | 24 + .../bleve/v2/document/field_boolean.go | 4 +- .../bleve/v2/document/field_geoshape.go | 11 +- .../blevesearch/bleve/v2/document/field_ip.go | 4 +- .../bleve/v2/document/field_synonym.go | 149 + .../github.com/blevesearch/bleve/v2/error.go | 2 + .../blevesearch/bleve/v2/geo/README.md | 2 +- .../blevesearch/bleve/v2/geo/geo.go | 2 +- .../blevesearch/bleve/v2/geo/geo_dist.go | 8 +- .../blevesearch/bleve/v2/geo/parse.go | 19 +- .../blevesearch/bleve/v2/geo/sloppy.go | 163 +- .../github.com/blevesearch/bleve/v2/index.go | 63 + .../bleve/v2/index/scorch/builder.go | 4 +- .../bleve/v2/index/scorch/introducer.go | 101 +- .../bleve/v2/index/scorch/merge.go | 191 +- .../v2/index/scorch/mergeplan/merge_plan.go | 31 +- .../bleve/v2/index/scorch/optimize.go | 2 +- .../bleve/v2/index/scorch/optimize_knn.go | 33 +- .../bleve/v2/index/scorch/persister.go | 226 +- .../bleve/v2/index/scorch/scorch.go | 82 +- .../bleve/v2/index/scorch/segment_plugin.go | 5 +- .../bleve/v2/index/scorch/snapshot_index.go | 279 +- .../v2/index/scorch/snapshot_index_dict.go | 14 +- .../v2/index/scorch/snapshot_index_doc.go | 2 +- .../v2/index/scorch/snapshot_index_str.go | 75 + .../v2/index/scorch/snapshot_index_tfr.go | 2 +- .../v2/index/scorch/snapshot_index_thes.go | 107 + .../v2/index/scorch/snapshot_index_vr.go | 42 +- .../bleve/v2/index/scorch/snapshot_segment.go | 2 +- .../v2/index/scorch/snapshot_vector_index.go | 77 +- .../bleve/v2/index/scorch/stats.go | 4 +- .../bleve/v2/index/scorch/unadorned.go | 2 +- .../bleve/v2/index/upsidedown/field_dict.go | 4 + .../bleve/v2/index/upsidedown/row.go | 16 +- .../v2/index/upsidedown/store/boltdb/store.go | 5 +- .../v2/index/upsidedown/store/gtreap/store.go | 7 +- .../bleve/v2/index/upsidedown/upsidedown.go | 5 +- .../blevesearch/bleve/v2/index_alias_impl.go | 253 +- .../blevesearch/bleve/v2/index_impl.go | 144 +- .../blevesearch/bleve/v2/mapping/analysis.go | 8 + .../blevesearch/bleve/v2/mapping/document.go | 56 +- .../blevesearch/bleve/v2/mapping/field.go | 17 +- .../blevesearch/bleve/v2/mapping/index.go | 116 +- .../blevesearch/bleve/v2/mapping/mapping.go | 16 + .../bleve/v2/mapping/mapping_vectors.go | 6 +- .../blevesearch/bleve/v2/mapping/synonym.go | 71 + .../blevesearch/bleve/v2/pre_search.go | 125 +- 
.../blevesearch/bleve/v2/registry/analyzer.go | 5 +- .../bleve/v2/registry/char_filter.go | 5 +- .../bleve/v2/registry/datetime_parser.go | 5 +- .../bleve/v2/registry/fragment_formatter.go | 5 +- .../bleve/v2/registry/fragmenter.go | 5 +- .../bleve/v2/registry/highlighter.go | 5 +- .../bleve/v2/registry/index_type.go | 7 +- .../blevesearch/bleve/v2/registry/registry.go | 11 + .../blevesearch/bleve/v2/registry/store.go | 7 +- .../bleve/v2/registry/synonym_source.go | 86 + .../bleve/v2/registry/token_filter.go | 5 +- .../bleve/v2/registry/token_maps.go | 5 +- .../bleve/v2/registry/tokenizer.go | 5 +- .../github.com/blevesearch/bleve/v2/search.go | 24 +- .../bleve/v2/search/collector/eligible.go | 65 +- .../bleve/v2/search/collector/topn.go | 17 + .../bleve/v2/search/explanation.go | 7 +- .../v2/search/facet/facet_builder_datetime.go | 10 +- .../v2/search/facet/facet_builder_numeric.go | 10 +- .../v2/search/facet/facet_builder_terms.go | 2 +- .../v2/search/highlight/format/html/html.go | 5 +- .../highlight/fragmenter/simple/simple.go | 5 +- .../search/highlight/highlighter/html/html.go | 5 +- .../highlighter/simple/highlighter_simple.go | 6 +- .../bleve/v2/search/levenshtein.go | 4 + .../bleve/v2/search/query/conjunction.go | 4 +- .../bleve/v2/search/query/disjunction.go | 7 +- .../bleve/v2/search/query/fuzzy.go | 55 + .../bleve/v2/search/query/geo_boundingbox.go | 3 + .../bleve/v2/search/query/ip_range.go | 2 +- .../blevesearch/bleve/v2/search/query/knn.go | 20 +- .../bleve/v2/search/query/match.go | 63 +- .../bleve/v2/search/query/match_phrase.go | 58 +- .../bleve/v2/search/query/multi_phrase.go | 67 +- .../bleve/v2/search/query/phrase.go | 67 +- .../bleve/v2/search/query/query.go | 364 +- .../bleve/v2/search/query/regexp.go | 7 +- .../bleve/v2/search/scorer/scorer_constant.go | 2 +- .../v2/search/scorer/scorer_disjunction.go | 2 +- .../bleve/v2/search/scorer/scorer_knn.go | 2 +- .../bleve/v2/search/scorer/scorer_term.go | 108 +- .../blevesearch/bleve/v2/search/search.go | 17 +- .../v2/search/searcher/search_conjunction.go | 7 +- .../v2/search/searcher/search_disjunction.go | 2 +- .../searcher/search_disjunction_heap.go | 2 - .../searcher/search_disjunction_slice.go | 18 +- .../bleve/v2/search/searcher/search_fuzzy.go | 120 +- .../search/searcher/search_geoboundingbox.go | 28 +- .../v2/search/searcher/search_geoshape.go | 19 +- .../bleve/v2/search/searcher/search_knn.go | 13 +- .../v2/search/searcher/search_multi_term.go | 51 + .../bleve/v2/search/searcher/search_phrase.go | 74 +- .../bleve/v2/search/searcher/search_regexp.go | 26 +- .../bleve/v2/search/searcher/search_term.go | 141 +- .../v2/search/searcher/search_term_prefix.go | 37 +- .../blevesearch/bleve/v2/search/sort.go | 14 +- .../blevesearch/bleve/v2/search/util.go | 70 +- .../blevesearch/bleve/v2/search_knn.go | 107 +- .../blevesearch/bleve/v2/search_no_knn.go | 2 +- .../blevesearch/bleve_index_api/document.go | 35 +- .../blevesearch/bleve_index_api/index.go | 104 +- .../bleve_index_api/indexing_options.go | 15 + .../blevesearch/bleve_index_api/vector.go | 4 +- .../bleve_index_api/vector_index.go | 17 +- .../github.com/blevesearch/go-faiss/index.go | 88 +- .../blevesearch/go-faiss/search_params.go | 106 +- .../scorch_segment_api/v2/segment.go | 61 +- .../scorch_segment_api/v2/segment_vector.go | 2 +- .../blevesearch/vellum/automaton.go | 6 + .../blevesearch/vellum/fst_iterator.go | 14 + .../blevesearch/vellum/levenshtein/dfa.go | 32 +- .../blevesearch/vellum/regexp/regexp.go | 11 + .../github.com/blevesearch/zapx/v11/dict.go | 9 
+- .../github.com/blevesearch/zapx/v11/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v11/new.go | 2 +- .../blevesearch/zapx/v11/posting.go | 2 +- .../blevesearch/zapx/v11/segment.go | 2 +- .../github.com/blevesearch/zapx/v11/write.go | 2 +- .../github.com/blevesearch/zapx/v12/dict.go | 9 +- .../github.com/blevesearch/zapx/v12/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v12/new.go | 2 +- .../blevesearch/zapx/v12/posting.go | 2 +- .../blevesearch/zapx/v12/segment.go | 2 +- .../github.com/blevesearch/zapx/v12/write.go | 2 +- .../github.com/blevesearch/zapx/v13/dict.go | 9 +- .../github.com/blevesearch/zapx/v13/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v13/new.go | 2 +- .../blevesearch/zapx/v13/posting.go | 2 +- .../blevesearch/zapx/v13/segment.go | 2 +- .../github.com/blevesearch/zapx/v13/write.go | 2 +- .../github.com/blevesearch/zapx/v14/dict.go | 9 +- .../github.com/blevesearch/zapx/v14/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v14/new.go | 2 +- .../blevesearch/zapx/v14/posting.go | 2 +- .../blevesearch/zapx/v14/segment.go | 2 +- .../github.com/blevesearch/zapx/v14/write.go | 2 +- .../github.com/blevesearch/zapx/v15/dict.go | 9 +- .../github.com/blevesearch/zapx/v15/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v15/new.go | 2 +- .../blevesearch/zapx/v15/posting.go | 2 +- .../blevesearch/zapx/v15/segment.go | 2 +- .../github.com/blevesearch/zapx/v15/write.go | 2 +- .../github.com/blevesearch/zapx/v16/build.go | 1 + .../github.com/blevesearch/zapx/v16/dict.go | 12 +- .../zapx/v16/faiss_vector_cache.go | 4 +- .../zapx/v16/faiss_vector_posting.go | 301 +- .../github.com/blevesearch/zapx/v16/merge.go | 2 +- vendor/github.com/blevesearch/zapx/v16/new.go | 36 +- .../blevesearch/zapx/v16/posting.go | 2 +- .../blevesearch/zapx/v16/section.go | 3 +- .../zapx/v16/section_faiss_vector_index.go | 41 +- .../zapx/v16/section_inverted_text_index.go | 76 +- .../zapx/v16/section_synonym_index.go | 786 ++ .../blevesearch/zapx/v16/segment.go | 82 +- .../blevesearch/zapx/v16/synonym_cache.go | 126 + .../blevesearch/zapx/v16/synonym_posting.go | 239 + .../blevesearch/zapx/v16/thesaurus.go | 159 + .../github.com/blevesearch/zapx/v16/write.go | 2 +- vendor/github.com/blevesearch/zapx/v16/zap.md | 88 +- vendor/modules.txt | 50 +- 254 files changed, 17180 insertions(+), 2192 deletions(-) delete mode 100644 vendor/github.com/RoaringBitmap/roaring/clz.go delete mode 100644 vendor/github.com/RoaringBitmap/roaring/ctz.go rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/.gitignore (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/.gitmodules (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/AUTHORS (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/CONTRIBUTORS (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/LICENSE (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/LICENSE-2.0.txt (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/README.md (94%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/arraycontainer.go (83%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/bitmapcontainer.go (83%) create mode 100644 vendor/github.com/RoaringBitmap/roaring/v2/clz.go rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/clz_compat.go (100%) create mode 100644 vendor/github.com/RoaringBitmap/roaring/v2/ctz.go rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/ctz_compat.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/fastaggregation.go (100%) rename 
vendor/github.com/RoaringBitmap/roaring/{ => v2}/internal/byte_input.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/internal/pools.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/manyiterator.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/parallel.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt_amd64.s (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt_asm.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt_compat.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt_generic.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/popcnt_slices.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/priorityqueue.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring.go (88%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/Makefile (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/bsi64.go (69%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/fastaggregation64.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/iterables64.go (99%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/parallel64.go (95%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/roaring64.go (98%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/roaringarray64.go (87%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaring64/util.go (92%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/roaringarray.go (91%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/runcontainer.go (90%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/serialization.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/serialization_generic.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/serialization_littleendian.go (99%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/serializationfuzz.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/setutil.go (56%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/setutil_arm64.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/setutil_arm64.s (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/setutil_generic.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/shortiterator.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/smat.go (100%) rename vendor/github.com/RoaringBitmap/roaring/{ => v2}/util.go (97%) create mode 100644 vendor/github.com/bits-and-blooms/bitset/bitset_iter.go create mode 100644 vendor/github.com/bits-and-blooms/bitset/pext.gen.go delete mode 100644 vendor/github.com/bits-and-blooms/bitset/popcnt_19.go delete mode 100644 vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go delete mode 100644 vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s delete mode 100644 vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go create mode 100644 vendor/github.com/bits-and-blooms/bitset/select.go delete mode 100644 vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go delete mode 100644 vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go create mode 100644 vendor/github.com/blevesearch/bleve/v2/document/field_synonym.go create mode 100644 vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_str.go create mode 100644 
vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_thes.go create mode 100644 vendor/github.com/blevesearch/bleve/v2/mapping/synonym.go create mode 100644 vendor/github.com/blevesearch/bleve/v2/registry/synonym_source.go create mode 100644 vendor/github.com/blevesearch/zapx/v16/section_synonym_index.go create mode 100644 vendor/github.com/blevesearch/zapx/v16/synonym_cache.go create mode 100644 vendor/github.com/blevesearch/zapx/v16/synonym_posting.go create mode 100644 vendor/github.com/blevesearch/zapx/v16/thesaurus.go diff --git a/go.mod b/go.mod index 7e5d758b7..2d9eca5db 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/Nerzal/gocloak/v13 v13.9.0 github.com/bbalet/stopwords v1.0.0 github.com/beevik/etree v1.5.0 - github.com/blevesearch/bleve/v2 v2.4.4 + github.com/blevesearch/bleve/v2 v2.5.0 github.com/cenkalti/backoff v2.2.1+incompatible github.com/coreos/go-oidc/v3 v3.14.1 github.com/cs3org/go-cs3apis v0.0.0-20241105092511-3ad35d174fc1 @@ -121,7 +121,7 @@ require ( github.com/Masterminds/sprig v2.22.0+incompatible // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.1.5 // indirect - github.com/RoaringBitmap/roaring v1.9.3 // indirect + github.com/RoaringBitmap/roaring/v2 v2.4.5 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect github.com/ajg/form v1.5.1 // indirect github.com/alexedwards/argon2id v1.0.0 // indirect @@ -131,24 +131,24 @@ require ( github.com/aws/aws-sdk-go v1.55.6 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bitly/go-simplejson v0.5.0 // indirect - github.com/bits-and-blooms/bitset v1.12.0 // indirect - github.com/blevesearch/bleve_index_api v1.1.12 // indirect + github.com/bits-and-blooms/bitset v1.22.0 // indirect + github.com/blevesearch/bleve_index_api v1.2.7 // indirect github.com/blevesearch/geo v0.1.20 // indirect - github.com/blevesearch/go-faiss v1.0.24 // indirect + github.com/blevesearch/go-faiss v1.0.25 // indirect github.com/blevesearch/go-porterstemmer v1.0.3 // indirect github.com/blevesearch/gtreap v0.1.1 // indirect github.com/blevesearch/mmap-go v1.0.4 // indirect - github.com/blevesearch/scorch_segment_api/v2 v2.2.16 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.3.9 // indirect github.com/blevesearch/segment v0.9.1 // indirect github.com/blevesearch/snowballstem v0.9.0 // indirect github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect - github.com/blevesearch/vellum v1.0.10 // indirect - github.com/blevesearch/zapx/v11 v11.3.10 // indirect - github.com/blevesearch/zapx/v12 v12.3.10 // indirect - github.com/blevesearch/zapx/v13 v13.3.10 // indirect - github.com/blevesearch/zapx/v14 v14.3.10 // indirect - github.com/blevesearch/zapx/v15 v15.3.16 // indirect - github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect + github.com/blevesearch/vellum v1.1.0 // indirect + github.com/blevesearch/zapx/v11 v11.4.1 // indirect + github.com/blevesearch/zapx/v12 v12.4.1 // indirect + github.com/blevesearch/zapx/v13 v13.4.1 // indirect + github.com/blevesearch/zapx/v14 v14.4.1 // indirect + github.com/blevesearch/zapx/v15 v15.4.1 // indirect + github.com/blevesearch/zapx/v16 v16.2.2 // indirect github.com/bluele/gcache v0.0.2 // indirect github.com/bombsimon/logrusr/v3 v3.1.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect diff --git a/go.sum b/go.sum index 397d83a4e..2c6bc6a78 100644 --- a/go.sum +++ b/go.sum @@ -87,8 +87,8 @@ github.com/OneOfOne/xxhash v1.2.2/go.mod 
h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAE github.com/OpenDNS/vegadns2client v0.0.0-20180418235048-a3fa4a771d87/go.mod h1:iGLljf5n9GjT6kc0HBvyI1nOKnGQbNB66VzSNbK5iks= github.com/ProtonMail/go-crypto v1.1.5 h1:eoAQfK2dwL+tFSFpr7TbOaPNUbPiJj4fLYwwGE1FQO4= github.com/ProtonMail/go-crypto v1.1.5/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE= -github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4S2OByM= -github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= +github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2JW2gggRdg= +github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0= github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= @@ -142,45 +142,46 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bitly/go-simplejson v0.5.0 h1:6IH+V8/tVMab511d5bn4M7EwGXZf9Hj6i2xSwkNEM+Y= github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA= -github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= +github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blevesearch/bleve/v2 v2.4.4 h1:RwwLGjUm54SwyyykbrZs4vc1qjzYic4ZnAnY9TwNl60= -github.com/blevesearch/bleve/v2 v2.4.4/go.mod h1:fa2Eo6DP7JR+dMFpQe+WiZXINKSunh7WBtlDGbolKXk= -github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY= -github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/bleve/v2 v2.5.0 h1:HzYqBy/5/M9Ul9ESEmXzN/3Jl7YpmWBdHM/+zzv/3k4= +github.com/blevesearch/bleve/v2 v2.5.0/go.mod h1:PcJzTPnEynO15dCf9isxOga7YFRa/cMSsbnRwnszXUk= +github.com/blevesearch/bleve_index_api v1.2.7 h1:c8r9vmbaYQroAMSGag7zq5gEVPiuXrUQDqfnj7uYZSY= +github.com/blevesearch/bleve_index_api v1.2.7/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM= github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w= -github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI= -github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U= +github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= 
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY= -github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0= +github.com/blevesearch/scorch_segment_api/v2 v2.3.9 h1:X6nJXnNHl7nasXW+U6y2Ns2Aw8F9STszkYkyBfQ+p0o= +github.com/blevesearch/scorch_segment_api/v2 v2.3.9/go.mod h1:IrzspZlVjhf4X29oJiEhBxEteTqOY9RlYlk1lCmYHr4= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= -github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI= -github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k= -github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk= -github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ= -github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s= -github.com/blevesearch/zapx/v12 v12.3.10/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs= -github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIqMGahDE8= -github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk= -github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU= -github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= -github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE= -github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= -github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5YgrzCeK3M1QwvZIpxYhChkXp7/L0RhDYsxXoE= -github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI= +github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= +github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= +github.com/blevesearch/zapx/v11 v11.4.1 h1:qFCPlFbsEdwbbckJkysptSQOsHn4s6ZOHL5GMAIAVHA= +github.com/blevesearch/zapx/v11 v11.4.1/go.mod h1:qNOGxIqdPC1MXauJCD9HBG487PxviTUUbmChFOAosGs= +github.com/blevesearch/zapx/v12 v12.4.1 h1:K77bhypII60a4v8mwvav7r4IxWA8qxhNjgF9xGdb9eQ= +github.com/blevesearch/zapx/v12 v12.4.1/go.mod h1:QRPrlPOzAxBNMI0MkgdD+xsTqx65zbuPr3Ko4Re49II= +github.com/blevesearch/zapx/v13 v13.4.1 h1:EnkEMZFUK0lsW/jOJJF2xOcp+W8TjEsyeN5BeAZEYYE= +github.com/blevesearch/zapx/v13 v13.4.1/go.mod h1:e6duBMlCvgbH9rkzNMnUa9hRI9F7ri2BRcHfphcmGn8= +github.com/blevesearch/zapx/v14 v14.4.1 h1:G47kGCshknBZzZAtjcnIAMn3oNx8XBLxp8DMq18ogyE= +github.com/blevesearch/zapx/v14 v14.4.1/go.mod h1:O7sDxiaL2r2PnCXbhh1Bvm7b4sP+jp4unE9DDPWGoms= +github.com/blevesearch/zapx/v15 v15.4.1 h1:B5IoTMUCEzFdc9FSQbhVOxAY+BO17c05866fNruiI7g= 
+github.com/blevesearch/zapx/v15 v15.4.1/go.mod h1:b/MreHjYeQoLjyY2+UaM0hGZZUajEbE0xhnr1A2/Q6Y= +github.com/blevesearch/zapx/v16 v16.2.2 h1:MifKJVRTEhMTgSlle2bDRTb39BGc9jXFRLPZc6r0Rzk= +github.com/blevesearch/zapx/v16 v16.2.2/go.mod h1:B9Pk4G1CqtErgQV9DyCSA9Lb7WZe4olYfGw7fVDZ4sk= github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw= github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0= github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY= @@ -1663,6 +1664,7 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= diff --git a/vendor/github.com/RoaringBitmap/roaring/clz.go b/vendor/github.com/RoaringBitmap/roaring/clz.go deleted file mode 100644 index ee0ebc6c9..000000000 --- a/vendor/github.com/RoaringBitmap/roaring/clz.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -// "go1.9", from Go version 1.9 onward -// See https://golang.org/pkg/go/build/#hdr-Build_Constraints - -package roaring - -import "math/bits" - -func countLeadingZeros(x uint64) int { - return bits.LeadingZeros64(x) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/ctz.go b/vendor/github.com/RoaringBitmap/roaring/ctz.go deleted file mode 100644 index fbcfe9128..000000000 --- a/vendor/github.com/RoaringBitmap/roaring/ctz.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -// "go1.9", from Go version 1.9 onward -// See https://golang.org/pkg/go/build/#hdr-Build_Constraints - -package roaring - -import "math/bits" - -func countTrailingZeros(x uint64) int { - return bits.TrailingZeros64(x) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/.gitignore b/vendor/github.com/RoaringBitmap/roaring/v2/.gitignore similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/.gitignore rename to vendor/github.com/RoaringBitmap/roaring/v2/.gitignore diff --git a/vendor/github.com/RoaringBitmap/roaring/.gitmodules b/vendor/github.com/RoaringBitmap/roaring/v2/.gitmodules similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/.gitmodules rename to vendor/github.com/RoaringBitmap/roaring/v2/.gitmodules diff --git a/vendor/github.com/RoaringBitmap/roaring/AUTHORS b/vendor/github.com/RoaringBitmap/roaring/v2/AUTHORS similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/AUTHORS rename to vendor/github.com/RoaringBitmap/roaring/v2/AUTHORS diff --git a/vendor/github.com/RoaringBitmap/roaring/CONTRIBUTORS b/vendor/github.com/RoaringBitmap/roaring/v2/CONTRIBUTORS similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/CONTRIBUTORS rename to vendor/github.com/RoaringBitmap/roaring/v2/CONTRIBUTORS diff --git a/vendor/github.com/RoaringBitmap/roaring/LICENSE b/vendor/github.com/RoaringBitmap/roaring/v2/LICENSE similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/LICENSE rename to 
vendor/github.com/RoaringBitmap/roaring/v2/LICENSE diff --git a/vendor/github.com/RoaringBitmap/roaring/LICENSE-2.0.txt b/vendor/github.com/RoaringBitmap/roaring/v2/LICENSE-2.0.txt similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/LICENSE-2.0.txt rename to vendor/github.com/RoaringBitmap/roaring/v2/LICENSE-2.0.txt diff --git a/vendor/github.com/RoaringBitmap/roaring/README.md b/vendor/github.com/RoaringBitmap/roaring/v2/README.md similarity index 94% rename from vendor/github.com/RoaringBitmap/roaring/README.md rename to vendor/github.com/RoaringBitmap/roaring/v2/README.md index acd3058b7..b7e9684af 100644 --- a/vendor/github.com/RoaringBitmap/roaring/README.md +++ b/vendor/github.com/RoaringBitmap/roaring/v2/README.md @@ -10,7 +10,7 @@ This is a go version of the Roaring bitmap data structure. Roaring bitmaps are used by several major systems such as [Apache Lucene][lucene] and derivative systems such as [Solr][solr] and -[Elasticsearch][elasticsearch], [Apache Druid (Incubating)][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas], [Apache Spark][spark], [OpenSearchServer][opensearchserver], [anacrolix/torrent][anacrolix/torrent], [Whoosh][whoosh], [Pilosa][pilosa], [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing. +[Elasticsearch][elasticsearch], [Apache Druid (Incubating)][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas], [Apache Spark][spark], [OpenSearchServer][opensearchserver], [anacrolix/torrent][anacrolix/torrent], [Whoosh][whoosh], [Redpanda](https://github.com/redpanda-data/redpanda), [Pilosa][pilosa], [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing. [lucene]: https://lucene.apache.org/ [solr]: https://lucene.apache.org/solr/ @@ -163,7 +163,7 @@ they include - github.com/philhofer/fwd - github.com/jtolds/gls -Note that the smat library requires Go 1.6 or better. +Note that the smat library requires Go 1.15 or better. #### Installation @@ -188,7 +188,7 @@ package main import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "bytes" ) @@ -249,15 +249,20 @@ consider the following sample of code: buf := new(bytes.Buffer) size,err:=rb.WriteTo(buf) if err != nil { - t.Errorf("Failed writing") + fmt.Println("Failed writing") // return or panic } newrb:= New() size,err=newrb.ReadFrom(buf) if err != nil { - t.Errorf("Failed reading") + fmt.Println("Failed reading") // return or panic + } + // if buf is an untrusted source, you should validate the result + // (this adds a bit of complexity but it is necessary for security) + if newrb.Validate() != nil { + fmt.Println("Failed validation") // return or panic } if ! 
rb.Equals(newrb) { - t.Errorf("Cannot retrieve serialized version") + fmt.Println("Cannot retrieve serialized version") } ``` @@ -280,7 +285,7 @@ package main import ( "fmt" - "github.com/RoaringBitmap/roaring/roaring64" + "github.com/RoaringBitmap/roaring/v2/roaring64" "bytes" ) @@ -356,7 +361,7 @@ https://coveralls.io/github/RoaringBitmap/roaring?branch=master Type go test -bench Benchmark -run - - + To run benchmarks on [Real Roaring Datasets](https://github.com/RoaringBitmap/real-roaring-datasets) run the following: @@ -369,9 +374,8 @@ BENCH_REAL_DATA=1 go test -bench BenchmarkRealData -run - You can use roaring with gore: -- go get -u github.com/motemen/gore +- go install github.com/x-motemen/gore/cmd/gore@latest - Make sure that ``$GOPATH/bin`` is in your ``$PATH``. -- go get github.com/RoaringBitmap/roaring ```go $ gore diff --git a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go similarity index 83% rename from vendor/github.com/RoaringBitmap/roaring/arraycontainer.go rename to vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go index 80fa676ef..2e75c5ad4 100644 --- a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go @@ -1,6 +1,7 @@ package roaring import ( + "errors" "fmt" ) @@ -8,6 +9,11 @@ type arrayContainer struct { content []uint16 } +var ( + ErrArrayIncorrectSort = errors.New("incorrectly sorted array") + ErrArrayInvalidSize = errors.New("invalid array size") +) + func (ac *arrayContainer) String() string { s := "{" for it := ac.getShortIterator(); it.hasNext(); { @@ -26,8 +32,7 @@ func (ac *arrayContainer) fillLeastSignificant16bits(x []uint32, i int, mask uin _ = x[len(ac.content)-1+i] _ = ac.content[len(ac.content)-1] for k := 0; k < len(ac.content); k++ { - x[k+i] = - uint32(ac.content[k]) | mask + x[k+i] = uint32(ac.content[k]) | mask } return i + len(ac.content) } @@ -60,10 +65,26 @@ func (ac *arrayContainer) minimum() uint16 { return ac.content[0] // assume not empty } +func (ac *arrayContainer) safeMinimum() (uint16, error) { + if len(ac.content) == 0 { + return 0, errors.New("empty array") + } + + return ac.minimum(), nil +} + func (ac *arrayContainer) maximum() uint16 { return ac.content[len(ac.content)-1] // assume not empty } +func (ac *arrayContainer) safeMaximum() (uint16, error) { + if len(ac.content) == 0 { + return 0, errors.New("empty array") + } + + return ac.maximum(), nil +} + func (ac *arrayContainer) getSizeInBytes() int { return ac.getCardinality() * 2 } @@ -168,7 +189,7 @@ func (ac *arrayContainer) notClose(firstOfRange, lastOfRange int) container { return ac.toBitmapContainer().not(firstOfRange, lastOfRange+1) } answer := newArrayContainer() - answer.content = make([]uint16, newCardinality, newCardinality) //a hack for sure + answer.content = make([]uint16, newCardinality, newCardinality) // a hack for sure copy(answer.content, ac.content[:startIndex]) outPos := startIndex @@ -194,11 +215,9 @@ func (ac *arrayContainer) notClose(firstOfRange, lastOfRange int) container { } answer.content = answer.content[:newCardinality] return answer - } func (ac *arrayContainer) equals(o container) bool { - srb, ok := o.(*arrayContainer) if ok { // Check if the containers are the same object. 
@@ -239,8 +258,8 @@ func (ac *arrayContainer) toBitmapContainer() *bitmapContainer { bc := newBitmapContainer() bc.loadData(ac) return bc - } + func (ac *arrayContainer) iadd(x uint16) (wasNew bool) { // Special case adding to the end of the container. l := len(ac.content) @@ -352,7 +371,6 @@ func (ac *arrayContainer) ior(a container) container { return ac.iorArray(x) case *bitmapContainer: return a.(*bitmapContainer).orArray(ac) - //return ac.iorBitmap(x) // note: this does not make sense case *runContainer16: if x.isFull() { return x.clone() @@ -589,7 +607,6 @@ func (ac *arrayContainer) iandBitmap(bc *bitmapContainer) container { } ac.content = ac.content[:pos] return ac - } func (ac *arrayContainer) xor(a container) container { @@ -630,7 +647,6 @@ func (ac *arrayContainer) xorArray(value2 *arrayContainer) container { length := exclusiveUnion2by2(value1.content, value2.content, answer.content) answer.content = answer.content[:length] return answer - } func (ac *arrayContainer) andNot(a container) container { @@ -822,7 +838,6 @@ func (ac *arrayContainer) inotClose(firstOfRange, lastOfRange int) container { } else { // no expansion needed ac.negateRange(buffer, startIndex, lastIndex, firstOfRange, lastOfRange+1) if cardinalityChange < 0 { - for i := startIndex + newValuesInRange; i < newCardinality; i++ { ac.content[i] = ac.content[i-cardinalityChange] } @@ -915,7 +930,6 @@ func (ac *arrayContainer) rank(x uint16) int { return answer + 1 } return -answer - 1 - } func (ac *arrayContainer) selectInt(x uint16) int { @@ -971,6 +985,179 @@ func (ac *arrayContainer) realloc(size int) { } } +// previousValue returns either the target if found or the previous smaller present value. +// If the target is out of bounds a -1 is returned. +// Ex: target=4 ac=[2,3,4,6,7] returns 4 +// Ex: target=5 ac=[2,3,4,6,7] returns 4 +// Ex: target=6 ac=[2,3,4,6,7] returns 6 +// Ex: target=8 ac=[2,3,4,6,7] returns 7 +// Ex: target=1 ac=[2,3,4,6,7] returns -1 +// Ex: target=0 ac=[2,3,4,6,7] returns -1 +func (ac *arrayContainer) previousValue(target uint16) int { + result := binarySearchUntil(ac.content, target) + + if result.index == len(ac.content) { + return int(ac.maximum()) + } + + if result.outOfBounds() { + return -1 + } + + return int(result.value) +} + +// previousAbsentValue returns either the target if not found or the next larger missing value. +// If the target is out of bounds a -1 is returned +// Ex: target=4 ac=[1,2,3,4,6,7] returns 0 +// Ex: target=5 ac=[1,2,3,4,6,7] returns 5 +// Ex: target=6 ac=[1,2,3,4,6,7] returns 5 +// Ex: target=8 ac=[1,2,3,4,6,7] returns 8 +func (ac *arrayContainer) previousAbsentValue(target uint16) int { + cardinality := len(ac.content) + + if cardinality == 0 { + return int(target) + } + + if target > ac.maximum() { + return int(target) + } + + result := binarySearchPast(ac.content, target) + + if result.notFound() { + return int(target) + } + + // If the target was found at index 1, then the next value down must be result.value-1 + if result.index == 1 { + if ac.minimum() != result.value-1 { + return int(result.value - 1) + } + } + + low := -1 + high := result.index + + // This uses the pigeon-hole principle. + // the if statement compares the difference in indices vs + // the difference in values. 
Suppose mid = 10 and result.index = 5 + // with ac.content[mid] = 100 and target = 10 + // then we have 5 slots for values but we need to fit in 90 values + // so some of the values must be missing + for low+1 < high { + midIndex := (high + low) >> 1 + indexDifference := result.index - midIndex + valueDifference := target - ac.content[midIndex] + if indexDifference < int(valueDifference) { + low = midIndex + } else { + high = midIndex + } + } + + if high == 0 { + return int(ac.minimum()) - 1 + } + + return int(ac.content[high] - 1) +} + +// nextAbsentValue returns either the target if not found or the next larger missing value. +// If the target is out of bounds a -1 is returned +// Ex: target=4 ac=[1,2,3,4,6,7] returns 5 +// Ex: target=5 ac=[1,2,3,4,6,7] returns 5 +// Ex: target=0 ac=[1,2,3,4,6,7] returns 0 +// Ex: target=8 ac=[1,2,3,4,6,7] returns 8 +func (ac *arrayContainer) nextAbsentValue(target uint16) int { + cardinality := len(ac.content) + + if cardinality == 0 { + return int(target) + } + if target < ac.minimum() { + return int(target) + } + + result := binarySearchPast(ac.content, target) + + if result.notFound() { + return int(target) + } + + if result.index == cardinality-2 { + if ac.maximum() != result.value+1 { + return int(result.value + 1) + } + } + + low := result.index + high := len(ac.content) + + // This uses the pigeon-hole principle. + // the if statement compares the difference in indices vs + // the difference in values. Suppose mid = 10 and result.index = 5 + // with ac.content[mid] = 100 and target = 10 + // then we have 5 slots for values but we need to fit in 90 values + // so some of the values must be missing + for low+1 < high { + midIndex := (high + low) >> 1 + indexDifference := midIndex - result.index + valueDifference := ac.content[midIndex] - target + if indexDifference < int(valueDifference) { + high = midIndex + } else { + low = midIndex + } + } + + if low == cardinality-1 { + return int(ac.content[cardinality-1] + 1) + } + + return int(ac.content[low] + 1) +} + +// nextValue returns either the target if found or the next larger value. 
+// if the target is out of bounds a -1 is returned +// +// Ex: target=4 ac=[1,2,3,4,6,7] returns 4 +// Ex: target=5 ac=[1,2,3,4,6,7] returns 6 +// Ex: target=6 ac=[1,2,3,4,6,7] returns 6 +// Ex: target=0 ac=[1,2,3,4,6,7] returns 1 +// Ex: target=100 ac=[1,2,3,4,6,7] returns -1 +func (ac *arrayContainer) nextValue(target uint16) int { + cardinality := len(ac.content) + if cardinality == 0 { + return -1 + } + + //if target < ac.minimum() { + // return -1 + //} + //if target > ac.maximum() { + // return -1 + // } + + result := binarySearchUntil(ac.content, target) + if result.exactMatch { + return int(result.value) + } + + if !result.exactMatch && result.index == -1 { + return int(ac.content[0]) + } + if result.outOfBounds() { + return -1 + } + + if result.index < len(ac.content)-1 { + return int(ac.content[result.index+1]) + } + return -1 +} + func newArrayContainer() *arrayContainer { p := new(arrayContainer) return p @@ -1039,15 +1226,12 @@ func (ac *arrayContainer) numberOfRuns() (nr int) { // convert to run or array *if needed* func (ac *arrayContainer) toEfficientContainer() container { - numRuns := ac.numberOfRuns() - sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) sizeAsBitmapContainer := bitmapContainerSizeInBytes() card := ac.getCardinality() sizeAsArrayContainer := arrayContainerSizeInBytes(card) - - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { + if sizeAsRunContainer < minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { return newRunContainer16FromArray(ac) } if card <= arrayDefaultMaxSize { @@ -1099,3 +1283,28 @@ func (ac *arrayContainer) addOffset(x uint16) (container, container) { return low, high } + +// validate checks cardinality and sort order of the array container +func (ac *arrayContainer) validate() error { + cardinality := ac.getCardinality() + + if cardinality <= 0 { + return ErrArrayInvalidSize + } + + if cardinality > arrayDefaultMaxSize { + return ErrArrayInvalidSize + } + + previous := ac.content[0] + for i := 1; i < len(ac.content); i++ { + next := ac.content[i] + if previous > next { + return ErrArrayIncorrectSort + } + previous = next + + } + + return nil +} diff --git a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go similarity index 83% rename from vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go rename to vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go index bf08bfca3..10bc0f1c7 100644 --- a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go @@ -1,7 +1,9 @@ package roaring import ( + "errors" "fmt" + "math/bits" "unsafe" ) @@ -56,6 +58,17 @@ func (bc *bitmapContainer) minimum() uint16 { return MaxUint16 } +func (bc *bitmapContainer) safeMinimum() (uint16, error) { + if len(bc.bitmap) == 0 { + return 0, errors.New("Empty bitmap") + } + val := bc.minimum() + if val == MaxUint16 { + return 0, errors.New("Empty bitmap") + } + return val, nil +} + // i should be non-zero func clz(i uint64) int { n := 1 @@ -94,6 +107,17 @@ func (bc *bitmapContainer) maximum() uint16 { return uint16(0) } +func (bc *bitmapContainer) safeMaximum() (uint16, error) { + if len(bc.bitmap) == 0 { + return 0, errors.New("Empty bitmap") + } + val := bc.maximum() + if val == uint16(0) { + return 0, errors.New("Empty bitmap") + } + return val, nil +} + func (bc *bitmapContainer) iterate(cb func(x uint16) bool) bool { iterator := bitmapContainerShortIterator{bc, 
bc.NextSetBit(0)} @@ -116,6 +140,7 @@ func (bcsi *bitmapContainerShortIterator) next() uint16 { bcsi.i = bcsi.ptr.NextSetBit(uint(bcsi.i) + 1) return uint16(j) } + func (bcsi *bitmapContainerShortIterator) hasNext() bool { return bcsi.i >= 0 } @@ -201,6 +226,7 @@ func (bcmi *bitmapContainerManyIterator) nextMany(hs uint32, buf []uint32) int { return n } +// nextMany64 returns the number of values added to the buffer func (bcmi *bitmapContainerManyIterator) nextMany64(hs uint64, buf []uint64) int { n := 0 base := bcmi.base @@ -237,11 +263,10 @@ func (bc *bitmapContainer) getManyIterator() manyIterable { } func (bc *bitmapContainer) getSizeInBytes() int { - return len(bc.bitmap) * 8 // + bcBaseBytes + return len(bc.bitmap) * 8 } func (bc *bitmapContainer) serializedSizeInBytes() int { - //return bc.Msgsize()// NOO! This breaks GetSerializedSizeInBytes return len(bc.bitmap) * 8 } @@ -313,6 +338,7 @@ func (bc *bitmapContainer) iaddReturnMinimized(i uint16) container { return bc } +// iadd adds the arg i, returning true if not already present func (bc *bitmapContainer) iadd(i uint16) bool { x := int(i) previous := bc.bitmap[x/64] @@ -441,7 +467,7 @@ func (bc *bitmapContainer) ior(a container) container { if bc.isFull() { return newRunContainer16Range(0, MaxUint16) } - //bc.computeCardinality() + // bc.computeCardinality() return bc } panic(fmt.Errorf("unsupported container type %T", a)) @@ -516,7 +542,7 @@ func (bc *bitmapContainer) orArray(value2 *arrayContainer) container { } func (bc *bitmapContainer) orArrayCardinality(value2 *arrayContainer) int { - answer := 0 + answer := bc.getCardinality() c := value2.getCardinality() for k := 0; k < c; k++ { // branchless: @@ -819,9 +845,8 @@ func (bc *bitmapContainer) andBitmap(value2 *bitmapContainer) container { } ac := newArrayContainerSize(newcardinality) fillArrayAND(ac.content, bc.bitmap, value2.bitmap) - ac.content = ac.content[:newcardinality] //not sure why i need this + ac.content = ac.content[:newcardinality] return ac - } func (bc *bitmapContainer) intersectsArray(value2 *arrayContainer) bool { @@ -842,7 +867,6 @@ func (bc *bitmapContainer) intersectsBitmap(value2 *bitmapContainer) bool { } } return false - } func (bc *bitmapContainer) iandBitmap(value2 *bitmapContainer) container { @@ -995,7 +1019,7 @@ func (bc *bitmapContainer) iandNotBitmapSurely(value2 *bitmapContainer) containe return bc } -func (bc *bitmapContainer) contains(i uint16) bool { //testbit +func (bc *bitmapContainer) contains(i uint16) bool { // testbit x := uint(i) w := bc.bitmap[x>>6] mask := uint64(1) << (x & 63) @@ -1051,7 +1075,7 @@ func (bc *bitmapContainer) toArrayContainer() *arrayContainer { } func (bc *bitmapContainer) fillArray(container []uint16) { - //TODO: rewrite in assembly + // TODO: rewrite in assembly pos := 0 base := 0 for k := 0; k < len(bc.bitmap); k++ { @@ -1066,6 +1090,7 @@ func (bc *bitmapContainer) fillArray(container []uint16) { } } +// NextSetBit returns the next set bit e.g the next int packed into the bitmaparray func (bc *bitmapContainer) NextSetBit(i uint) int { var ( x = i / 64 @@ -1088,12 +1113,22 @@ func (bc *bitmapContainer) NextSetBit(i uint) int { return -1 } +// PrevSetBit returns the previous set bit e.g the previous int packed into the bitmaparray func (bc *bitmapContainer) PrevSetBit(i int) int { if i < 0 { return -1 } - x := i / 64 - if x >= len(bc.bitmap) { + + return bc.uPrevSetBit(uint(i)) +} + +func (bc *bitmapContainer) uPrevSetBit(i uint) int { + var ( + x = i >> 6 + length = uint(len(bc.bitmap)) + ) + + if x >= length { 
return -1 } @@ -1103,12 +1138,16 @@ func (bc *bitmapContainer) PrevSetBit(i int) int { w = w << uint(63-b) if w != 0 { - return i - countLeadingZeros(w) + return int(i) - countLeadingZeros(w) } + orig := x x-- - for ; x >= 0; x-- { + if x > orig { + return -1 + } + for ; x < orig; x-- { if bc.bitmap[x] != 0 { - return (x * 64) + 63 - countLeadingZeros(bc.bitmap[x]) + return int((x*64)+63) - countLeadingZeros(bc.bitmap[x]) } } return -1 @@ -1141,7 +1180,6 @@ func (bc *bitmapContainer) numberOfRuns() int { // convert to run or array *if needed* func (bc *bitmapContainer) toEfficientContainer() container { - numRuns := bc.numberOfRuns() sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) @@ -1149,7 +1187,7 @@ func (bc *bitmapContainer) toEfficientContainer() container { card := bc.getCardinality() sizeAsArrayContainer := arrayContainerSizeInBytes(card) - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { + if sizeAsRunContainer < minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { return newRunContainer16FromBitmapContainer(bc) } if card <= arrayDefaultMaxSize { @@ -1159,7 +1197,6 @@ func (bc *bitmapContainer) toEfficientContainer() container { } func newBitmapContainerFromRun(rc *runContainer16) *bitmapContainer { - if len(rc.iv) == 1 { return newBitmapContainerwithRange(int(rc.iv[0].start), int(rc.iv[0].last())) } @@ -1169,7 +1206,7 @@ func newBitmapContainerFromRun(rc *runContainer16) *bitmapContainer { setBitmapRange(bc.bitmap, int(rc.iv[i].start), int(rc.iv[i].last())+1) bc.cardinality += int(rc.iv[i].last()) + 1 - int(rc.iv[i].start) } - //bc.computeCardinality() + // bc.computeCardinality() return bc } @@ -1234,3 +1271,171 @@ func (bc *bitmapContainer) addOffset(x uint16) (container, container) { return low, high } + +// nextValue returns either the `target` if found or the next largest value. +// if the target is out of bounds a -1 is returned +// +// Example : +// Suppose the bitmap container represents the following slice +// [1,2,10,11,100] +// target=0 returns 1 +// target=1 returns 1 +// target=10 returns 10 +// target=90 returns 100 +func (bc *bitmapContainer) nextValue(target uint16) int { + if bc.cardinality == 0 { + return -1 + } + + return bc.NextSetBit(uint(target)) +} + +// nextAbsentValue returns the next absent value. +// if the target is out of bounds a -1 is returned +func (bc *bitmapContainer) nextAbsentValue(target uint16) int { + if bc.cardinality == 0 { + return -1 + } + + var ( + x = target >> 6 + length = uint(len(bc.bitmap)) + ) + if uint(x) >= length { + return -1 + } + w := bc.bitmap[x] + w = w >> uint(target%64) + if w == 0 { + return int(target) + } + + // Check if all 1's + // if statement - we skip the if we have all ones [1,1,1,1...1] + if ^w != 0 { + + if countTrailingZeros(w) > 0 { + // we have something like [X,Y,Z, 0,0,0]. This means the target bit is zero + return int(target) + } + + // other wise something like [X,Y,0,1,1,1..1], where x and y can be either 1 or 0. + + trailing := countTrailingOnes(w) + return int(target) + trailing + + } + x++ + for ; uint(x) < length; x++ { + if bc.bitmap[x] == 0 { + return int(x * 64) + } + if ^bc.bitmap[x] != 0 { + trailing := countTrailingOnes(bc.bitmap[x]) + return int(x*64) + trailing + } + + } + return -1 +} + +// previousValue returns either the `target` if found or the previous largest value. 
+// if the target is out of bounds a -1 is returned + +// Example : +// Suppose the bitmap container represents the following slice +// [1,2,10,11,100] +// target=0 returns -1 +// target=1 returns -1 +// target=2 returns -1 +// target=10 returns 9 +// target=50 returns 10 +// target=100 returns 99 +func (bc *bitmapContainer) previousValue(target uint16) int { + if bc.cardinality == 0 { + return -1 + } + return bc.uPrevSetBit(uint(target)) +} + +// previousAbsentValue returns the previous absent value. +func (bc *bitmapContainer) previousAbsentValue(target uint16) int { + if bc.cardinality == 0 { + return -1 + } + + var ( + x = target >> 6 + length = uint(len(bc.bitmap)) + ) + if uint(x) >= length { + return -1 + } + w := bc.bitmap[x] + shifted := w >> uint(target%64) + if shifted == 0 { + return int(target) + } + + // Check if all 1's + // if statement - we skip if we have all ones [1,1,1,1...1] as no value is absent + if ^shifted != 0 { + + if countTrailingZeros(shifted) > 0 { + // we have something like shifted=[X,Y,Z,..., 0,0,0]. This means the target bit is zero + return int(target) + } + + // The rotate will rotate the target bit into the leading position. + // We know the target bit is not zero because of the countTrailingZero check above + // We then shift the target bit out of the way. + // Assume an original structure like [X,Y,Z,..., Target, A, B,C...] + // shifted will be [X,Y,Z...Target] + // shiftedRotated will be [A,B,C....] + // If countLeadingZeros > 0 then A is zero, if not at least A is 1 return + // Else count the number of ones until a 0 + shiftedRotated := bits.RotateLeft64(w, int(64-uint(target%64))-1) << 1 + leadingZeros := countLeadingZeros(shiftedRotated) + if leadingZeros > 0 { + return int(target) - 1 + } + leadingOnes := countLeadingOnes(shiftedRotated) + if leadingOnes > 0 { + return int(target) - leadingOnes - 1 + } + + } + x++ + for ; uint(x) < length; x++ { + if bc.bitmap[x] == 0 { + return int(x * 64) + } + if ^bc.bitmap[x] != 0 { + trailing := countTrailingOnes(bc.bitmap[x]) + return int(x*64) + trailing + } + + } + return -1 +} + +// validate checks that the container cardinality is within bounds and matches the bitmap +func (bc *bitmapContainer) validate() error { + if bc.cardinality < arrayDefaultMaxSize { + return fmt.Errorf("bitmap container size was less than: %d", arrayDefaultMaxSize) + } + + if maxCapacity < len(bc.bitmap)*64 { + return fmt.Errorf("bitmap slice size %d exceeded max capacity %d", len(bc.bitmap)*64, maxCapacity) + } + + if bc.cardinality > maxCapacity { + return fmt.Errorf("bitmap container size was greater than: %d", maxCapacity) + } + + if bc.cardinality != int(popcntSlice(bc.bitmap)) { + return fmt.Errorf("bitmap container size %d did not match underlying slice length: %d", bc.cardinality, int(popcntSlice(bc.bitmap))) + } + + return nil +} diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/clz.go b/vendor/github.com/RoaringBitmap/roaring/v2/clz.go new file mode 100644 index 000000000..ff49ac893 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/v2/clz.go @@ -0,0 +1,19 @@ +//go:build go1.9 +// +build go1.9 + +// "go1.9", from Go version 1.9 onward +// See https://golang.org/pkg/go/build/#hdr-Build_Constraints + +package roaring + +import "math/bits" + +// countLeadingZeros returns the number of leading zero bits in x; the result is 64 for x == 0. +func countLeadingZeros(x uint64) int { + return bits.LeadingZeros64(x) +} + +// countLeadingOnes returns the number of leading one bits in x; the result is 0 for x == 0. 
+func countLeadingOnes(x uint64) int {
+	return bits.LeadingZeros64(^x)
+}
diff --git a/vendor/github.com/RoaringBitmap/roaring/clz_compat.go b/vendor/github.com/RoaringBitmap/roaring/v2/clz_compat.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/clz_compat.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/clz_compat.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/ctz.go b/vendor/github.com/RoaringBitmap/roaring/v2/ctz.go
new file mode 100644
index 000000000..b09bfbd1d
--- /dev/null
+++ b/vendor/github.com/RoaringBitmap/roaring/v2/ctz.go
@@ -0,0 +1,21 @@
+//go:build go1.9
+// +build go1.9
+
+// "go1.9", from Go version 1.9 onward
+// See https://golang.org/pkg/go/build/#hdr-Build_Constraints
+
+package roaring
+
+import "math/bits"
+
+// countTrailingZeros returns the number of trailing zero bits in x; the result is 64 for x == 0.
+func countTrailingZeros(x uint64) int {
+	return bits.TrailingZeros64(x)
+}
+
+// countTrailingOnes returns the number of trailing one bits in x.
+// The result is 64 for x == ^uint64(0), i.e. 18,446,744,073,709,551,615 (all bits set).
+// The result is 0 for x == 0.
+func countTrailingOnes(x uint64) int {
+	return bits.TrailingZeros64(^x)
+}
diff --git a/vendor/github.com/RoaringBitmap/roaring/ctz_compat.go b/vendor/github.com/RoaringBitmap/roaring/v2/ctz_compat.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/ctz_compat.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/ctz_compat.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/fastaggregation.go b/vendor/github.com/RoaringBitmap/roaring/v2/fastaggregation.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/fastaggregation.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/fastaggregation.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go b/vendor/github.com/RoaringBitmap/roaring/v2/internal/byte_input.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/internal/byte_input.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/internal/pools.go b/vendor/github.com/RoaringBitmap/roaring/v2/internal/pools.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/internal/pools.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/internal/pools.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/manyiterator.go b/vendor/github.com/RoaringBitmap/roaring/v2/manyiterator.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/manyiterator.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/manyiterator.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/parallel.go b/vendor/github.com/RoaringBitmap/roaring/v2/parallel.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/parallel.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/parallel.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt.go b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt.go
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/popcnt.go
rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt.go
diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_amd64.s b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt_amd64.s
similarity index 100%
rename from vendor/github.com/RoaringBitmap/roaring/popcnt_amd64.s
rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt_amd64.s
diff --git
a/vendor/github.com/RoaringBitmap/roaring/popcnt_asm.go b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt_asm.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/popcnt_asm.go rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt_asm.go diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_compat.go b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt_compat.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/popcnt_compat.go rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt_compat.go diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_generic.go b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt_generic.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/popcnt_generic.go rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt_generic.go diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_slices.go b/vendor/github.com/RoaringBitmap/roaring/v2/popcnt_slices.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/popcnt_slices.go rename to vendor/github.com/RoaringBitmap/roaring/v2/popcnt_slices.go diff --git a/vendor/github.com/RoaringBitmap/roaring/priorityqueue.go b/vendor/github.com/RoaringBitmap/roaring/v2/priorityqueue.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/priorityqueue.go rename to vendor/github.com/RoaringBitmap/roaring/v2/priorityqueue.go diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go similarity index 88% rename from vendor/github.com/RoaringBitmap/roaring/roaring.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring.go index a31cdbd9e..9972a51e2 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go @@ -12,7 +12,7 @@ import ( "io" "strconv" - "github.com/RoaringBitmap/roaring/internal" + "github.com/RoaringBitmap/roaring/v2/internal" "github.com/bits-and-blooms/bitset" ) @@ -26,7 +26,6 @@ func (rb *Bitmap) ToBase64() (string, error) { buf := new(bytes.Buffer) _, err := rb.WriteTo(buf) return base64.StdEncoding.EncodeToString(buf.Bytes()), err - } // FromBase64 deserializes a bitmap from Base64 @@ -54,10 +53,12 @@ func (rb *Bitmap) ToBytes() ([]byte, error) { return rb.highlowcontainer.toBytes() } -const wordSize = uint64(64) -const log2WordSize = uint64(6) -const capacity = ^uint64(0) -const bitmapContainerSize = (1 << 16) / 64 // bitmap size in words +const ( + wordSize = uint64(64) + log2WordSize = uint64(6) + capacity = ^uint64(0) + bitmapContainerSize = (1 << 16) / 64 // bitmap size in words +) // DenseSize returns the size of the bitmap when stored as a dense bitmap. func (rb *Bitmap) DenseSize() uint64 { @@ -276,14 +277,19 @@ func (rb *Bitmap) Checksum() uint64 { return hash } -// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy. +// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy +// (for advanced users only, you must be an expert Go programmer!). +// E.g., you can use this method to read a serialized bitmap from a memory-mapped file written out +// with the WriteTo method. +// The format specification is +// https://github.com/RoaringBitmap/RoaringFormatSpec // It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap. 
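+// Mutating or reclaiming the buffer while the bitmap is still in use leads to undefined behavior.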
 // This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
 // The containers in the resulting bitmap are immutable containers tied to the provided byte array and they rely on
 // copy-on-write which means that modifying them creates copies. Thus FromUnsafeBytes is more likely to be appropriate for read-only use cases,
 // when the resulting bitmap can be considered immutable.
 //
-// See also the FromBuffer function.
+// See also the FromBuffer function. We recommend benchmarking both functions to determine which one is more suitable for your use case.
 // See https://github.com/RoaringBitmap/roaring/pull/395 for more details.
 func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
 	stream := internal.NewByteBuffer(data)
@@ -291,11 +297,13 @@ func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, e
 }
 
 // ReadFrom reads a serialized version of this bitmap from stream.
+// E.g., you can use this method to read a serialized bitmap from a file written
+// with the WriteTo method.
 // The format is compatible with other RoaringBitmap
 // implementations (Java, C) and is documented here:
 // https://github.com/RoaringBitmap/RoaringFormatSpec
-// Since io.Reader is regarded as a stream and cannot be read twice.
-// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
+// Since io.Reader is regarded as a stream and cannot be read twice,
+// we add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
 // It is not necessary to pass cookieHeader when calling roaring.ReadFrom to read the roaring32 data directly.
 func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
 	stream, ok := reader.(internal.ByteInput)
@@ -313,7 +321,18 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
 	return
 }
 
-// FromBuffer creates a bitmap from its serialized version stored in buffer
+// MustReadFrom calls ReadFrom internally.
+// After deserialization Validate will be called.
+// If the Bitmap fails to validate, the function panics with the validation error.
+func (rb *Bitmap) MustReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
+	p, err = rb.ReadFrom(reader, cookieHeader...)
+	if err := rb.Validate(); err != nil {
+		panic(err)
+	}
+	return
+}
+
+// FromBuffer creates a bitmap from its serialized version stored in buffer (e.g., as written by WriteTo).
// // The format specification is available here: // https://github.com/RoaringBitmap/RoaringFormatSpec @@ -960,7 +979,6 @@ func (rb *Bitmap) CheckedAdd(x uint32) bool { newac := newArrayContainer() rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) return true - } // AddInt adds the integer x to the bitmap (convenience method: the parameter is casted to uint32 and we call Add) @@ -998,7 +1016,6 @@ func (rb *Bitmap) CheckedRemove(x uint32) bool { return C.getCardinality() < oldcard } return false - } // IsEmpty returns true if the Bitmap is empty (it is faster than doing (GetCardinality() == 0)) @@ -1088,7 +1105,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1187,7 +1204,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1256,7 +1273,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1396,7 +1413,7 @@ main: break main } s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1584,7 +1601,7 @@ main: } s1 = x1.highlowcontainer.getKeyAtIndex(pos1) s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { //s1 > s2 + } else { // s1 > s2 pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) if pos2 == length2 { break main @@ -1632,7 +1649,6 @@ func BitmapOf(dat ...uint32) *Bitmap { // The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range // while uint64(0x100000000) cannot be represented as a 32-bit value. func (rb *Bitmap) Flip(rangeStart, rangeEnd uint64) { - if rangeEnd > MaxUint32+1 { panic("rangeEnd > MaxUint32+1") } @@ -1869,6 +1885,206 @@ func (rb *Bitmap) CloneCopyOnWriteContainers() { rb.highlowcontainer.cloneCopyOnWriteContainers() } +// NextValue returns the next largest value in the bitmap, or -1 +// if none is present. This function should not be used inside +// a performance-sensitive loop: prefer iterators if +// performance is a concern. 
+func (rb *Bitmap) NextValue(target uint32) int64 {
+	originalKey := highbits(target)
+	query := lowbits(target)
+	var nextValue int64
+	nextValue = -1
+	containerIndex := rb.highlowcontainer.advanceUntil(originalKey, -1)
+	for containerIndex < rb.highlowcontainer.size() && nextValue == -1 {
+		containerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+		container := rb.highlowcontainer.getContainer(containerKey)
+		// if containerKey > originalKey then we are past the container which mapped to the original key
+		// in that case we can just return the minimum from that container
+		var responseBit int64
+		if containerKey > originalKey {
+			bit, err := container.safeMinimum()
+			if err != nil {
+				responseBit = -1
+			} else {
+				responseBit = int64(bit)
+			}
+		} else {
+			responseBit = int64(container.nextValue(query))
+		}
+
+		if responseBit == -1 {
+			nextValue = -1
+		} else {
+			nextValue = int64(combineLoHi32(uint32(responseBit), uint32(containerKey)))
+		}
+		containerIndex++
+	}
+
+	return nextValue
+}
+
+// PreviousValue returns the largest value in the bitmap that is smaller than
+// or equal to target, or -1 if none is present. This function should not be
+// used inside a performance-sensitive loop: prefer iterators if
+// performance is a concern.
+func (rb *Bitmap) PreviousValue(target uint32) int64 {
+	if rb.IsEmpty() {
+		return -1
+	}
+
+	originalKey := highbits(target)
+	query := lowbits(target)
+	var prevValue int64
+	prevValue = -1
+	containerIndex := rb.highlowcontainer.advanceUntil(originalKey, -1)
+
+	if containerIndex == rb.highlowcontainer.size() {
+		return int64(rb.Maximum())
+	}
+
+	if rb.highlowcontainer.getKeyAtIndex(containerIndex) > originalKey {
+		// target absent, key of first container after target too high
+		containerIndex--
+	}
+
+	for containerIndex != -1 && prevValue == -1 {
+		containerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+		container := rb.highlowcontainer.getContainer(containerKey)
+		// if containerKey < originalKey then we are before the container which mapped to the original key
+		// in that case we can just return the maximum from that container
+		var responseBit int
+		if containerKey < originalKey {
+			bit, err := container.safeMaximum()
+			if err != nil {
+				responseBit = -1
+			} else {
+				responseBit = int(bit)
+			}
+		} else {
+			responseBit = container.previousValue(query)
+		}
+
+		if responseBit == -1 {
+			prevValue = -1
+		} else {
+			prevValue = int64(combineLoHi32(uint32(responseBit), uint32(containerKey)))
+		}
+		containerIndex--
+	}
+
+	return prevValue
+}
+
+// NextAbsentValue returns the smallest missing value in the bitmap that is
+// greater than or equal to target, or -1 if none exists. This function should
+// not be used inside a performance-sensitive loop: prefer iterators if
+// performance is a concern.
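+// For example, in a bitmap holding {10, 11, 12}, NextAbsentValue(10) returns 13
+// and NextAbsentValue(9) returns 9.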
+func (rb *Bitmap) NextAbsentValue(target uint32) int64 {
+	originalKey := highbits(target)
+	query := lowbits(target)
+	var nextValue int64
+	nextValue = -1
+
+	containerIndex := rb.highlowcontainer.advanceUntil(originalKey, -1)
+	if containerIndex == rb.highlowcontainer.size() {
+		// if we are here it means no container was found, just return the target
+		return int64(target)
+	}
+
+	containerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+
+	keyspace := uint32(containerKey) << 16
+	if target < keyspace {
+		// target is less than the start of the keyspace,
+		// which means target cannot be in the keyspace
+		return int64(target)
+	}
+
+	container := rb.highlowcontainer.getContainer(containerKey)
+	nextValue = int64(container.nextAbsentValue(query))
+	for {
+		if nextValue != (1 << 16) {
+			return int64(combineLoHi32(uint32(nextValue), keyspace))
+		}
+
+		if containerIndex == rb.highlowcontainer.size()-1 {
+			val, err := container.safeMaximum()
+			if err != nil {
+				return -1
+			}
+			return int64(keyspace) + int64(val) + 1
+		}
+		containerIndex++
+		nextContainerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+		if containerKey+1 < nextContainerKey {
+			// There is a gap between keys
+			// Just increment the current key and shift to get the high bits
+			return int64(containerKey+1) << 16
+		}
+		containerKey = nextContainerKey
+		keyspace = uint32(containerKey) << 16
+		container = rb.highlowcontainer.getContainer(containerKey)
+		nextValue = int64(container.nextAbsentValue(0))
+	}
+}
+
+// PreviousAbsentValue returns the largest missing value in the bitmap that is
+// smaller than or equal to target, or -1 if none exists. This function should
+// not be used inside a performance-sensitive loop: prefer iterators if
+// performance is a concern.
+func (rb *Bitmap) PreviousAbsentValue(target uint32) int64 {
+	originalKey := highbits(target)
+	query := lowbits(target)
+	var prevValue int64
+	prevValue = -1
+
+	containerIndex := rb.highlowcontainer.advanceUntil(originalKey, -1)
+
+	if containerIndex == rb.highlowcontainer.size() {
+		// if we are here it means no container was found, just return the target
+		return int64(target)
+	}
+
+	if containerIndex == -1 {
+		// if we are here it means no container was found, just return the target
+		return int64(target)
+	}
+
+	containerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+	keyspace := uint32(containerKey) << 16
+	if target < keyspace {
+		// target is less than the start of the keyspace,
+		// which means target cannot be in the keyspace
+		return int64(target)
+	}
+
+	container := rb.highlowcontainer.getContainer(containerKey)
+	prevValue = int64(container.previousAbsentValue(query))
+	for {
+		if prevValue != -1 {
+			return int64(combineLoHi32(uint32(prevValue), keyspace))
+		}
+
+		if containerIndex == 0 {
+			val, err := container.safeMinimum()
+			if err != nil {
+				// OR panic, Java panics
+				return -1
+			}
+			return int64(keyspace) + int64(val) - 1
+		}
+		containerIndex--
+		prevContainerKey := rb.highlowcontainer.getKeyAtIndex(containerIndex)
+		if prevContainerKey < containerKey-1 {
+			// There is a gap between keys, e.g. a missing container
+			// Just decrement the current key and shift to get the high bits of the missing container
+			return (int64(containerKey) << 16) - 1
+		}
+		containerKey = prevContainerKey
+		keyspace = uint32(containerKey) << 16
+		container = rb.highlowcontainer.getContainer(containerKey)
+		highestPossible16 := (1 << 16) - 1
+		prevValue = int64(container.previousAbsentValue(uint16(highestPossible16)))
+	}
+}
+
 // FlipInt calls Flip after casting the parameters (convenience method)
 func FlipInt(bm *Bitmap, rangeStart, rangeEnd int) *Bitmap {
 	return Flip(bm, uint64(rangeStart), uint64(rangeEnd))
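A minimal usage sketch of the navigation and validation API introduced above (illustrative only, not part of the vendored diff; it assumes the v2 import path and the semantics documented in the comments):

	package main

	import (
		"fmt"

		"github.com/RoaringBitmap/roaring/v2"
	)

	func main() {
		bm := roaring.BitmapOf(1, 2, 10, 11, 100)

		fmt.Println(bm.NextValue(3))           // 10: smallest set value >= 3
		fmt.Println(bm.PreviousValue(3))       // 2: largest set value <= 3
		fmt.Println(bm.NextAbsentValue(10))    // 12: smallest missing value >= 10
		fmt.Println(bm.PreviousAbsentValue(2)) // 0: largest missing value <= 2

		// Validate checks the bitmap's internal invariants; it is mainly
		// useful after deserializing bytes from an untrusted source.
		if err := bm.Validate(); err != nil {
			fmt.Println("invalid bitmap:", err)
		}
	}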
@@ -1916,3 +2132,10 @@ func (rb *Bitmap) Stats() Statistics { } return stats } + +// Validate checks if the bitmap is internally consistent. +// You may call it after deserialization to check that the bitmap is valid. +// This function returns an error if the bitmap is invalid, nil otherwise. +func (rb *Bitmap) Validate() error { + return rb.highlowcontainer.validate() +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/Makefile similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/Makefile rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/Makefile diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go similarity index 69% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go index 6cae3284c..46dbe1210 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring64/bsi64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go @@ -3,17 +3,9 @@ package roaring64 import ( "fmt" "io" - "math/bits" + "math/big" "runtime" "sync" - "sync/atomic" -) - -const ( - // Min64BitSigned - Minimum 64 bit value - Min64BitSigned = -9223372036854775808 - // Max64BitSigned - Maximum 64 bit value - Max64BitSigned = 9223372036854775807 ) // BSI is at its simplest is an array of bitmaps that represent an encoded @@ -32,13 +24,16 @@ type BSI struct { runOptimized bool } -// NewBSI constructs a new BSI. Min/Max values are optional. If set to 0 -// then the underlying BSI will be automatically sized. +// NewBSI constructs a new BSI. Note that it is your responsibility to ensure that +// the min/max values are set correctly. Queries CompareValue, MinMax, etc. will not +// work correctly if the min/max values are not set correctly. func NewBSI(maxValue int64, minValue int64) *BSI { - bitsz := bits.Len64(uint64(minValue)) - if bits.Len64(uint64(maxValue)) > bitsz { - bitsz = bits.Len64(uint64(maxValue)) + bitszmin := big.NewInt(minValue).BitLen() + 1 + bitszmax := big.NewInt(maxValue).BitLen() + 1 + bitsz := bitszmin + if bitszmax > bitsz { + bitsz = bitszmax } ba := make([]Bitmap, bitsz) return &BSI{bA: ba, MaxValue: maxValue, MinValue: minValue} @@ -81,41 +76,97 @@ func (b *BSI) GetCardinality() uint64 { // BitCount returns the number of bits needed to represent values. func (b *BSI) BitCount() int { - return len(b.bA) + return len(b.bA) - 1 // Exclude sign bit } -// SetValue sets a value for a given columnID. -func (b *BSI) SetValue(columnID uint64, value int64) { +// IsBigUInt returns the number of bits needed to represent values. 
+func (b *BSI) isBig() bool { + return len(b.bA) > 64 +} + +// IsNegative returns true for negative values +func (b *BSI) IsNegative(columnID uint64) bool { + if len(b.bA) == 0 { + return false + } + return b.bA[b.BitCount()].Contains(columnID) +} + +// SetBigValue sets a value that exceeds 64 bits +func (b *BSI) SetBigValue(columnID uint64, value *big.Int) { // If max/min values are set to zero then automatically determine bit array size if b.MaxValue == 0 && b.MinValue == 0 { - minBits := bits.Len64(uint64(value)) + minBits := value.BitLen() + 1 + if minBits == 1 { + minBits = 2 + } for len(b.bA) < minBits { b.bA = append(b.bA, Bitmap{}) } } - for i := 0; i < b.BitCount(); i++ { - if uint64(value)&(1< 0 { - b.bA[i].Add(columnID) - } else { + for i := b.BitCount(); i >= 0; i-- { + if value.Bit(i) == 0 { b.bA[i].Remove(columnID) + } else { + b.bA[i].Add(columnID) } } b.eBM.Add(columnID) } +// SetValue sets a value for a given columnID. +func (b *BSI) SetValue(columnID uint64, value int64) { + b.SetBigValue(columnID, big.NewInt(value)) +} + // GetValue gets the value at the column ID. Second param will be false for non-existent values. func (b *BSI) GetValue(columnID uint64) (value int64, exists bool) { + bv, exists := b.GetBigValue(columnID) + if !exists { + return + } + if !bv.IsInt64() { + if bv.Sign() == -1 { + msg := fmt.Errorf("can't represent a negative %d bit value as an int64", b.BitCount()) + panic(msg) + } + if bv.Sign() == 1 { + msg := fmt.Errorf("can't represent a positive %d bit value as an int64", b.BitCount()) + panic(msg) + } + } + return bv.Int64(), exists +} + +// GetBigValue gets the value at the column ID. Second param will be false for non-existent values. +func (b *BSI) GetBigValue(columnID uint64) (value *big.Int, exists bool) { exists = b.eBM.Contains(columnID) if !exists { return } - for i := 0; i < b.BitCount(); i++ { + val := big.NewInt(0) + for i := b.BitCount(); i >= 0; i-- { if b.bA[i].Contains(columnID) { - value |= 1 << i + bigBit := big.NewInt(1) + bigBit.Lsh(bigBit, uint(i)) + val.Or(val, bigBit) } } - return + + if b.IsNegative(columnID) { + val = negativeTwosComplementToInt(val) + } + return val, exists +} + +func negativeTwosComplementToInt(val *big.Int) *big.Int { + inverted := new(big.Int).Not(val) + mask := new(big.Int).Lsh(big.NewInt(1), uint(val.BitLen())) + inverted.And(inverted, mask.Sub(mask, big.NewInt(1))) + inverted.Add(inverted, big.NewInt(1)) + val.Neg(inverted) + return val } type action func(t *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup) @@ -235,13 +286,15 @@ const ( type task struct { bsi *BSI op Operation - valueOrStart int64 - end int64 - values map[int64]struct{} + valueOrStart *big.Int + end *big.Int + values map[string]struct{} bits *Bitmap } // CompareValue compares value. +// Values should be in the range of the BSI (max, min). If the value is outside the range, the result +// might erroneous. The operation parameter indicates the type of comparison to be made. // For all operations with the exception of RANGE, the value to be compared is specified by valueOrStart. // For the RANGE parameter the comparison criteria is >= valueOrStart and <= end. // The parallelism parameter indicates the number of CPU threads to be applied for processing. 
A value @@ -249,6 +302,26 @@ type task struct { func (b *BSI) CompareValue(parallelism int, op Operation, valueOrStart, end int64, foundSet *Bitmap) *Bitmap { + return b.CompareBigValue(parallelism, op, big.NewInt(valueOrStart), big.NewInt(end), foundSet) +} + +// CompareBigValue compares value. +// Values should be in the range of the BSI (max, min). If the value is outside the range, the result +// might erroneous. The operation parameter indicates the type of comparison to be made. +// For all operations with the exception of RANGE, the value to be compared is specified by valueOrStart. +// For the RANGE parameter the comparison criteria is >= valueOrStart and <= end. +// The parallelism parameter indicates the number of CPU threads to be applied for processing. A value +// of zero indicates that all available CPU resources will be potentially utilized. +func (b *BSI) CompareBigValue(parallelism int, op Operation, valueOrStart, end *big.Int, + foundSet *Bitmap) *Bitmap { + + if valueOrStart == nil { + valueOrStart = b.MinMaxBig(parallelism, MIN, &b.eBM) + } + if end == nil && op == RANGE { + end = b.MinMaxBig(parallelism, MAX, &b.eBM) + } + comp := &task{bsi: b, op: op, valueOrStart: valueOrStart, end: end} if foundSet == nil { return parallelExecutor(parallelism, comp, compareValue, &b.eBM) @@ -256,6 +329,53 @@ func (b *BSI) CompareValue(parallelism int, op Operation, valueOrStart, end int6 return parallelExecutor(parallelism, comp, compareValue, foundSet) } +// Returns a twos complement value given a value, the return will be bit extended to 'bits' length +// if the value is negative +func twosComplement(num *big.Int, bitCount int) *big.Int { + // Check if the number is negative + isNegative := num.Sign() < 0 + + // Get the absolute value if negative + abs := new(big.Int).Abs(num) + + // Convert to binary string + binStr := abs.Text(2) + + // Pad with zeros to the left + if len(binStr) < bitCount { + binStr = fmt.Sprintf("%0*s", bitCount, binStr) + } + + // If negative, calculate two's complement + if isNegative { + // Invert bits + inverted := make([]byte, len(binStr)) + for i := range binStr { + if binStr[i] == '0' { + inverted[i] = '1' + } else { + inverted[i] = '0' + } + } + + // Add 1 + carry := byte(1) + for i := len(inverted) - 1; i >= 0; i-- { + inverted[i] += carry + if inverted[i] == '2' { + inverted[i] = '0' + } else { + break + } + } + binStr = string(inverted) + } + + bigInt := new(big.Int) + _, _ = bigInt.SetString(binStr, 2) + return bigInt +} + func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitGroup) { defer wg.Done() @@ -265,32 +385,31 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa results.RunOptimize() } - x := e.bsi.BitCount() - startIsNegative := x == 64 && uint64(e.valueOrStart)&(1< 0 - endIsNegative := x == 64 && uint64(e.end)&(1< 0 + startIsNegative := e.valueOrStart.Sign() == -1 + endIsNegative := true + if e.end != nil { + endIsNegative = e.end.Sign() == -1 + } for i := 0; i < len(batch); i++ { cID := batch[i] eq1, eq2 := true, true lt1, lt2, gt1 := false, false, false - j := e.bsi.BitCount() - 1 - isNegative := false - if x == 64 { - isNegative = e.bsi.bA[j].Contains(cID) - j-- - } + j := e.bsi.BitCount() + isNegative := e.bsi.IsNegative(cID) compStartValue := e.valueOrStart compEndValue := e.end if isNegative != startIsNegative { - compStartValue = ^e.valueOrStart + 1 + compStartValue = twosComplement(e.valueOrStart, e.bsi.BitCount()+1) } - if isNegative != endIsNegative { - compEndValue = 
^e.end + 1 + if isNegative != endIsNegative && e.end != nil { + compEndValue = twosComplement(e.end, e.bsi.BitCount()+1) } + for ; j >= 0; j-- { sliceContainsBit := e.bsi.bA[j].Contains(cID) - if uint64(compStartValue)&(1< 0 { + if compStartValue.Bit(j) == 1 { // BIT in value is SET if !sliceContainsBit { if eq1 { @@ -303,7 +422,9 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa } } eq1 = false - break + if e.op != RANGE { + break + } } } } else { @@ -319,6 +440,7 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa } } eq1 = false + if e.op != RANGE { break } @@ -326,7 +448,7 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa } } - if e.op == RANGE && uint64(compEndValue)&(1< 0 { + if e.op == RANGE && compEndValue.Bit(j) == 1 { // BIT in value is SET if !sliceContainsBit { if eq2 { @@ -347,11 +469,9 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa lt2 = true } eq2 = false - break } } } - } switch e.op { @@ -387,15 +507,24 @@ func compareValue(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.Wa resultsChan <- results } -// MinMax - Find minimum or maximum value. +// MinMax - Find minimum or maximum int64 value. func (b *BSI) MinMax(parallelism int, op Operation, foundSet *Bitmap) int64 { + return b.MinMaxBig(parallelism, op, foundSet).Int64() +} + +// MinMaxBig - Find minimum or maximum value. +func (b *BSI) MinMaxBig(parallelism int, op Operation, foundSet *Bitmap) *big.Int { var n int = parallelism if n == 0 { n = runtime.NumCPU() } - resultsChan := make(chan int64, n) + resultsChan := make(chan *big.Int, n) + + if foundSet == nil { + foundSet = &b.eBM + } card := foundSet.GetCardinality() x := card / uint64(n) @@ -418,63 +547,87 @@ func (b *BSI) MinMax(parallelism int, op Operation, foundSet *Bitmap) int64 { wg.Wait() close(resultsChan) - var minMax int64 + var minMax *big.Int + minSigned, maxSigned := minMaxSignedInt(b.BitCount() + 1) if op == MAX { - minMax = Min64BitSigned + minMax = minSigned } else { - minMax = Max64BitSigned + minMax = maxSigned } for val := range resultsChan { - if (op == MAX && val > minMax) || (op == MIN && val <= minMax) { + if (op == MAX && val.Cmp(minMax) > 0) || (op == MIN && val.Cmp(minMax) <= 0) { minMax = val } } return minMax } -func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg *sync.WaitGroup) { +func minMaxSignedInt(bits int) (*big.Int, *big.Int) { + // Calculate the maximum value + max := new(big.Int).Lsh(big.NewInt(1), uint(bits-1)) + max.Sub(max, big.NewInt(1)) + + // Calculate the minimum value + min := new(big.Int).Neg(max) + min.Sub(min, big.NewInt(1)) + + return min, max +} + +func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan *big.Int, wg *sync.WaitGroup) { defer wg.Done() - x := b.BitCount() - var value int64 = Max64BitSigned + x := b.BitCount() + 1 + var value *big.Int + minSigned, maxSigned := minMaxSignedInt(x) if op == MAX { - value = Min64BitSigned + value = minSigned + } else { + value = maxSigned } for i := 0; i < len(batch); i++ { cID := batch[i] eq := true lt, gt := false, false - j := b.BitCount() - 1 - var cVal int64 - valueIsNegative := uint64(value)&(1< 0 && bits.Len64(uint64(value)) == 64 - isNegative := false - if x == 64 { - isNegative = b.bA[j].Contains(cID) - if isNegative { - cVal |= 1 << uint64(j) - } - j-- - } + j := b.BitCount() + cVal := new(big.Int) + valueIsNegative := value.Sign() == -1 + isNegative := b.IsNegative(cID) + 
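+		// When the candidate's sign differs from the running min/max, the raw bit
+		// patterns are not directly comparable, so the running value is first
+		// mapped into two's-complement form before the bit-by-bit comparison below.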
compValue := value if isNegative != valueIsNegative { - compValue = ^value + 1 + // convert compValue to twos complement + inverted := new(big.Int).Not(compValue) + mask := new(big.Int).Lsh(big.NewInt(1), uint(compValue.BitLen())) + inverted.And(inverted, mask.Sub(mask, big.NewInt(1))) + inverted.Add(inverted, big.NewInt(1)) } + + done := false for ; j >= 0; j-- { sliceContainsBit := b.bA[j].Contains(cID) if sliceContainsBit { - cVal |= 1 << uint64(j) + bigBit := big.NewInt(1) + bigBit.Lsh(bigBit, uint(j)) + cVal.Or(cVal, bigBit) + if isNegative { + cVal = negativeTwosComplementToInt(cVal) + } } - if uint64(compValue)&(1< 0 { + if done { + continue + } + if compValue.Bit(j) == 1 { // BIT in value is SET if !sliceContainsBit { if eq { eq = false if op == MAX && valueIsNegative && !isNegative { gt = true - break + done = true } if op == MIN && (!valueIsNegative || (valueIsNegative == isNegative)) { lt = true @@ -491,11 +644,13 @@ func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg } if op == MAX && (valueIsNegative || (valueIsNegative == isNegative)) { gt = true + done = true } } } } } + if lt || gt { value = cVal } @@ -506,19 +661,37 @@ func (b *BSI) minOrMax(op Operation, batch []uint64, resultsChan chan int64, wg // Sum all values contained within the foundSet. As a convenience, the cardinality of the foundSet // is also returned (for calculating the average). -func (b *BSI) Sum(foundSet *Bitmap) (sum int64, count uint64) { +func (b *BSI) Sum(foundSet *Bitmap) (int64, uint64) { + val, count := b.SumBigValues(foundSet) + return val.Int64(), count +} +// SumBigValues - Sum all values contained within the foundSet. As a convenience, the cardinality of the foundSet +// is also returned (for calculating the average). This method will sum arbitrarily large values. +func (b *BSI) SumBigValues(foundSet *Bitmap) (sum *big.Int, count uint64) { + if foundSet == nil { + foundSet = &b.eBM + } + sum = new(big.Int) count = foundSet.GetCardinality() + resultsChan := make(chan int64, b.BitCount()) var wg sync.WaitGroup for i := 0; i < b.BitCount(); i++ { wg.Add(1) go func(j int) { defer wg.Done() - atomic.AddInt64(&sum, int64(foundSet.AndCardinality(&b.bA[j])< bits { - bits = bsis[i].BitCount() + bits = len(bsis[i].bA ) } } // Make sure we have enough bit slices - for bits > b.BitCount() { + for bits > len(b.bA) { bm := Bitmap{} bm.RunOptimize() b.bA = append(b.bA, bm) @@ -725,10 +900,20 @@ func (b *BSI) WriteTo(w io.Writer) (n int64, err error) { // BatchEqual returns a bitmap containing the column IDs where the values are contained within the list of values provided. func (b *BSI) BatchEqual(parallelism int, values []int64) *Bitmap { + //convert list of int64 values to big.Int(s) + bigValues := make([]*big.Int, len(values)) + for i, v := range values { + bigValues[i] = big.NewInt(v) + } + return b.BatchEqualBig(parallelism, bigValues) +} - valMap := make(map[int64]struct{}, len(values)) +// BatchEqualBig returns a bitmap containing the column IDs where the values are contained within the list of values provided. 
+func (b *BSI) BatchEqualBig(parallelism int, values []*big.Int) *Bitmap { + + valMap := make(map[string]struct{}, len(values)) for i := 0; i < len(values); i++ { - valMap[values[i]] = struct{}{} + valMap[string(values[i].Bytes())] = struct{}{} } comp := &task{bsi: b, values: valMap} return parallelExecutor(parallelism, comp, batchEqual, &b.eBM) @@ -746,8 +931,8 @@ func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap, for i := 0; i < len(batch); i++ { cID := batch[i] - if value, ok := e.bsi.GetValue(uint64(cID)); ok { - if _, yes := e.values[int64(value)]; yes { + if value, ok := e.bsi.GetBigValue(uint64(cID)); ok { + if _, yes := e.values[string(value.Bytes())]; yes { results.Add(cID) } } @@ -786,8 +971,8 @@ func (b *BSI) ClearValues(foundSet *Bitmap) { // NewBSIRetainSet - Construct a new BSI from a clone of existing BSI, retain only values contained in foundSet func (b *BSI) NewBSIRetainSet(foundSet *Bitmap) *BSI { - newBSI := NewBSI(b.MaxValue, b.MinValue) - newBSI.bA = make([]Bitmap, b.BitCount()) + newBSI := NewDefaultBSI() + newBSI.bA = make([]Bitmap, b.BitCount()+1) var wg sync.WaitGroup wg.Add(1) go func() { @@ -823,13 +1008,13 @@ func (b *BSI) Add(other *BSI) { func (b *BSI) addDigit(foundSet *Bitmap, i int) { - if i >= len(b.bA) { + if i >= b.BitCount()+1 || b.BitCount() == 0 { b.bA = append(b.bA, Bitmap{}) } carry := And(&b.bA[i], foundSet) b.bA[i].Xor(foundSet) if !carry.IsEmpty() { - if i+1 >= len(b.bA) { + if i+1 >= b.BitCount() { b.bA = append(b.bA, Bitmap{}) } b.addDigit(carry, i+1) @@ -841,7 +1026,12 @@ func (b *BSI) addDigit(foundSet *Bitmap, i int) { // is useful for situations where there is a one-to-many relationship between the vectored integer sets. The resulting BSI // contains the number of times a particular value appeared in the input BSI. func (b *BSI) TransposeWithCounts(parallelism int, foundSet, filterSet *Bitmap) *BSI { - + if foundSet == nil { + foundSet = &b.eBM + } + if filterSet == nil { + filterSet = &b.eBM + } return parallelExecutorBSIResults(parallelism, b, transposeWithCounts, foundSet, filterSet, true) } @@ -871,6 +1061,9 @@ func transposeWithCounts(input *BSI, filterSet *Bitmap, batch []uint64, resultsC // Increment - In-place increment of values in a BSI. Found set select columns for incrementing. 
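+// A nil foundSet increments every column currently present in the BSI.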
func (b *BSI) Increment(foundSet *Bitmap) { + if foundSet == nil { + foundSet = &b.eBM + } b.addDigit(foundSet, 0) b.eBM.Or(foundSet) } diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/fastaggregation64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/fastaggregation64.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/fastaggregation64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/fastaggregation64.go diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iterables64.go similarity index 99% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iterables64.go index 73e4f1856..df19fac1f 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring64/iterables64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iterables64.go @@ -1,7 +1,7 @@ package roaring64 import ( - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // IntIterable64 allows you to iterate over the values in a Bitmap diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go similarity index 95% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go index 6fe1803b2..5dadc8dea 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring64/parallel64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go @@ -4,7 +4,7 @@ import ( "fmt" "runtime" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) var defaultWorkerCount = runtime.NumCPU() @@ -144,6 +144,8 @@ func (c parChunk) size() int { return c.ra.size() } +// parNaiveStartAt returns the index of the first key that is inclusive between start and last +// Returns the size if there is no such key func parNaiveStartAt(ra *roaringArray64, start uint32, last uint32) int { for idx, key := range ra.keys { if key >= start && key <= last { @@ -170,7 +172,6 @@ func orOnRange(ra1, ra2 *roaringArray64, start, last uint32) *roaringArray64 { key2 = ra2.getKeyAtIndex(idx2) for key1 <= last && key2 <= last { - if key1 < key2 { answer.appendCopy(*ra1, idx1) idx1++ @@ -188,7 +189,7 @@ func orOnRange(ra1, ra2 *roaringArray64, start, last uint32) *roaringArray64 { } else { c1 := ra1.getContainerAtIndex(idx1) - //answer.appendContainer(key1, c1.lazyOR(ra2.getContainerAtIndex(idx2)), false) + // answer.appendContainer(key1, c1.lazyOR(ra2.getContainerAtIndex(idx2)), false) answer.appendContainer(key1, roaring.Or(c1, ra2.getContainerAtIndex(idx2)), false) idx1++ idx2++ @@ -261,7 +262,7 @@ func iorOnRange(ra1, ra2 *roaringArray64, start, last uint32) *roaringArray64 { } else { c1 := ra1.getWritableContainerAtIndex(idx1) - //ra1.containers[idx1] = c1.lazyIOR(ra2.getContainerAtIndex(idx2)) + // ra1.containers[idx1] = c1.lazyIOR(ra2.getContainerAtIndex(idx2)) c1.Or(ra2.getContainerAtIndex(idx2)) ra1.setContainerAtIndex(idx1, c1) diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go similarity index 98% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go index 9e7f6b7f1..ebea5ffcb 100644 --- 
a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaring64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go @@ -8,12 +8,14 @@ import ( "io" "strconv" - "github.com/RoaringBitmap/roaring" - "github.com/RoaringBitmap/roaring/internal" + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/internal" ) -const serialCookieNoRunContainer = 12346 // only arrays and bitmaps -const serialCookie = 12347 // runs, arrays, and bitmaps +const ( + serialCookieNoRunContainer = 12346 // only arrays and bitmaps + serialCookie = 12347 // runs, arrays, and bitmaps +) // Bitmap represents a compressed bitmap where you can add integers. type Bitmap struct { @@ -25,7 +27,6 @@ func (rb *Bitmap) ToBase64() (string, error) { buf := new(bytes.Buffer) _, err := rb.WriteTo(buf) return base64.StdEncoding.EncodeToString(buf.Bytes()), err - } // FromBase64 deserializes a bitmap from Base64 @@ -52,7 +53,6 @@ func (rb *Bitmap) ToBytes() ([]byte, error) { // implementations (Java, Go, C++) and it has a specification : // https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { - var n int64 buf := make([]byte, 8) binary.LittleEndian.PutUint64(buf, uint64(rb.highlowcontainer.size())) @@ -87,11 +87,10 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) { stream := internal.NewByteBuffer(data) sizeBuf := make([]byte, 8) - n, err := stream.Read(sizeBuf) + _, err = stream.Read(sizeBuf) if err != nil { return 0, err } - p += int64(n) size := binary.LittleEndian.Uint64(sizeBuf) rb.highlowcontainer.resize(0) @@ -115,17 +114,16 @@ func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) { if err != nil { return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err) } - p += 4 rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf) rb.highlowcontainer.containers[i] = roaring.NewBitmap() n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) + if n == 0 || err != nil { return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) } - p += int64(n) } - return p, nil + return stream.GetReadBytes(), nil } // ReadFrom reads a serialized version of this bitmap from stream. @@ -167,23 +165,15 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) { rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf) rb.highlowcontainer.containers[i] = roaring.NewBitmap() n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) + if n == 0 || err != nil { return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) } p += int64(n) } - return p, nil } -// FromBuffer creates a bitmap from its serialized version stored in buffer -// func (rb *Bitmap) FromBuffer(data []byte) (p int64, err error) { -// -// // TODO: Add buffer interning as in base roaring package. -// buf := bytes.NewBuffer(data) -// return rb.ReadFrom(buf) -// } - // MarshalBinary implements the encoding.BinaryMarshaler interface for the bitmap // (same as ToBytes) func (rb *Bitmap) MarshalBinary() ([]byte, error) { @@ -1251,6 +1241,10 @@ func (rb *Bitmap) GetSerializedSizeInBytes() uint64 { return rb.highlowcontainer.serializedSizeInBytes() } +func (rb *Bitmap) Validate() error { + return rb.highlowcontainer.validate() +} + // Roaring32AsRoaring64 inserts a 32-bit roaring bitmap into // a 64-bit roaring bitmap. No copy is made. 
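+// Consequently, later modifications to bm32 remain visible through the returned 64-bit bitmap.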
func Roaring32AsRoaring64(bm32 *roaring.Bitmap) *Bitmap { diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaringarray64.go similarity index 87% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaringarray64.go index 26aabd722..09c366ff7 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring64/roaringarray64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaringarray64.go @@ -1,7 +1,9 @@ package roaring64 import ( - "github.com/RoaringBitmap/roaring" + "errors" + + "github.com/RoaringBitmap/roaring/v2" ) type roaringArray64 struct { @@ -11,6 +13,11 @@ type roaringArray64 struct { copyOnWrite bool } +var ( + ErrKeySortOrder = errors.New("keys were out of order") + ErrCardinalityConstraint = errors.New("size of arrays was not coherent") +) + // runOptimize compresses the element containers to minimize space consumed. // Q: how does this interact with copyOnWrite and needCopyOnWrite? // A: since we aren't changing the logical content, just the representation, @@ -140,7 +147,6 @@ func (ra *roaringArray64) clear() { } func (ra *roaringArray64) clone() *roaringArray64 { - sa := roaringArray64{} sa.copyOnWrite = ra.copyOnWrite @@ -328,6 +334,15 @@ func (ra *roaringArray64) hasRunCompression() bool { return false } +/** + * Find the smallest integer index strictly larger than pos such that array[index].key>=min. If none can + * be found, return size. Based on code by O. Kaser. + * + * @param min minimal value + * @param pos index to exceed + * @return the smallest index greater than pos such that array[index].key is at least as large as + * min, or size if it is not possible. 
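+ * The index is located with exponential probing followed by a binary search,
+ * which keeps seeks close to pos cheap.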
+ */ func (ra *roaringArray64) advanceUntil(min uint32, pos int) int { lower := pos + 1 @@ -401,3 +416,47 @@ func (ra *roaringArray64) serializedSizeInBytes() uint64 { } return answer } + +func (ra *roaringArray64) checkKeysSorted() bool { + if len(ra.keys) == 0 || len(ra.keys) == 1 { + return true + } + previous := ra.keys[0] + for nextIdx := 1; nextIdx < len(ra.keys); nextIdx++ { + next := ra.keys[nextIdx] + if previous >= next { + return false + } + previous = next + + } + return true +} + +// validate checks the referential integrity +// ensures len(keys) == len(containers), recurses and checks each container type +func (ra *roaringArray64) validate() error { + if !ra.checkKeysSorted() { + return ErrKeySortOrder + } + + if len(ra.keys) != len(ra.containers) { + return ErrCardinalityConstraint + } + + if len(ra.keys) != len(ra.needCopyOnWrite) { + return ErrCardinalityConstraint + } + + for _, maps := range ra.containers { + err := maps.Validate() + if err != nil { + return err + } + if maps.IsEmpty() { + return errors.New("empty container") + } + } + + return nil +} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring64/util.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/util.go similarity index 92% rename from vendor/github.com/RoaringBitmap/roaring/roaring64/util.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaring64/util.go index 3743cd7db..6e576340a 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaring64/util.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/util.go @@ -1,6 +1,6 @@ package roaring64 -import "github.com/RoaringBitmap/roaring" +import "github.com/RoaringBitmap/roaring/v2" func highbits(x uint64) uint32 { return uint32(x >> 32) diff --git a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go similarity index 91% rename from vendor/github.com/RoaringBitmap/roaring/roaringarray.go rename to vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go index 079195dda..40be90a56 100644 --- a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go @@ -3,10 +3,11 @@ package roaring import ( "bytes" "encoding/binary" + "errors" "fmt" "io" - "github.com/RoaringBitmap/roaring/internal" + "github.com/RoaringBitmap/roaring/v2/internal" ) type container interface { @@ -30,7 +31,6 @@ type container interface { iadd(x uint16) bool // inplace, returns true if x was new. iaddReturnMinimized(uint16) container // may change return type to minimize storage. - //addRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iaddRange(start, endx int) container // i stands for inplace, range is [firstOfRange,endx) iremove(x uint16) bool // inplace, returns true if x was present. 
@@ -61,7 +61,6 @@ type container interface { lazyOR(r container) container lazyIOR(r container) container getSizeInBytes() int - //removeRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) iremoveRange(start, final int) container // i stands for inplace, range is [firstOfRange,lastOfRange) selectInt(x uint16) int // selectInt returns the xth integer in the container serializedSizeInBytes() int @@ -71,6 +70,14 @@ type container interface { toEfficientContainer() container String() string containerType() contype + + safeMinimum() (uint16, error) + safeMaximum() (uint16, error) + nextValue(x uint16) int + previousValue(x uint16) int + nextAbsentValue(x uint16) int + previousAbsentValue(x uint16) int + validate() error } type contype uint8 @@ -82,6 +89,11 @@ const ( run32Contype ) +var ( + ErrKeySortOrder = errors.New("keys were out of order") + ErrCardinalityConstraint = errors.New("size of arrays was not coherent") +) + // careful: range is [firstOfRange,lastOfRange] func rangeOfOnes(start, last int) container { if start > MaxUint16 { @@ -178,7 +190,6 @@ func (ra *roaringArray) appendCopiesUntil(sa roaringArray, stoppingKey uint16) { } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) - } } } @@ -204,7 +215,6 @@ func (ra *roaringArray) appendCopiesAfter(sa roaringArray, beforeStart uint16) { } else { // since there is no copy-on-write, we need to clone the container (this is important) ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) - } } } @@ -239,7 +249,6 @@ func (ra *roaringArray) clear() { } func (ra *roaringArray) clone() *roaringArray { - sa := roaringArray{} sa.copyOnWrite = ra.copyOnWrite @@ -288,6 +297,8 @@ func (ra *roaringArray) cloneCopyOnWriteContainers() { // return (ra.binarySearch(0, int64(len(ra.keys)), x) >= 0) //} +// getContainer returns the container with key `x` +// if no such container exists `nil` is returned func (ra *roaringArray) getContainer(x uint16) container { i := ra.binarySearch(0, int64(len(ra.keys)), x) if i < 0 { @@ -325,7 +336,6 @@ func (ra *roaringArray) getUnionedWritableContainer(pos int, other container) co return ra.getContainerAtIndex(pos).or(other) } return ra.getContainerAtIndex(pos).ior(other) - } func (ra *roaringArray) getWritableContainerAtIndex(i int) container { @@ -336,7 +346,10 @@ func (ra *roaringArray) getWritableContainerAtIndex(i int) container { return ra.containers[i] } +// getIndex returns the index of the container with key `x` +// if no such container exists a negative value is returned func (ra *roaringArray) getIndex(x uint16) int { + // Todo : test // before the binary search, we optimize for frequent cases size := len(ra.keys) if (size == 0) || (ra.keys[size-1] == x) { @@ -396,7 +409,10 @@ func (ra *roaringArray) size() int { return len(ra.keys) } +// binarySearch returns the index of the key. 
+// negative value returned if not found func (ra *roaringArray) binarySearch(begin, end int64, ikey uint16) int { + // TODO: add unit tests low := begin high := end - 1 for low+16 <= high { @@ -455,7 +471,6 @@ func (ra *roaringArray) headerSize() uint64 { return 4 + (size+7)/8 + 8*size // - 4 because we pack the size with the cookie } return 4 + 4 + 8*size - } // should be dirt cheap @@ -489,7 +504,7 @@ func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) { binary.LittleEndian.PutUint16(buf[2:], uint16(len(ra.keys)-1)) nw += 2 // compute isRun bitmap without temporary allocation - var runbitmapslice = buf[nw : nw+isRunSizeInBytes] + runbitmapslice := buf[nw : nw+isRunSizeInBytes] for i, c := range ra.containers { switch c.(type) { case *runContainer16: @@ -577,7 +592,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte // create is-run-container bitmap isRunBitmapSize := (int(size) + 7) / 8 isRunBitmap, err = stream.Next(isRunBitmapSize) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("malformed bitmap, failed to read is-run bitmap, got: %s", err) } @@ -596,7 +610,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte // descriptive header buf, err := stream.Next(2 * 2 * int(size)) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read descriptive header: %s", err) } @@ -637,13 +650,11 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 { // run container nr, err := stream.ReadUInt16() - if err != nil { return 0, fmt.Errorf("failed to read runtime container size: %s", err) } buf, err := stream.Next(int(nr) * 4) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read runtime container content: %s", err) } @@ -656,7 +667,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte } else if card > arrayDefaultMaxSize { // bitmap container buf, err := stream.Next(arrayDefaultMaxSize * 2) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read bitmap container: %s", err) } @@ -670,7 +680,6 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte } else { // array container buf, err := stream.Next(card * 2) - if err != nil { return stream.GetReadBytes(), fmt.Errorf("failed to read array container: %s", err) } @@ -696,6 +705,15 @@ func (ra *roaringArray) hasRunCompression() bool { return false } +/** + * Find the smallest integer index larger than pos such that array[index].key>=min. If none can + * be found, return size. Based on code by O. Kaser. + * + * @param min minimal value + * @param pos index to exceed + * @return the smallest index greater than pos such that array[index].key is at least as large as + * min, or size if it is not possible. 
+ */ func (ra *roaringArray) advanceUntil(min uint16, pos int) int { lower := pos + 1 @@ -759,3 +777,44 @@ func (ra *roaringArray) needsCopyOnWrite(i int) bool { func (ra *roaringArray) setNeedsCopyOnWrite(i int) { ra.needCopyOnWrite[i] = true } + +func (ra *roaringArray) checkKeysSorted() bool { + if len(ra.keys) == 0 || len(ra.keys) == 1 { + return true + } + previous := ra.keys[0] + for nextIdx := 1; nextIdx < len(ra.keys); nextIdx++ { + next := ra.keys[nextIdx] + if previous >= next { + return false + } + previous = next + + } + return true +} + +// validate checks the referential integrity +// ensures len(keys) == len(containers), recurses and checks each container type +func (ra *roaringArray) validate() error { + if !ra.checkKeysSorted() { + return ErrKeySortOrder + } + + if len(ra.keys) != len(ra.containers) { + return ErrCardinalityConstraint + } + + if len(ra.keys) != len(ra.needCopyOnWrite) { + return ErrCardinalityConstraint + } + + for _, container := range ra.containers { + err := container.validate() + if err != nil { + return err + } + } + + return nil +} diff --git a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go similarity index 90% rename from vendor/github.com/RoaringBitmap/roaring/runcontainer.go rename to vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go index 7098ba28f..ac9ea1b45 100644 --- a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go @@ -39,9 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ import ( + "errors" "fmt" "sort" - "unsafe" ) // runContainer16 does run-length encoding of sets of @@ -59,6 +59,16 @@ type interval16 struct { length uint16 // length minus 1 } +var ( + ErrRunIntervalsEmpty = errors.New("run contained no interval") + ErrRunNonSorted = errors.New("runs were not sorted") + ErrRunIntervalEqual = errors.New("intervals were equal") + ErrRunIntervalOverlap = errors.New("intervals overlapped or were continguous") + ErrRunIntervalSize = errors.New("too many intervals relative to data") + MaxNumIntervals = 2048 + MaxIntervalsSum = 2048 +) + func newInterval16Range(start, last uint16) interval16 { if last < start { panic(fmt.Sprintf("last (%d) cannot be smaller than start (%d)", last, start)) @@ -201,7 +211,6 @@ func newRunContainer16FromVals(alreadySorted bool, vals ...uint16) *runContainer // somewhat efficiently. For reference, see the Java // https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/RunContainer.java#L145-L192 func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 { - rc := &runContainer16{} nbrRuns := bc.numberOfRuns() if nbrRuns == 0 { @@ -251,7 +260,6 @@ func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 { curWord = curWordWith1s & (curWordWith1s + 1) // We've lathered and rinsed, so repeat... } - } // newRunContainer16FromArray populates a new @@ -293,7 +301,6 @@ func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 { // If you have a small number of additions to an already // big runContainer16, calling Add() may be faster. func (rc *runContainer16) set(alreadySorted bool, vals ...uint16) { - rc2 := newRunContainer16FromVals(alreadySorted, vals...) 
un := rc.union(rc2) rc.iv = un.iv @@ -374,7 +381,6 @@ func intersectInterval16s(a, b interval16) (res interval16, isEmpty bool) { // union merges two runContainer16s, producing // a new runContainer16 with the union of rc and b. func (rc *runContainer16) union(b *runContainer16) *runContainer16 { - // rc is also known as 'a' here, but golint insisted we // call it rc for consistency with the rest of the methods. @@ -457,7 +463,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { break aAdds } } - } if !bDone { @@ -471,7 +476,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { break bAdds } } - } m = append(m, merged) @@ -489,7 +493,6 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { // unionCardinality returns the cardinality of the merger of two runContainer16s, the union of rc and b. func (rc *runContainer16) unionCardinality(b *runContainer16) uint { - // rc is also known as 'a' here, but golint insisted we // call it rc for consistency with the rest of the methods. answer := uint(0) @@ -528,7 +531,7 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { } if !mergedUpdated { // we know that merged is disjoint from cura and curb - //m = append(m, merged) + // m = append(m, merged) answer += uint(merged.last()) - uint(merged.start) + 1 mergedUsed = false } @@ -539,11 +542,11 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { if !canMerge16(cura, curb) { if cura.start < curb.start { answer += uint(cura.last()) - uint(cura.start) + 1 - //m = append(m, cura) + // m = append(m, cura) na++ } else { answer += uint(curb.last()) - uint(curb.start) + 1 - //m = append(m, curb) + // m = append(m, curb) nb++ } } else { @@ -574,7 +577,6 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { break aAdds } } - } if !bDone { @@ -588,10 +590,9 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { break bAdds } } - } - //m = append(m, merged) + // m = append(m, merged) answer += uint(merged.last()) - uint(merged.start) + 1 } for _, r := range rc.iv[na:] { @@ -615,7 +616,6 @@ func (rc *runContainer16) indexOfIntervalAtOrAfter(key int, startIndex int) int // intersect returns a new runContainer16 holding the // intersection of rc (also known as 'a') and b. 
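(Editor's aside, not part of the patch: the union and unionCardinality merge loops above walk both sorted run lists and coalesce any intervals that overlap or touch; the intersect implementation follows below. A minimal standalone sketch of that merge idea, using hypothetical names; canMerge16 and mergeInterval16s are the real helpers in this file.)

```go
// Standalone sketch of the merge loop in union/unionCardinality above.
// Names here are hypothetical stand-ins for the vendored helpers.
package main

import "fmt"

type ival struct{ start, last uint16 }

// mergeable reports whether two runs overlap or are adjacent.
func mergeable(a, b ival) bool {
	return int(a.last)+1 >= int(b.start) && int(b.last)+1 >= int(a.start)
}

func unionRuns(a, b []ival) []ival {
	var out []ival
	push := func(v ival) {
		if n := len(out); n > 0 && mergeable(out[n-1], v) {
			if v.last > out[n-1].last {
				out[n-1].last = v.last // coalesce instead of appending
			}
			return
		}
		out = append(out, v)
	}
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if a[i].start <= b[j].start {
			push(a[i])
			i++
		} else {
			push(b[j])
			j++
		}
	}
	for ; i < len(a); i++ {
		push(a[i])
	}
	for ; j < len(b); j++ {
		push(b[j])
	}
	return out
}

func main() {
	a := []ival{{0, 3}, {10, 12}}
	b := []ival{{4, 6}, {20, 25}}
	fmt.Println(unionRuns(a, b)) // [{0 6} {10 12} {20 25}]
}
```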
func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { - a := rc numa := int(len(a.iv)) numb := int(len(b.iv)) @@ -645,8 +645,7 @@ func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { toploop: for acuri < numa && bcuri < numb { - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) + isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) if !isOverlap { switch { @@ -664,7 +663,6 @@ toploop: } bstart = int(b.iv[bcuri].start) } - } else { // isOverlap output = append(output, intersection) @@ -748,8 +746,7 @@ toploop: for acuri < numa && bcuri < numb { pass++ - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) + isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) if !isOverlap { switch { @@ -767,7 +764,6 @@ toploop: } bstart = int(b.iv[bcuri].start) } - } else { // isOverlap answer += int(intersection.last()) - int(intersection.start) + 1 @@ -941,7 +937,7 @@ func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (w // b) whichInterval16 == -1 if key is before our first // interval16 in rc.iv; // -// c) whichInterval16 is set to the minimum index of rc.iv +// c) whichInterval16 is set to the maximum index of rc.iv // which comes strictly before the key; // so rc.iv[whichInterval16].last < key, // and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start @@ -1014,10 +1010,10 @@ func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 { return rc } -const baseRc16Size = int(unsafe.Sizeof(runContainer16{})) -const perIntervalRc16Size = int(unsafe.Sizeof(interval16{})) - -const baseDiskRc16Size = int(unsafe.Sizeof(uint16(0))) +const ( + baseRc16Size = 2 + perIntervalRc16Size = 4 +) // see also runContainer16SerializedSizeInBytes(numRuns int) int @@ -1030,7 +1026,7 @@ func (rc *runContainer16) getSizeInBytes() int { // runContainer16SerializedSizeInBytes returns the number of bytes of disk // required to hold numRuns in a runContainer16. func runContainer16SerializedSizeInBytes(numRuns int) int { - return perIntervalRc16Size*numRuns + baseDiskRc16Size + return perIntervalRc16Size*numRuns + baseRc16Size } // Add adds a single value k to the set. @@ -1274,7 +1270,7 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int { break } } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 + ri.curPosInIndex += uint16(moreVals) // moreVals always fits in uint16 } } @@ -1315,7 +1311,7 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { break } } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 + ri.curPosInIndex += uint16(moreVals) // moreVals always fits in uint16 } } @@ -1324,7 +1320,6 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { // remove removes key from the container. 
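(Editor's aside, not part of the patch: the constants hunk above swaps the unsafe.Sizeof-derived values for the literal wire-format sizes, a 2-byte run count plus a 4-byte (start, length-1) pair per run, so runContainer16SerializedSizeInBytes(numRuns) is 4*numRuns + 2. A small sketch of the resulting cost model, with illustrative figures; removeKey resumes below.)

```go
package main

import "fmt"

// Sizes implied by baseRc16Size = 2 and perIntervalRc16Size = 4 above.
func runContainerSize(numRuns int) int { return 4*numRuns + 2 }

// The competing representations: 2 bytes per value for an array
// container, or a fixed 8 KiB (65536/8 bytes) bitmap container.
func arrayContainerSize(card int) int { return 2 * card }
func bitmapContainerSize() int        { return 65536 / 8 }

func main() {
	// 10 runs covering 1000 values: the run encoding wins easily.
	fmt.Println(runContainerSize(10))     // 42
	fmt.Println(arrayContainerSize(1000)) // 2000
	fmt.Println(bitmapContainerSize())    // 8192
}
```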
func (rc *runContainer16) removeKey(key uint16) (wasPresent bool) { - var index int index, wasPresent, _ = rc.search(int(key)) if !wasPresent { @@ -1361,7 +1356,7 @@ func (rc *runContainer16) deleteAt(curIndex *int, curPosInIndex *uint16) { *curPosInIndex-- // if we leave *curIndex alone, then Next() will work properly even after the delete. default: - //middle + // middle // split into two, adding an interval16 new0 := newInterval16Range(rc.iv[ci].start, rc.iv[ci].start+*curPosInIndex-1) @@ -1376,7 +1371,6 @@ func (rc *runContainer16) deleteAt(curIndex *int, curPosInIndex *uint16) { *curIndex++ *curPosInIndex = 0 } - } func have4Overlap16(astart, alast, bstart, blast int) bool { @@ -1503,6 +1497,26 @@ func (iv interval16) isSuperSetOf(b interval16) bool { return iv.start <= b.start && b.last() <= iv.last() } +func (iv interval16) isNonContiguousDisjoint(b interval16) bool { + // cover the zero start case + if iv.start == b.start { + return false + } + + nonContiguous1 := uint32(iv.start) == uint32(b.last())+1 || uint32(iv.last()) == uint32(b.start)+1 + nonContiguous2 := uint32(b.start) == uint32(iv.last())+1 || uint32(b.last()) == uint32(iv.start)+1 + if nonContiguous1 || nonContiguous2 { + return false + } + ivl := iv.last() + bl := b.last() + + c1 := iv.start <= b.start && b.start <= ivl + c2 := b.start <= iv.start && iv.start <= bl + + return !c1 && !c2 +} + func (iv interval16) subtractInterval(del interval16) (left []interval16, delcount int) { isect, isEmpty := intersectInterval16s(iv, del) @@ -1678,7 +1692,6 @@ func (rc *runContainer16) isubtract(del interval16) { // port of run_container_andnot from CRoaring... // https://github.com/RoaringBitmap/CRoaring/blob/master/src/containers/run.c#L435-L496 func (rc *runContainer16) AndNotRunContainer16(b *runContainer16) *runContainer16 { - if len(b.iv) == 0 || len(rc.iv) == 0 { return rc } @@ -1781,10 +1794,25 @@ func (rc *runContainer16) minimum() uint16 { return rc.iv[0].start // assume not empty } +func (rc *runContainer16) safeMinimum() (uint16, error) { + if len(rc.iv) == 0 { + return 0, errors.New("Empty runs") + } + + return rc.minimum(), nil +} + func (rc *runContainer16) maximum() uint16 { return rc.iv[len(rc.iv)-1].last() // assume not empty } +func (rc *runContainer16) safeMaximum() (uint16, error) { + if len(rc.iv) == 0 { + return 0, errors.New("Empty runs") + } + return rc.maximum(), nil // assume not empty +} + func (rc *runContainer16) isFull() bool { return (len(rc.iv) == 1) && ((rc.iv[0].start == 0) && (rc.iv[0].last() == MaxUint16)) } @@ -1949,7 +1977,6 @@ func (rc *runContainer16) getManyIterator() manyIterable { // add the values in the range [firstOfRange, endx). endx // is still abe to express 2^16 because it is an int not an uint16. func (rc *runContainer16) iaddRange(firstOfRange, endx int) container { - if firstOfRange > endx { panic(fmt.Sprintf("invalid %v = endx > firstOfRange", endx)) } @@ -2002,7 +2029,6 @@ func (rc *runContainer16) not(firstOfRange, endx int) container { // makes 2 more passes through the arrays than should be // strictly necessary. Measure both ways though--this may not matter. 
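(Editor's aside, not part of the patch: the isNonContiguousDisjoint method added above encodes the invariant that two runs in a valid container may neither overlap nor touch, since touching runs should already have been coalesced. A hypothetical restatement of that predicate; the Not implementation continues below.)

```go
package main

import "fmt"

type ival struct{ start, last uint16 }

// disjointAndNotAdjacent restates the intent of isNonContiguousDisjoint:
// true only when the runs neither overlap nor sit flush against each other.
// (Illustrative re-implementation, not the vendored method.)
func disjointAndNotAdjacent(a, b ival) bool {
	if a.start > b.start {
		a, b = b, a // order by start
	}
	return int(a.last)+1 < int(b.start) // a strict gap must separate them
}

func main() {
	fmt.Println(disjointAndNotAdjacent(ival{0, 3}, ival{5, 9})) // true: 4 is a gap
	fmt.Println(disjointAndNotAdjacent(ival{0, 3}, ival{4, 9})) // false: adjacent
	fmt.Println(disjointAndNotAdjacent(ival{0, 5}, ival{4, 9})) // false: overlap
}
```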
func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 { - if firstOfRange > endx { panic(fmt.Sprintf("invalid %v = endx > firstOfRange == %v", endx, firstOfRange)) } @@ -2066,12 +2092,12 @@ func (rc *runContainer16) equals(o container) bool { rit := rc.getShortIterator() bit := o.getShortIterator() - //k := 0 + // k := 0 for rit.hasNext() { if bit.next() != rit.next() { return false } - //k++ + // k++ } return true } @@ -2132,7 +2158,7 @@ func (rc *runContainer16) andBitmapContainerCardinality(bc *bitmapContainer) int for i := range rc.iv { answer += bc.getCardinalityInRange(uint(rc.iv[i].start), uint(rc.iv[i].last())+1) } - //bc.computeCardinality() + // bc.computeCardinality() return answer } @@ -2150,7 +2176,7 @@ func (rc *runContainer16) orArray(ac *arrayContainer) container { } intervals, cardMinusOne := runArrayUnionToRuns(rc, ac) result := newRunContainer16TakeOwnership(intervals) - if len(intervals) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { + if len(intervals) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { return newBitmapContainerFromRun(result) } if len(intervals)*2 > 1+int(cardMinusOne) { @@ -2190,7 +2216,6 @@ func (rc *runContainer16) inplaceUnion(rc2 *runContainer16) container { } func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container { - it := bc.getShortIterator() for it.hasNext() { rc.Add(it.next()) @@ -2206,11 +2231,11 @@ func (rc *runContainer16) iorArray(ac *arrayContainer) container { return rc } var cardMinusOne uint16 - //TODO: perform the union algorithm in-place using rc.iv + // TODO: perform the union algorithm in-place using rc.iv // this can be done with methods like the in-place array container union // but maybe lazily moving the remaining elements back. rc.iv, cardMinusOne = runArrayUnionToRuns(rc, ac) - if len(rc.iv) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { + if len(rc.iv) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { return newBitmapContainerFromRun(rc) } if len(rc.iv)*2 > 1+int(cardMinusOne) { @@ -2438,12 +2463,8 @@ func (rc *runContainer16) toBitmapContainer() *bitmapContainer { } func (rc *runContainer16) iandNotRunContainer16(x2 *runContainer16) container { - rcb := rc.toBitmapContainer() - x2b := x2.toBitmapContainer() - rcb.iandNotBitmapSurely(x2b) // TODO: check size and optimize the return value - // TODO: is inplace modification really required? If not, elide the copy. - rc2 := newRunContainer16FromBitmapContainer(rcb) + rc2 := rc.AndNotRunContainer16(x2) *rc = *rc2 return rc } @@ -2492,7 +2513,7 @@ func (rc *runContainer16) toEfficientContainer() container { sizeAsBitmapContainer := bitmapContainerSizeInBytes() card := rc.getCardinality() sizeAsArrayContainer := arrayContainerSizeInBytes(card) - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { + if sizeAsRunContainer < minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { return rc } if card <= arrayDefaultMaxSize { @@ -2511,7 +2532,6 @@ func (rc *runContainer16) toArrayContainer() *arrayContainer { } func newRunContainer16FromContainer(c container) *runContainer16 { - switch x := c.(type) { case *runContainer16: return x.Clone() @@ -2622,3 +2642,169 @@ func (rc *runContainer16) addOffset(x uint16) (container, container) { return low, high } + +// nextValue returns either the `target` if found or the next larger value. 
+// If the target is in the interior of a run then `target` will be returned +// Ex: If our run structure resembles [[a,c], [d,f]] with a <= target <= c then `target` will be returned. +// Ex: If c < target < d then d is returned. +// Ex: If target < a then a is returned +// if the target > max, this is out of bounds and -1 is returned +func (rc *runContainer16) nextValue(target uint16) int { + if len(rc.iv) == 0 { + return -1 + } + + whichIndex, alreadyPresent, _ := rc.search(int(target)) + + if alreadyPresent { + return int(target) + } + + if whichIndex == -1 { + return int(rc.iv[0].start) + } + + if whichIndex == len(rc.iv)-1 { + return -1 + } + + // The if relies on the non-contiguous nature of runs. + // If we have two runs [a,b] and another run [c,d] + // We can rely on the invariant that b+1 < c + // We will return c + possibleNext := whichIndex + 1 + if possibleNext < len(rc.iv) { + return int(rc.iv[possibleNext].start) + } + + return -1 +} + +// nextAbsentValue returns the next absent value. +// By construction the next absent value will be located in a gap between runs +// +// Ex: if our runs resemble [[a,b],[c,d]] and a <= target <= b then b+1 (which cannot equal c) will be returned +// Ex: if target < a then target is returned +// Ex: if target > d then target is returned +func (rc *runContainer16) nextAbsentValue(target uint16) int { + whichIndex, alreadyPresent, _ := rc.search(int(target)) + + if !alreadyPresent { + return int(target) + } + + return int(rc.iv[whichIndex].last()) + 1 +} + +// previousValue will return the previous present value +// If the target is in the interior of a run then `target` will be returned +// +// Example: +// If our run structure resembles [[a,c], [d,f]] with a <= target <= c then target will be returned. +// If c < target < d then c is returned. +// if target > f then f is returned +// if the target is less than a, this is out of bounds and -1 is returned +func (rc *runContainer16) previousValue(target uint16) int { + whichIndex, alreadyPresent, _ := rc.search(int(target)) + + if len(rc.iv) == 0 { + return int(target) + } + + if alreadyPresent { + return int(target) + } + if whichIndex == -1 { + return -1 + } + + return int(rc.iv[whichIndex].last()) +} + +// previousAbsentValue will return the previous absent value +// If the target is in the interior of a run then the start of the range minus 1 will be returned +// +// Example: +// If our run structure resembles [[x,z], [a,c], [d,f]] with a <= target <= c then a-1 will be returned. +// if the target < x then target is returned +// if target > f then target is returned +func (rc *runContainer16) previousAbsentValue(target uint16) int { + whichIndex, alreadyPresent, _ := rc.search(int(target)) + + if !alreadyPresent { + return int(target) + } + + return int(rc.iv[whichIndex].start) - 1 +} + +// isNonContiguousDisjoint returns an error if the intervals overlap or are contiguous, i.e. if they are not separated by a gap +func isNonContiguousDisjoint(outer interval16, inner interval16) error { + if !outer.isNonContiguousDisjoint(inner) { + return ErrRunIntervalOverlap + } + + return nil +} + +// validate checks the run container referential integrity +// Ensures runs are non-degenerate, non-contiguous and non-overlapping +func (rc *runContainer16) validate() error { + if rc.getCardinality() == 0 { + return ErrRunIntervalsEmpty + } + + intervalsSum := 0 + for outeridx := range rc.iv { + // The length being stored is the actual length - 1. + // So we need to add 1 to get the actual length.
+ // It is not possible to have a run with length 0. + + outerInterval := rc.iv[outeridx] + + intervalsSum += outerInterval.runlen() + for inneridx := outeridx + 1; inneridx < len(rc.iv); inneridx++ { + + innerInterval := rc.iv[inneridx] + + if outerInterval.equal(innerInterval) { + return ErrRunIntervalEqual + } + + // only check the start of runs + // if the run length overlap the next check will catch that. + if outerInterval.start >= innerInterval.start { + return ErrRunNonSorted + } + + err := isNonContiguousDisjoint(outerInterval, innerInterval) + if err != nil { + return err + } + } + + } + /* + if number of distinct values in the container >= 2048 then + check that the number of runs is no more than 2047 + (otherwise you could use a bitset container) + else + check that the number of runs < (number of distinct values) / 2 + (otherwise you could use an array container) + */ + + sizeAsRunContainer := runContainer16SerializedSizeInBytes(len(rc.iv)) + sizeAsBitmapContainer := bitmapContainerSizeInBytes() + sizeAsArrayContainer := arrayContainerSizeInBytes(intervalsSum) + // this is always ok: + if sizeAsRunContainer < minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { + return nil + } + if sizeAsRunContainer >= sizeAsBitmapContainer { + return ErrRunIntervalSize + } + if sizeAsRunContainer >= sizeAsArrayContainer { + return ErrRunIntervalSize + } + return nil +} diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization.go b/vendor/github.com/RoaringBitmap/roaring/v2/serialization.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/serialization.go rename to vendor/github.com/RoaringBitmap/roaring/v2/serialization.go diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization_generic.go b/vendor/github.com/RoaringBitmap/roaring/v2/serialization_generic.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/serialization_generic.go rename to vendor/github.com/RoaringBitmap/roaring/v2/serialization_generic.go diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go b/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go similarity index 99% rename from vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go rename to vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go index 6e3a5d554..16d356caf 100644 --- a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go @@ -299,6 +299,15 @@ func (rb *Bitmap) FrozenView(buf []byte) error { return rb.highlowcontainer.frozenView(buf) } +func (rb *Bitmap) MustFrozenView(buf []byte) error { + if err := rb.FrozenView(buf); err != nil { + return err + } + err := rb.Validate() + + return err +} + /* Verbatim specification from CRoaring. 
* * FROZEN SERIALIZATION FORMAT DESCRIPTION diff --git a/vendor/github.com/RoaringBitmap/roaring/serializationfuzz.go b/vendor/github.com/RoaringBitmap/roaring/v2/serializationfuzz.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/serializationfuzz.go rename to vendor/github.com/RoaringBitmap/roaring/v2/serializationfuzz.go diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil.go b/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go similarity index 56% rename from vendor/github.com/RoaringBitmap/roaring/setutil.go rename to vendor/github.com/RoaringBitmap/roaring/v2/setutil.go index 663c4fa37..8def774f5 100644 --- a/vendor/github.com/RoaringBitmap/roaring/setutil.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go @@ -1,26 +1,12 @@ package roaring -func equal(a, b []uint16) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} - func difference(set1 []uint16, set2 []uint16, buffer []uint16) int { - if 0 == len(set2) { + if len(set2) == 0 { buffer = buffer[:len(set1)] - for k := 0; k < len(set1); k++ { - buffer[k] = set1[k] - } + copy(buffer, set1) return len(set1) } - if 0 == len(set1) { + if len(set1) == 0 { return 0 } pos := 0 @@ -66,7 +52,6 @@ func difference(set1 []uint16, set2 []uint16, buffer []uint16) int { } } return pos - } func exclusiveUnion2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { @@ -135,6 +120,7 @@ func exclusiveUnion2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { return pos } +// union2by2Cardinality computes the cardinality of the union func union2by2Cardinality(set1 []uint16, set2 []uint16) int { pos := 0 k1 := 0 @@ -186,8 +172,8 @@ func union2by2Cardinality(set1 []uint16, set2 []uint16) int { func intersection2by2( set1 []uint16, set2 []uint16, - buffer []uint16) int { - + buffer []uint16, +) int { if len(set1)*64 < len(set2) { return onesidedgallopingintersect2by2(set1, set2, buffer) } else if len(set2)*64 < len(set1) { @@ -197,10 +183,11 @@ func intersection2by2( } } +// intersection2by2Cardinality computes the cardinality of the intersection func intersection2by2Cardinality( set1 []uint16, - set2 []uint16) int { - + set2 []uint16, +) int { if len(set1)*64 < len(set2) { return onesidedgallopingintersect2by2Cardinality(set1, set2) } else if len(set2)*64 < len(set1) { @@ -210,44 +197,45 @@ func intersection2by2Cardinality( } } +// intersects2by2 computes whether the two sets intersect func intersects2by2( set1 []uint16, - set2 []uint16) bool { + set2 []uint16, +) bool { // could be optimized if one set is much larger than the other one - if (0 == len(set1)) || (0 == len(set2)) { + if (len(set1) == 0) || (len(set2) == 0) { return false } - k1 := 0 - k2 := 0 - s1 := set1[k1] - s2 := set2[k2] + index1 := 0 + index2 := 0 + value1 := set1[index1] + value2 := set2[index2] mainwhile: for { - if s2 < s1 { + if value2 < value1 { for { - k2++ - if k2 == len(set2) { + index2++ + if index2 == len(set2) { break mainwhile } - s2 = set2[k2] - if s2 >= s1 { + value2 = set2[index2] + if value2 >= value1 { break } } } - if s1 < s2 { + if value1 < value2 { for { - k1++ - if k1 == len(set1) { + index1++ + if index1 == len(set1) { break mainwhile } - s1 = set1[k1] - if s1 >= s2 { + value1 = set1[index1] + if value1 >= value2 { break } } - } else { // (set2[k2] == set1[k1]) return true @@ -259,9 +247,9 @@ mainwhile: func localintersect2by2( set1 []uint16, set2 []uint16, - buffer []uint16) int { - - if (0 == len(set1)) || (0 == len(set2)) { + 
buffer []uint16, +) int { + if (len(set1) == 0) || (len(set2) == 0) { return 0 } k1 := 0 @@ -295,7 +283,6 @@ mainwhile: break } } - } else { // (set2[k2] == set1[k1]) buffer[pos] = s1 @@ -315,57 +302,57 @@ mainwhile: return pos } +// localintersect2by2Cardinality computes the cardinality of the intersection func localintersect2by2Cardinality( set1 []uint16, - set2 []uint16) int { - - if (0 == len(set1)) || (0 == len(set2)) { + set2 []uint16, +) int { + if (len(set1) == 0) || (len(set2) == 0) { return 0 } - k1 := 0 - k2 := 0 + index1 := 0 + index2 := 0 pos := 0 - s1 := set1[k1] - s2 := set2[k2] + value1 := set1[index1] + value2 := set2[index2] mainwhile: for { - if s2 < s1 { + if value2 < value1 { for { - k2++ - if k2 == len(set2) { + index2++ + if index2 == len(set2) { break mainwhile } - s2 = set2[k2] - if s2 >= s1 { + value2 = set2[index2] + if value2 >= value1 { break } } } - if s1 < s2 { + if value1 < value2 { for { - k1++ - if k1 == len(set1) { + index1++ + if index1 == len(set1) { break mainwhile } - s1 = set1[k1] - if s1 >= s2 { + value1 = set1[index1] + if value1 >= value2 { break } } - } else { // (set2[k2] == set1[k1]) pos++ - k1++ - if k1 == len(set1) { + index1++ + if index1 == len(set1) { break } - s1 = set1[k1] - k2++ - if k2 == len(set2) { + value1 = set1[index1] + index2++ + if index2 == len(set2) { break } - s2 = set2[k2] + value2 = set2[index2] } } return pos @@ -375,7 +362,8 @@ func advanceUntil( array []uint16, pos int, length int, - min uint16) int { + min uint16, +) int { lower := pos + 1 if lower >= length || array[lower] >= min { @@ -423,14 +411,13 @@ } } return upper - } func onesidedgallopingintersect2by2( smallset []uint16, largeset []uint16, - buffer []uint16) int { - + buffer []uint16, +) int { if 0 == len(smallset) { return 0 } @@ -478,8 +465,8 @@ mainwhile: func onesidedgallopingintersect2by2Cardinality( smallset []uint16, - largeset []uint16) int { - + largeset []uint16, +) int { if 0 == len(smallset) { return 0 } @@ -548,3 +535,131 @@ func binarySearch(array []uint16, ikey uint16) int { } return -(low + 1) } + +// searchResult provides information about a search request. +// The values will depend on the context of the search +type searchResult struct { + value uint16 + index int + exactMatch bool +} + +// notFound returns a bool depending on the search context +// For cases `previousValue` and `nextValue` if target is present in the slice +// this function will return `false` otherwise `true` +// For `nextAbsentValue` and `previousAbsentValue` this will only return `false` +func (sr *searchResult) notFound() bool { + return !sr.exactMatch +} + +// outOfBounds indicates whether the target was outside the lower and upper bounds of the container +func (sr *searchResult) outOfBounds() bool { + return sr.index <= -1 +} + +// binarySearchUntil is a helper function around binarySearchUntilWithBounds +// The user does not have to pass in the lower and upper bound +// The lower bound is taken to be `0` and the upper bound `len(array)-1` +func binarySearchUntil(array []uint16, target uint16) searchResult { + return binarySearchUntilWithBounds(array, target, 0, len(array)-1) } + +// binarySearchUntilWithBounds returns a `searchResult`. +// If an exact match is found the `searchResult{target, <index>, true}` will be returned, where `<index>` is +// `target`'s index in `array`, and `result.notFound()` evaluates to `false`.
+// If a match is not found, but `target` was in-bounds then the result.index will be the closest smaller value +// Example: [ 8,9,11,12] if the target was 10, then `searchResult{9, 1, false}` will be returned. +// If `target` was out of bounds `searchResult{0, -1, false}` will be returned. +func binarySearchUntilWithBounds(array []uint16, target uint16, lowIndex int, maxIndex int) searchResult { + highIndex := maxIndex + + closestIndex := -1 + + if target < array[lowIndex] { + return searchResult{0, closestIndex, false} + } + + if target > array[maxIndex] { + return searchResult{0, len(array), false} + } + + for lowIndex <= highIndex { + middleIndex := (lowIndex + highIndex) / 2 + middleValue := array[middleIndex] + + if middleValue == target { + return searchResult{middleValue, middleIndex, true} + } + + if target < middleValue { + + if middleIndex > 0 && target > array[middleIndex-1] { + return searchResult{array[middleIndex-1], middleIndex - 1, false} + } + + highIndex = middleIndex + } else { + if middleIndex < maxIndex && target < array[middleIndex+1] { + return searchResult{middleValue, middleIndex, false} + } + lowIndex = middleIndex + 1 + } + + } + + return searchResult{array[closestIndex], closestIndex, false} +} + +// binarySearchPast is a wrapper around binarySearchPastWithBounds +// The user does not have to pass in the lower and upper bound +// The lower bound is taken to be `0` and the upper bound `len(array)-1` +func binarySearchPast(array []uint16, target uint16) searchResult { + return binarySearchPastWithBounds(array, target, 0, len(array)-1) +} + +// binarySearchPastWithBounds looks for the smallest value larger than or equal to `target` +// If `target` is out of bounds a `searchResult` indicating out of bounds is returned +// `target` does not have to exist in the slice. +// +// Example: +// Suppose the slice is [...10,13...] 
with `target` equal to 11 +// The searchResult will have searchResult.value = 13 +func binarySearchPastWithBounds(array []uint16, target uint16, lowIndex int, maxIndex int) searchResult { + highIndex := maxIndex + + closestIndex := -1 + + if target < array[lowIndex] { + return searchResult{0, closestIndex, false} + } + + if target > array[maxIndex] { + return searchResult{0, len(array), false} + } + + for lowIndex <= highIndex { + middleIndex := (lowIndex + highIndex) / 2 + middleValue := array[middleIndex] + + if middleValue == target { + return searchResult{middleValue, middleIndex, true} + } + + if target < middleValue { + + if middleIndex > 0 && target > array[middleIndex-1] { + return searchResult{array[middleIndex], middleIndex, false} + } + + highIndex = middleIndex + } else { + if middleIndex < maxIndex && target < array[middleIndex+1] { + return searchResult{array[middleIndex+1], middleIndex + 1, false} + } + lowIndex = middleIndex + 1 + } + + } + + return searchResult{array[closestIndex], closestIndex, false} +} diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.go b/vendor/github.com/RoaringBitmap/roaring/v2/setutil_arm64.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/setutil_arm64.go rename to vendor/github.com/RoaringBitmap/roaring/v2/setutil_arm64.go diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.s b/vendor/github.com/RoaringBitmap/roaring/v2/setutil_arm64.s similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/setutil_arm64.s rename to vendor/github.com/RoaringBitmap/roaring/v2/setutil_arm64.s diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_generic.go b/vendor/github.com/RoaringBitmap/roaring/v2/setutil_generic.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/setutil_generic.go rename to vendor/github.com/RoaringBitmap/roaring/v2/setutil_generic.go diff --git a/vendor/github.com/RoaringBitmap/roaring/shortiterator.go b/vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/shortiterator.go rename to vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go diff --git a/vendor/github.com/RoaringBitmap/roaring/smat.go b/vendor/github.com/RoaringBitmap/roaring/v2/smat.go similarity index 100% rename from vendor/github.com/RoaringBitmap/roaring/smat.go rename to vendor/github.com/RoaringBitmap/roaring/v2/smat.go diff --git a/vendor/github.com/RoaringBitmap/roaring/util.go b/vendor/github.com/RoaringBitmap/roaring/v2/util.go similarity index 97% rename from vendor/github.com/RoaringBitmap/roaring/util.go rename to vendor/github.com/RoaringBitmap/roaring/v2/util.go index 48b9d5a10..f58a86b2e 100644 --- a/vendor/github.com/RoaringBitmap/roaring/util.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/util.go @@ -52,6 +52,7 @@ func fill(arr []uint64, val uint64) { arr[i] = val } } + func fillRange(arr []uint64, start, end int, val uint64) { for i := start; i < end; i++ { arr[i] = val @@ -112,10 +113,19 @@ func fillArrayXOR(container []uint16, bitmap1, bitmap2 []uint64) { func highbits(x uint32) uint16 { return uint16(x >> 16) } + func lowbits(x uint32) uint16 { return uint16(x & maxLowBit) } +func combineLoHi16(lob uint16, hob uint16) uint32 { + return combineLoHi32(uint32(lob), uint32(hob)) +} + +func combineLoHi32(lob uint32, hob uint32) uint32 { + return uint32(lob) | (hob << 16) +} + const maxLowBit = 0xFFFF func flipBitmapRange(bitmap []uint64, start int, end int) { @@ 
-146,7 +156,6 @@ func resetBitmapRange(bitmap []uint64, start int, end int) { bitmap[i] = 0 } bitmap[endword] &= ^(^uint64(0) >> (uint(-end) % 64)) - } func setBitmapRange(bitmap []uint64, start int, end int) { @@ -242,7 +251,6 @@ func selectBitPosition(w uint64, j int) int { } } return seen + int(counter) - } func panicOn(err error) { diff --git a/vendor/github.com/bits-and-blooms/bitset/README.md b/vendor/github.com/bits-and-blooms/bitset/README.md index 848234e2f..b245facb7 100644 --- a/vendor/github.com/bits-and-blooms/bitset/README.md +++ b/vendor/github.com/bits-and-blooms/bitset/README.md @@ -12,7 +12,7 @@ This library is part of the [awesome go collection](https://github.com/avelino/a * [beego](https://github.com/beego/beego) * [CubeFS](https://github.com/cubefs/cubefs) * [Amazon EKS Distro](https://github.com/aws/eks-distro) -* [sourcegraph](https://github.com/sourcegraph/sourcegraph) +* [sourcegraph](https://github.com/sourcegraph/sourcegraph-public-snapshot) * [torrent](https://github.com/anacrolix/torrent) @@ -25,7 +25,7 @@ It provides methods for setting, clearing, flipping, and testing individual inte But it also provides set intersection, union, difference, complement, and symmetric operations, as well as tests to check whether any, all, or no bits are set, and querying a bitset's current length and number of positive bits. -BitSets are expanded to the size of the largest set bit; the memory allocation is approximately Max bits, where Max is the largest set bit. BitSets are never shrunk. On creation, a hint can be given for the number of bits that will be used. +BitSets are expanded to the size of the largest set bit; the memory allocation is approximately Max bits, where Max is the largest set bit. BitSets are never shrunk automatically, but `Shrink` and `Compact` methods are available. On creation, a hint can be given for the number of bits that will be used. Many of the methods, including Set, Clear, and Flip, return a BitSet pointer, which allows for chaining. @@ -69,6 +69,13 @@ func main() { } ``` +If you have Go 1.23 or better, you can iterate over the set bits like so: + +```go +for i := range b.EachSet() {} +``` + + Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc @@ -125,13 +132,20 @@ E.g., ## Memory Usage -The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring). +The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](https://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring). -## Implementation Note +The `roaring` library allows you to go back and forth between compressed Roaring bitmaps and the conventional bitset instances: +```Go + mybitset := roaringbitmap.ToBitSet() + newroaringbitmap := roaring.FromBitSet(mybitset) +``` -Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, which might be removed. 
-It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped. +### Goroutine safety + +In general, it's not safe to access the same BitSet using different goroutines--they are unsynchronized for performance. + +Should you want to access a BitSet from more than one goroutine, you should provide synchronization. Typically this is done by using channels to pass the *BitSet around (in Go style; so there is only ever one owner), or by using `sync.Mutex` to serialize operations on BitSets. ## Installation diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset.go b/vendor/github.com/bits-and-blooms/bitset/bitset.go index 8fb9e9fa2..46d05b9ed 100644 --- a/vendor/github.com/bits-and-blooms/bitset/bitset.go +++ b/vendor/github.com/bits-and-blooms/bitset/bitset.go @@ -44,17 +44,21 @@ import ( "errors" "fmt" "io" + "math/bits" "strconv" ) // the wordSize of a bit set -const wordSize = uint(64) +const wordSize = 64 // the wordSize of a bit set in bytes const wordBytes = wordSize / 8 +// wordMask is wordSize-1, used for bit indexing in a word +const wordMask = wordSize - 1 + // log2WordSize is lg(wordSize) -const log2WordSize = uint(6) +const log2WordSize = 6 // allBits has every bit set const allBits uint64 = 0xffffffffffffffff @@ -68,9 +72,16 @@ var base64Encoding = base64.URLEncoding // Base64StdEncoding Marshal/Unmarshal BitSet with base64.StdEncoding(Default: base64.URLEncoding) func Base64StdEncoding() { base64Encoding = base64.StdEncoding } -// LittleEndian Marshal/Unmarshal Binary as Little Endian(Default: binary.BigEndian) +// LittleEndian sets Marshal/Unmarshal Binary as Little Endian (Default: binary.BigEndian) func LittleEndian() { binaryOrder = binary.LittleEndian } +// BigEndian sets Marshal/Unmarshal Binary as Big Endian (Default: binary.BigEndian) +func BigEndian() { binaryOrder = binary.BigEndian } + +// BinaryOrder returns the current binary order, see also LittleEndian() +// and BigEndian() to change the order. +func BinaryOrder() binary.ByteOrder { return binaryOrder } + // A BitSet is a set of bits. The zero value of a BitSet is an empty set of length 0. type BitSet struct { length uint @@ -94,41 +105,63 @@ func (b *BitSet) SetBitsetFrom(buf []uint64) { b.set = buf } -// From is a constructor used to create a BitSet from an array of integers +// From is a constructor used to create a BitSet from an array of words func From(buf []uint64) *BitSet { return FromWithLength(uint(len(buf))*64, buf) } -// FromWithLength constructs from an array of integers and length. -func FromWithLength(len uint, set []uint64) *BitSet { - return &BitSet{len, set} +// FromWithLength constructs from an array of words and length in bits. +// This function is for advanced users, most users should prefer +// the From function. +// As a user of FromWithLength, you are responsible for ensuring +// that the length is correct: your slice should have length at +// least (length+63)/64 in 64-bit words. +func FromWithLength(length uint, set []uint64) *BitSet { + if len(set) < wordsNeeded(length) { + panic("BitSet.FromWithLength: slice is too short") + } + return &BitSet{length, set} } -// Bytes returns the bitset as array of integers +// Bytes returns the bitset as array of 64-bit words, giving direct access to the internal representation. +// It is not a copy, so changes to the returned slice will affect the bitset. +// It is meant for advanced users. +// +// Deprecated: Bytes is deprecated. 
Use [BitSet.Words] instead. func (b *BitSet) Bytes() []uint64 { return b.set } +// Words returns the bitset as array of 64-bit words, giving direct access to the internal representation. +// It is not a copy, so changes to the returned slice will affect the bitset. +// It is meant for advanced users. +func (b *BitSet) Words() []uint64 { + return b.set +} + // wordsNeeded calculates the number of words needed for i bits func wordsNeeded(i uint) int { - if i > (Cap() - wordSize + 1) { + if i > (Cap() - wordMask) { return int(Cap() >> log2WordSize) } - return int((i + (wordSize - 1)) >> log2WordSize) + return int((i + wordMask) >> log2WordSize) } // wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity. -// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap). +// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing BitSet). func wordsNeededUnbound(i uint) int { - return int((i + (wordSize - 1)) >> log2WordSize) + return (int(i) + wordMask) >> log2WordSize } // wordsIndex calculates the index of words in a `uint64` func wordsIndex(i uint) uint { - return i & (wordSize - 1) + return i & wordMask } -// New creates a new BitSet with a hint that length bits will be required +// New creates a new BitSet with a hint that length bits will be required. +// The memory usage is at least length/8 bytes. +// In case of allocation failure, the function will return a BitSet with zero +// capacity. func New(length uint) (bset *BitSet) { defer func() { if r := recover(); r != nil { @@ -147,13 +180,30 @@ func New(length uint) (bset *BitSet) { return bset } +// MustNew creates a new BitSet with the given length bits. +// It panics if length exceeds the possible capacity or by a lack of memory. +func MustNew(length uint) (bset *BitSet) { + if length >= Cap() { + panic("You are exceeding the capacity") + } + + return &BitSet{ + length, + make([]uint64, wordsNeeded(length)), // may panic on lack of memory + } +} + // Cap returns the total possible capacity, or number of bits +// that can be stored in the BitSet theoretically. Under 32-bit system, +// it is 4294967295 and under 64-bit system, it is 18446744073709551615. +// Note that this is further limited by the maximum allocation size in Go, +// and your available memory, as any Go data structure. func Cap() uint { return ^uint(0) } // Len returns the number of bits in the BitSet. -// Note the difference to method Count, see example. +// Note that it differ from Count function. func (b *BitSet) Len() uint { return b.length } @@ -184,12 +234,32 @@ func (b *BitSet) Test(i uint) bool { return b.set[i>>log2WordSize]&(1<> log2WordSize) + subWordIndex := wordsIndex(i) + + // The word that the index falls within, shifted so the index is at bit 0 + var firstWord, secondWord uint64 + if firstWordIndex < len(b.set) { + firstWord = b.set[firstWordIndex] >> subWordIndex + } + + // The next word, masked to only include the necessary bits and shifted to cover the + // top of the word + if (firstWordIndex + 1) < len(b.set) { + secondWord = b.set[firstWordIndex+1] << uint64(wordSize-subWordIndex) + } + + return firstWord | secondWord +} + // Set bit i to 1, the capacity of the bitset is automatically // increased accordingly. -// If i>= Cap(), this function will panic. 
// Warning: using a very large value for 'i' // may lead to a memory shortage and a panic: the caller is responsible // for providing sensible parameters in line with their memory capacity. +// The memory usage is at least slightly over i/8 bytes. func (b *BitSet) Set(i uint) *BitSet { if i >= b.length { // if we need more bits, make 'em b.extendSet(i) @@ -198,7 +268,7 @@ func (b *BitSet) Set(i uint) *BitSet { return b } -// Clear bit i to 0 +// Clear bit i to 0. This never cause a memory allocation. It is always safe. func (b *BitSet) Clear(i uint) *BitSet { if i >= b.length { return b @@ -208,7 +278,6 @@ func (b *BitSet) Clear(i uint) *BitSet { } // SetTo sets bit i to value. -// If i>= Cap(), this function will panic. // Warning: using a very large value for 'i' // may lead to a memory shortage and a panic: the caller is responsible // for providing sensible parameters in line with their memory capacity. @@ -220,7 +289,6 @@ func (b *BitSet) SetTo(i uint, value bool) *BitSet { } // Flip bit at i. -// If i>= Cap(), this function will panic. // Warning: using a very large value for 'i' // may lead to a memory shortage and a panic: the caller is responsible // for providing sensible parameters in line with their memory capacity. @@ -233,7 +301,6 @@ func (b *BitSet) Flip(i uint) *BitSet { } // FlipRange bit in [start, end). -// If end>= Cap(), this function will panic. // Warning: using a very large value for 'end' // may lead to a memory shortage and a panic: the caller is responsible // for providing sensible parameters in line with their memory capacity. @@ -241,23 +308,54 @@ func (b *BitSet) FlipRange(start, end uint) *BitSet { if start >= end { return b } + if end-1 >= b.length { // if we need more bits, make 'em b.extendSet(end - 1) } - var startWord uint = start >> log2WordSize - var endWord uint = end >> log2WordSize + + startWord := int(start >> log2WordSize) + endWord := int(end >> log2WordSize) + + // b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start)) + // e.g: + // start = 71, + // startWord = 1 + // wordsIndex(start) = 71 % 64 = 7 + // (^uint64(0) << 7) = 0b111111....11110000000 + // + // mask = ^(^uint64(0) << 7) = 0b000000....00001111111 + // + // flips the first 7 bits in b.set[1] and + // in the range loop, the b.set[1] gets again flipped + // so the two expressions flip results in a flip + // in b.set[1] from [7,63] + // + // handle startWord special, get's reflipped in range loop b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start)) - if endWord > 0 { - // bounds check elimination - data := b.set - _ = data[endWord-1] - for i := startWord; i < endWord; i++ { - data[i] = ^data[i] - } + + for idx := range b.set[startWord:endWord] { + b.set[startWord+idx] = ^b.set[startWord+idx] } - if end&(wordSize-1) != 0 { - b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end) + + // handle endWord special + // e.g. + // end = 135 + // endWord = 2 + // + // wordsIndex(-7) = 57 + // see the golang spec: + // "For unsigned integer values, the operations +, -, *, and << are computed + // modulo 2n, where n is the bit width of the unsigned integer's type." + // + // mask = ^uint64(0) >> 57 = 0b00000....0001111111 + // + // flips in b.set[2] from [0,7] + // + // is end at word boundary? + if idx := wordsIndex(-end); idx != 0 { + b.set[endWord] ^= ^uint64(0) >> wordsIndex(idx) } + return b } @@ -275,6 +373,7 @@ func (b *BitSet) FlipRange(start, end uint) *BitSet { // memory usage until the GC runs. 
Normally this should not be a problem, but if you // have an extremely large BitSet its important to understand that the old BitSet will // remain in memory until the GC frees it. +// If you are memory constrained, this function may cause a panic. func (b *BitSet) Shrink(lastbitindex uint) *BitSet { length := lastbitindex + 1 idx := wordsNeeded(length) @@ -294,6 +393,11 @@ func (b *BitSet) Shrink(lastbitindex uint) *BitSet { // Compact shrinks BitSet to so that we preserve all set bits, while minimizing // memory usage. Compact calls Shrink. +// A new slice is allocated to store the new bits, so you may see an increase in +// memory usage until the GC runs. Normally this should not be a problem, but if you +// have an extremely large BitSet its important to understand that the old BitSet will +// remain in memory until the GC frees it. +// If you are memory constrained, this function may cause a panic. func (b *BitSet) Compact() *BitSet { idx := len(b.set) - 1 for ; idx >= 0 && b.set[idx] == 0; idx-- { @@ -353,7 +457,8 @@ func (b *BitSet) InsertAt(idx uint) *BitSet { return b } -// String creates a string representation of the Bitmap +// String creates a string representation of the BitSet. It is only intended for +// human-readable output and not for serialization. func (b *BitSet) String() string { // follows code from https://github.com/RoaringBitmap/roaring var buffer bytes.Buffer @@ -415,6 +520,50 @@ func (b *BitSet) DeleteAt(i uint) *BitSet { return b } +// AppendTo appends all set bits to buf and returns the (maybe extended) buf. +// In case of allocation failure, the function will panic. +// +// See also [BitSet.AsSlice] and [BitSet.NextSetMany]. +func (b *BitSet) AppendTo(buf []uint) []uint { + // In theory, we could overflow uint, but in practice, we will not. 
+ for idx, word := range b.set { + for word != 0 { + // In theory idx<= len(b.set) { return 0, false } - w := b.set[x] - w = w >> wordsIndex(i) - if w != 0 { - return i + trailingZeroes64(w), true - } - x++ - // bounds check elimination in the loop - if x < 0 { - return 0, false - } - for x < len(b.set) { - if b.set[x] != 0 { - return uint(x)*wordSize + trailingZeroes64(b.set[x]), true - } - x++ + // process first (partial) word + word := b.set[x] >> wordsIndex(i) + if word != 0 { + return i + uint(bits.TrailingZeros64(word)), true } + + // process the following full words until next bit is set + // x < len(b.set), no out-of-bounds panic in following slice expression + x++ + for idx, word := range b.set[x:] { + if word != 0 { + return uint((x+idx)<> log2WordSize) if x >= len(b.set) || capacity == 0 { - return 0, myanswer[:0] + return 0, result[:0] } - skip := wordsIndex(i) - word := b.set[x] >> skip - myanswer = myanswer[:capacity] - size := int(0) + + // process first (partial) word + word := b.set[x] >> wordsIndex(i) + + size := 0 for word != 0 { - r := trailingZeroes64(word) - t := word & ((^word) + 1) - myanswer[size] = r + i + result[size] = i + uint(bits.TrailingZeros64(word)) + size++ if size == capacity { - goto End + return result[size-1], result[:size] } - word = word ^ t + + // clear the rightmost set bit + word &= word - 1 } + + // process the following full words + // x < len(b.set), no out-of-bounds panic in following slice expression x++ for idx, word := range b.set[x:] { for word != 0 { - r := trailingZeroes64(word) - t := word & ((^word) + 1) - myanswer[size] = r + (uint(x+idx) << 6) + result[size] = uint((x+idx)< 0 { - return myanswer[size-1], myanswer[:size] + return result[size-1], result[:size] } - return 0, myanswer[:0] + return 0, result[:0] } // NextClear returns the next clear bit from the specified index, @@ -517,31 +676,89 @@ func (b *BitSet) NextClear(i uint) (uint, bool) { if x >= len(b.set) { return 0, false } - w := b.set[x] - w = w >> wordsIndex(i) - wA := allBits >> wordsIndex(i) - index := i + trailingZeroes64(^w) - if w != wA && index < b.length { + + // process first (maybe partial) word + word := b.set[x] + word = word >> wordsIndex(i) + wordAll := allBits >> wordsIndex(i) + + index := i + uint(bits.TrailingZeros64(^word)) + if word != wordAll && index < b.length { return index, true } + + // process the following full words until next bit is cleared + // x < len(b.set), no out-of-bounds panic in following slice expression x++ - // bounds check elimination in the loop - if x < 0 { - return 0, false - } - for x < len(b.set) { - if b.set[x] != allBits { - index = uint(x)*wordSize + trailingZeroes64(^b.set[x]) + for idx, word := range b.set[x:] { + if word != allBits { + index = uint((x+idx)*wordSize + bits.TrailingZeros64(^word)) if index < b.length { return index, true } } - x++ + } + + return 0, false +} + +// PreviousSet returns the previous set bit from the specified index, +// including possibly the current index +// along with an error code (true = valid, false = no bit found i.e. 
all bits are clear) +func (b *BitSet) PreviousSet(i uint) (uint, bool) { + x := int(i >> log2WordSize) + if x >= len(b.set) { + return 0, false + } + word := b.set[x] + + // Clear the bits above the index + word = word & ((1 << (wordsIndex(i) + 1)) - 1) + if word != 0 { + return uint(x<= 0; x-- { + word = b.set[x] + if word != 0 { + return uint(x<> log2WordSize) + if x >= len(b.set) { + return 0, false + } + word := b.set[x] + + // Flip all bits and find the highest one bit + word = ^word + + // Clear the bits above the index + word = word & ((1 << (wordsIndex(i) + 1)) - 1) + + if word != 0 { + return uint(x<= 0; x-- { + word = b.set[x] + word = ^word + if word != 0 { + return uint(x< other.Count() && b.IsSuperSet(other) } -// DumpAsBits dumps a bit set as a string of bits +// DumpAsBits dumps a bit set as a string of bits. Following the usual convention in Go, +// the least significant bits are printed last (index 0 is at the end of the string). +// This is useful for debugging and testing. It is not suitable for serialization. func (b *BitSet) DumpAsBits() string { if b.set == nil { return "." @@ -959,18 +1193,18 @@ func (b *BitSet) DumpAsBits() string { // BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes. func (b *BitSet) BinaryStorageSize() int { - return int(wordBytes + wordBytes*uint(b.wordCount())) + return wordBytes + wordBytes*b.wordCount() } func readUint64Array(reader io.Reader, data []uint64) error { length := len(data) bufferSize := 128 - buffer := make([]byte, bufferSize*int(wordBytes)) + buffer := make([]byte, bufferSize*wordBytes) for i := 0; i < length; i += bufferSize { end := i + bufferSize if end > length { end = length - buffer = buffer[:wordBytes*uint(end-i)] + buffer = buffer[:wordBytes*(end-i)] } chunk := data[i:end] if _, err := io.ReadFull(reader, buffer); err != nil { @@ -985,12 +1219,12 @@ func readUint64Array(reader io.Reader, data []uint64) error { func writeUint64Array(writer io.Writer, data []uint64) error { bufferSize := 128 - buffer := make([]byte, bufferSize*int(wordBytes)) + buffer := make([]byte, bufferSize*wordBytes) for i := 0; i < len(data); i += bufferSize { end := i + bufferSize if end > len(data) { end = len(data) - buffer = buffer[:wordBytes*uint(end-i)] + buffer = buffer[:wordBytes*(end-i)] } chunk := data[i:end] for i, x := range chunk { @@ -1007,6 +1241,15 @@ func writeUint64Array(writer io.Writer, data []uint64) error { // WriteTo writes a BitSet to a stream. The format is: // 1. uint64 length // 2. []uint64 set +// The length is the number of bits in the BitSet. +// +// The set is a slice of uint64s containing between length and length + 63 bits. +// It is interpreted as a big-endian array of uint64s by default (see BinaryOrder()) +// meaning that the first 8 bits are stored at byte index 7, the next 8 bits are stored +// at byte index 6... the bits 64 to 71 are stored at byte index 8, etc. +// If you change the binary order, you need to do so for both reading and writing. +// We recommend using the default binary order. +// // Upon success, the number of bytes written is returned. // // Performance: if this function is used to write to a disk or network @@ -1037,6 +1280,7 @@ func (b *BitSet) WriteTo(stream io.Writer) (int64, error) { // The format is: // 1. uint64 length // 2. []uint64 set +// See WriteTo for details. // Upon success, the number of bytes read is returned. // If the current BitSet is not large enough to hold the data, // it is extended. 
In case of error, the BitSet is either @@ -1088,6 +1332,7 @@ func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) { } // MarshalBinary encodes a BitSet into a binary form and returns the result. +// Please see WriteTo for details. func (b *BitSet) MarshalBinary() ([]byte, error) { var buf bytes.Buffer _, err := b.WriteTo(&buf) @@ -1099,6 +1344,7 @@ func (b *BitSet) MarshalBinary() ([]byte, error) { } // UnmarshalBinary decodes the binary form generated by MarshalBinary. +// Please see WriteTo for details. func (b *BitSet) UnmarshalBinary(data []byte) error { buf := bytes.NewReader(data) _, err := b.ReadFrom(buf) @@ -1135,3 +1381,376 @@ func (b *BitSet) UnmarshalJSON(data []byte) error { _, err = b.ReadFrom(bytes.NewReader(buf)) return err } + +// Rank returns the number of set bits up to and including the index +// that are set in the bitset. +// See https://en.wikipedia.org/wiki/Ranking#Ranking_in_statistics +func (b *BitSet) Rank(index uint) (rank uint) { + index++ // Rank is up to and including + + // needed more than once + length := len(b.set) + + // TODO: built-in min requires go1.21 or later + // idx := min(int(index>>6), len(b.set)) + idx := int(index >> 6) + if idx > length { + idx = length + } + + // sum up the popcounts until idx ... + // TODO: cannot range over idx (...): requires go1.22 or later + // for j := range idx { + for j := 0; j < idx; j++ { + if w := b.set[j]; w != 0 { + rank += uint(bits.OnesCount64(w)) + } + } + + // ... plus partial word at idx, + // make Rank inlineable and faster in the end + // don't test index&63 != 0, just add, less branching + if idx < length { + rank += uint(bits.OnesCount64(b.set[idx] << (64 - index&63))) + } + + return +} + +// Select returns the index of the jth set bit, where j is the argument. +// The caller is responsible to ensure that 0 <= j < Count(): when j is +// out of range, the function returns the length of the bitset (b.length). +// +// Note that this function differs in convention from the Rank function which +// returns 1 when ranking the smallest value. We follow the conventional +// textbook definition of Select and Rank. +func (b *BitSet) Select(index uint) uint { + leftover := index + for idx, word := range b.set { + w := uint(bits.OnesCount64(word)) + if w > leftover { + return uint(idx)*64 + select64(word, leftover) + } + leftover -= w + } + return b.length +} + +// top detects the top bit set +func (b *BitSet) top() (uint, bool) { + for idx := len(b.set) - 1; idx >= 0; idx-- { + if word := b.set[idx]; word != 0 { + return uint(idx<= b.length { + b.length = top + bits + 1 + } + + pad, idx := top%wordSize, top>>log2WordSize + shift, pages := bits%wordSize, bits>>log2WordSize + if bits%wordSize == 0 { // happy case: just add pages + copy(dst[pages:nsize], b.set) + } else { + if pad+shift >= wordSize { + dst[idx+pages+1] = b.set[idx] >> (wordSize - shift) + } + + for i := int(idx); i >= 0; i-- { + if i > 0 { + dst[i+int(pages)] = (b.set[i] << shift) | (b.set[i-1] >> (wordSize - shift)) + } else { + dst[i+int(pages)] = b.set[i] << shift + } + } + } + + // zeroing extra pages + for i := 0; i < int(pages); i++ { + dst[i] = 0 + } + + b.set = dst +} + +// ShiftRight shifts the bitset like >> operation would do. 
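(Editor's aside, not part of the patch: a usage sketch for the Rank and Select methods added above. As their comments note, Rank(i) counts set bits in [0, i] while Select(j) is zero-based, and an out-of-range j makes Select return the bitset's length. The ShiftRight implementation continues below.)

```go
package main

import (
	"fmt"

	"github.com/bits-and-blooms/bitset"
)

func main() {
	b := bitset.New(100)
	b.Set(3).Set(17).Set(42)

	// Rank counts the set bits up to and including the index.
	fmt.Println(b.Rank(17)) // 2 (bits 3 and 17)

	// Select is zero-based: Select(0) is the smallest set bit.
	fmt.Println(b.Select(1)) // 17
	// Out-of-range arguments return the bitset's length.
	fmt.Println(b.Select(3)) // 100
}
```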
+func (b *BitSet) ShiftRight(bits uint) { + panicIfNull(b) + + if bits == 0 { + return + } + + top, ok := b.top() + if !ok { + return + } + + if bits >= top { + b.set = make([]uint64, wordsNeeded(b.length)) + return + } + + pad, idx := top%wordSize, top>>log2WordSize + shift, pages := bits%wordSize, bits>>log2WordSize + if bits%wordSize == 0 { // happy case: just clear pages + b.set = b.set[pages:] + b.length -= pages * wordSize + } else { + for i := 0; i <= int(idx-pages); i++ { + if i < int(idx-pages) { + b.set[i] = (b.set[i+int(pages)] >> shift) | (b.set[i+int(pages)+1] << (wordSize - shift)) + } else { + b.set[i] = b.set[i+int(pages)] >> shift + } + } + + if pad < shift { + b.set[int(idx-pages)] = 0 + } + } + + for i := int(idx-pages) + 1; i <= int(idx); i++ { + b.set[i] = 0 + } +} + +// OnesBetween returns the number of set bits in the range [from, to). +// The range is inclusive of 'from' and exclusive of 'to'. +// Returns 0 if from >= to. +func (b *BitSet) OnesBetween(from, to uint) uint { + panicIfNull(b) + + if from >= to { + return 0 + } + + // Calculate indices and masks for the starting and ending words + startWord := from >> log2WordSize // Divide by wordSize + endWord := to >> log2WordSize + startOffset := from & wordMask // Mod wordSize + endOffset := to & wordMask + + // Case 1: Bits lie within a single word + if startWord == endWord { + // Create mask for bits between from and to + mask := uint64((1<= startOffset + count = uint(bits.OnesCount64(b.set[startWord] & startMask)) + + // 2b: Count all bits in complete words between start and end + if endWord > startWord+1 { + count += uint(popcntSlice(b.set[startWord+1 : endWord])) + } + + // 2c: Count bits in last word (from start of word to endOffset) + if endOffset > 0 { + endMask := uint64(1<> log2WordSize + bitOffset := outPos & wordMask + + // Write extracted bits, handling word boundary crossing + dst.set[wordIdx] |= extracted << bitOffset + if bitOffset+bitsExtracted > wordSize { + dst.set[wordIdx+1] = extracted >> (wordSize - bitOffset) + } + + outPos += bitsExtracted + } +} + +// Deposit creates a new BitSet and deposits bits according to a mask. +// See DepositTo for details. +func (b *BitSet) Deposit(mask *BitSet) *BitSet { + dst := New(mask.length) + b.DepositTo(mask, dst) + return dst +} + +// DepositTo spreads bits from a compacted form in the BitSet into positions +// specified by mask in dst. This is the inverse operation of Extract. +// +// For example, if mask has bits set at positions 1,4,5, then DepositTo will +// take consecutive bits 0,1,2 from the source BitSet and place them into +// positions 1,4,5 in the destination BitSet. 
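(Editor's aside, not part of the patch: Extract compacts the bits found at the mask's set positions into consecutive low bits, and Deposit spreads them back out, the PEXT/PDEP idea lifted to whole bitsets. A usage sketch, assuming the Extract convenience wrapper added alongside ExtractTo earlier in this file; the DepositTo implementation follows below.)

```go
package main

import (
	"fmt"

	"github.com/bits-and-blooms/bitset"
)

func main() {
	// mask selects positions 1, 4 and 5
	mask := bitset.New(8)
	mask.Set(1).Set(4).Set(5)

	src := bitset.New(8)
	src.Set(1).Set(5)

	// Extract: position 1 -> bit 0, position 4 -> bit 1, position 5 -> bit 2.
	compact := src.Extract(mask)
	fmt.Println(compact.Test(0), compact.Test(1), compact.Test(2)) // true false true

	// Deposit inverts it: bits 0, 1, 2 go back to positions 1, 4, 5.
	restored := compact.Deposit(mask)
	fmt.Println(restored.Test(1), restored.Test(4), restored.Test(5)) // true false true
}
```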
+func (b *BitSet) DepositTo(mask *BitSet, dst *BitSet) { + panicIfNull(b) + panicIfNull(mask) + panicIfNull(dst) + + if len(dst.set) == 0 || len(mask.set) == 0 || len(b.set) == 0 { + return + } + + inPos := uint(0) + length := len(mask.set) + if len(dst.set) < length { + length = len(dst.set) + } + + // Process each word + for i := 0; i < length; i++ { + if mask.set[i] == 0 { + continue // Skip words with no bits to deposit + } + + // Calculate source word index + wordIdx := inPos >> log2WordSize + if wordIdx >= uint(len(b.set)) { + break // No more source bits available + } + + // Get source bits, handling word boundary crossing + sourceBits := b.set[wordIdx] + bitOffset := inPos & wordMask + if wordIdx+1 < uint(len(b.set)) && bitOffset != 0 { + // Combine bits from current and next word + sourceBits = (sourceBits >> bitOffset) | + (b.set[wordIdx+1] << (wordSize - bitOffset)) + } else { + sourceBits >>= bitOffset + } + + // Deposit bits according to mask + dst.set[i] = (dst.set[i] &^ mask.set[i]) | pdep(sourceBits, mask.set[i]) + inPos += uint(bits.OnesCount64(mask.set[i])) + } +} + +//go:generate go run cmd/pextgen/main.go -pkg=bitset + +func pext(w, m uint64) (result uint64) { + var outPos uint + + // Process byte by byte + for i := 0; i < 8; i++ { + shift := i << 3 // i * 8 using bit shift + b := uint8(w >> shift) + mask := uint8(m >> shift) + + extracted := pextLUT[b][mask] + bits := popLUT[mask] + + result |= uint64(extracted) << outPos + outPos += uint(bits) + } + + return result +} + +func pdep(w, m uint64) (result uint64) { + var inPos uint + + // Process byte by byte + for i := 0; i < 8; i++ { + shift := i << 3 // i * 8 using bit shift + mask := uint8(m >> shift) + bits := popLUT[mask] + + // Get the bits we'll deposit from the source + b := uint8(w >> inPos) + + // Deposit them according to the mask for this byte + deposited := pdepLUT[b][mask] + + // Add to result + result |= uint64(deposited) << shift + inPos += uint(bits) + } + + return result +} diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset_iter.go b/vendor/github.com/bits-and-blooms/bitset/bitset_iter.go new file mode 100644 index 000000000..79bf8a018 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/bitset_iter.go @@ -0,0 +1,23 @@ +//go:build go1.23 +// +build go1.23 + +package bitset + +import ( + "iter" + "math/bits" +) + +func (b *BitSet) EachSet() iter.Seq[uint] { + return func(yield func(uint) bool) { + for wordIndex, word := range b.set { + idx := 0 + for trail := bits.TrailingZeros64(word); trail != 64; trail = bits.TrailingZeros64(word >> idx) { + if !yield(uint(wordIndex<> 1) & 0x5555555555555555 - x = (x>>2)&0x3333333333333333 + x&0x3333333333333333 - x += x >> 4 - x &= 0x0f0f0f0f0f0f0f0f - x *= 0x0101010101010101 - return x >> 56 -} +import "math/bits" -func popcntSliceGo(s []uint64) uint64 { - cnt := uint64(0) +func popcntSlice(s []uint64) uint64 { + var cnt int for _, x := range s { - cnt += popcount(x) + cnt += bits.OnesCount64(x) } - return cnt + return uint64(cnt) } -func popcntMaskSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] &^ m[i]) +func popcntMaskSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") } - return cnt + for i := range s { + cnt += bits.OnesCount64(s[i] &^ m[i]) + } + return uint64(cnt) } -func popcntAndSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] & m[i]) 
+func popcntAndSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") } - return cnt + for i := range s { + cnt += bits.OnesCount64(s[i] & m[i]) + } + return uint64(cnt) } -func popcntOrSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] | m[i]) +func popcntOrSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") } - return cnt + for i := range s { + cnt += bits.OnesCount64(s[i] | m[i]) + } + return uint64(cnt) } -func popcntXorSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] ^ m[i]) +func popcntXorSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") } - return cnt + for i := range s { + cnt += bits.OnesCount64(s[i] ^ m[i]) + } + return uint64(cnt) } diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go deleted file mode 100644 index 7855c04b5..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go +++ /dev/null @@ -1,62 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -package bitset - -import "math/bits" - -func popcntSlice(s []uint64) uint64 { - var cnt int - for _, x := range s { - cnt += bits.OnesCount64(x) - } - return uint64(cnt) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] &^ m[i]) - } - return uint64(cnt) -} - -func popcntAndSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] & m[i]) - } - return uint64(cnt) -} - -func popcntOrSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] | m[i]) - } - return uint64(cnt) -} - -func popcntXorSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] ^ m[i]) - } - return uint64(cnt) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go deleted file mode 100644 index 116e04440..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go +++ /dev/null @@ -1,68 +0,0 @@ -//go:build !go1.9 && amd64 && !appengine -// +build !go1.9,amd64,!appengine - -package bitset - -// *** the following functions are defined in popcnt_amd64.s - -//go:noescape - -func hasAsm() bool - -// useAsm is a flag used to select the GO or ASM implementation of the popcnt function -var useAsm = hasAsm() - -//go:noescape - -func popcntSliceAsm(s []uint64) uint64 - -//go:noescape - -func popcntMaskSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntAndSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntOrSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntXorSliceAsm(s, m []uint64) 
uint64 - -func popcntSlice(s []uint64) uint64 { - if useAsm { - return popcntSliceAsm(s) - } - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - if useAsm { - return popcntMaskSliceAsm(s, m) - } - return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - if useAsm { - return popcntAndSliceAsm(s, m) - } - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - if useAsm { - return popcntOrSliceAsm(s, m) - } - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - if useAsm { - return popcntXorSliceAsm(s, m) - } - return popcntXorSliceGo(s, m) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s deleted file mode 100644 index 666c0dcc1..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s +++ /dev/null @@ -1,104 +0,0 @@ -// +build !go1.9 -// +build amd64,!appengine - -TEXT ·hasAsm(SB),4,$0-1 -MOVQ $1, AX -CPUID -SHRQ $23, CX -ANDQ $1, CX -MOVB CX, ret+0(FP) -RET - -#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2 - -TEXT ·popcntSliceAsm(SB),4,$0-32 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntSliceEnd -popcntSliceLoop: -BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX -ADDQ DX, AX -ADDQ $8, SI -LOOP popcntSliceLoop -popcntSliceEnd: -MOVQ AX, ret+24(FP) -RET - -TEXT ·popcntMaskSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntMaskSliceEnd -MOVQ m+24(FP), DI -popcntMaskSliceLoop: -MOVQ (DI), DX -NOTQ DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntMaskSliceLoop -popcntMaskSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntAndSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntAndSliceEnd -MOVQ m+24(FP), DI -popcntAndSliceLoop: -MOVQ (DI), DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntAndSliceLoop -popcntAndSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntOrSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntOrSliceEnd -MOVQ m+24(FP), DI -popcntOrSliceLoop: -MOVQ (DI), DX -ORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntOrSliceLoop -popcntOrSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntXorSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntXorSliceEnd -MOVQ m+24(FP), DI -popcntXorSliceLoop: -MOVQ (DI), DX -XORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntXorSliceLoop -popcntXorSliceEnd: -MOVQ AX, ret+48(FP) -RET diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go deleted file mode 100644 index 9e0ad464e..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go +++ /dev/null @@ -1,25 +0,0 @@ -//go:build !go1.9 && (!amd64 || appengine) -// +build !go1.9 -// +build !amd64 appengine - -package bitset - -func popcntSlice(s []uint64) uint64 { - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - return popcntXorSliceGo(s, m) -} 
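Illustrative sketch (not part of the upstream patch): how the Rank and Select
methods introduced above interact, assuming the vendored bits-and-blooms/bitset
API shown in this diff. The program below is hypothetical and only demonstrates
the one-based Rank vs. zero-based Select conventions documented in the code.

package main

import (
	"fmt"

	"github.com/bits-and-blooms/bitset"
)

func main() {
	b := bitset.New(64)
	b.Set(3).Set(10).Set(20)

	// Rank counts set bits up to and including the index: positions 3 and 10.
	fmt.Println(b.Rank(10)) // 2

	// Select returns the index of the jth set bit, zero-based: the second set bit.
	fmt.Println(b.Select(1)) // 10
}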
diff --git a/vendor/github.com/bits-and-blooms/bitset/select.go b/vendor/github.com/bits-and-blooms/bitset/select.go new file mode 100644 index 000000000..a43c6bd6a --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/select.go @@ -0,0 +1,47 @@ +package bitset + +import "math/bits" + +func select64(w uint64, j uint) uint { + seen := 0 + // Divide 64bit + part := w & 0xFFFFFFFF + n := uint(bits.OnesCount64(part)) + if n <= j { + part = w >> 32 + seen += 32 + j -= n + } + ww := part + + // Divide 32bit + part = ww & 0xFFFF + + n = uint(bits.OnesCount64(part)) + if n <= j { + part = ww >> 16 + seen += 16 + j -= n + } + ww = part + + // Divide 16bit + part = ww & 0xFF + n = uint(bits.OnesCount64(part)) + if n <= j { + part = ww >> 8 + seen += 8 + j -= n + } + ww = part + + // Lookup in final byte + counter := 0 + for ; counter < 8; counter++ { + j -= uint((ww >> counter) & 1) + if j+1 == 0 { + break + } + } + return uint(seen + counter) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go deleted file mode 100644 index 12336e76a..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go +++ /dev/null @@ -1,15 +0,0 @@ -//go:build !go1.9 -// +build !go1.9 - -package bitset - -var deBruijn = [...]byte{ - 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, - 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, - 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, - 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, -} - -func trailingZeroes64(v uint64) uint { - return uint(deBruijn[((v&-v)*0x03f79d71b4ca8b09)>>58]) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go deleted file mode 100644 index cfb0a8409..000000000 --- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -package bitset - -import "math/bits" - -func trailingZeroes64(v uint64) uint { - return uint(bits.TrailingZeros64(v)) -} diff --git a/vendor/github.com/blevesearch/bleve/v2/.travis.yml b/vendor/github.com/blevesearch/bleve/v2/.travis.yml index 7b7297afe..e6fa002e6 100644 --- a/vendor/github.com/blevesearch/bleve/v2/.travis.yml +++ b/vendor/github.com/blevesearch/bleve/v2/.travis.yml @@ -3,9 +3,9 @@ sudo: false language: go go: - - "1.12.x" - - "1.13.x" - - "1.14.x" + - "1.21.x" + - "1.22.x" + - "1.23.x" script: - go get golang.org/x/tools/cmd/cover @@ -17,9 +17,9 @@ script: - go vet $(go list ./... | grep -v vendor/) - go test ./test -v -indexType scorch - errcheck -ignorepkg fmt $(go list ./... 
| grep -v vendor/); - - docs/project-code-coverage.sh - - docs/build_children.sh + - scripts/project-code-coverage.sh + - scripts/build_children.sh notifications: email: - - marty.schoch@gmail.com + - fts-team@couchbase.com diff --git a/vendor/github.com/blevesearch/bleve/v2/README.md b/vendor/github.com/blevesearch/bleve/v2/README.md index fa75ef3db..ef1a6ddda 100644 --- a/vendor/github.com/blevesearch/bleve/v2/README.md +++ b/vendor/github.com/blevesearch/bleve/v2/README.md @@ -1,11 +1,11 @@ # ![bleve](docs/bleve.png) bleve -[![Tests](https://github.com/blevesearch/bleve/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) +[![Tests](https://github.com/blevesearch/bleve/actions/workflows/tests.yml/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve/actions/workflows/tests.yml?query=event%3Apush+branch%3Amaster) [![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) -[![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) -[![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Go Reference](https://pkg.go.dev/badge/github.com/blevesearch/bleve/v2.svg)](https://pkg.go.dev/github.com/blevesearch/bleve/v2) +[![Join the chat](https://badges.gitter.im/join_chat.svg)](https://app.gitter.im/#/room/#blevesearch_bleve:gitter.im) [![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve) -[![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve) +[![Go Report Card](https://goreportcard.com/badge/github.com/blevesearch/bleve/v2)](https://goreportcard.com/report/github.com/blevesearch/bleve/v2) [![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) @@ -24,7 +24,8 @@ A modern indexing + search library in GO * [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/) * [geo spatial search](https://github.com/blevesearch/bleve/blob/master/geo/README.md) * approximate k-nearest neighbors via [vector search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md) -* [tf-idf](https://en.wikipedia.org/wiki/Tf-idf) scoring + * [synonym search](https://github.com/blevesearch/bleve/blob/master/docs/synonyms.md) +* [tf-idf](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#tf-idf) / [bm25](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#bm25) scoring models * Hybrid search: exact + semantic * Query time boosting * Search result match highlighting with document fragments @@ -42,7 +43,7 @@ message := struct{ Body string }{ Id: "example", - From: "marty.schoch@gmail.com", + From: "xyz@couchbase.com", Body: "bleve indexing is easy", } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go index 5e28c95a5..5df940e5e 100644 --- 
a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go @@ -101,7 +101,10 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( } func init() { - registry.RegisterAnalyzer(Name, AnalyzerConstructor) + err := registry.RegisterAnalyzer(Name, AnalyzerConstructor) + if err != nil { + panic(err) + } } func getCharFilters(charFilterNames []string, cache *registry.Cache) ([]analysis.CharFilter, error) { diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/keyword/keyword.go b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/keyword/keyword.go index 6bb56d6f7..6eb052eb8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/keyword/keyword.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/keyword/keyword.go @@ -34,5 +34,8 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( } func init() { - registry.RegisterAnalyzer(Name, AnalyzerConstructor) + err := registry.RegisterAnalyzer(Name, AnalyzerConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/standard/standard.go b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/standard/standard.go index 96387bd79..fa752be57 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/standard/standard.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/standard/standard.go @@ -48,5 +48,8 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( } func init() { - registry.RegisterAnalyzer(Name, AnalyzerConstructor) + err := registry.RegisterAnalyzer(Name, AnalyzerConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/flexible/flexible.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/flexible/flexible.go index cb5f234d5..36cc9e809 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/flexible/flexible.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/flexible/flexible.go @@ -60,5 +60,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/optional/optional.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/optional/optional.go index 6dc7bfbcc..db30049df 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/optional/optional.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/optional/optional.go @@ -43,5 +43,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go index a0e2c9495..88cde758f 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go +++ 
b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/microseconds/microseconds.go @@ -48,5 +48,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go index 63826b451..645c525de 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds/milliseconds.go @@ -48,5 +48,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go index 8bb1ab1b6..f50eac1aa 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds/nanoseconds.go @@ -48,5 +48,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go index 58e947c80..10219c9fd 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds/seconds.go @@ -48,5 +48,8 @@ func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Ca } func init() { - registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + err := registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/analyzer_en.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/analyzer_en.go index 44a8d4c21..b9b53a8fc 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/analyzer_en.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/analyzer_en.go @@ -66,5 +66,8 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( } func init() { - registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) + err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/plural_stemmer.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/plural_stemmer.go index 0de7c1bbf..7aebdc8fa 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/plural_stemmer.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/plural_stemmer.go @@ -63,7 
+63,10 @@ func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache } func init() { - registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor) + err := registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor) + if err != nil { + panic(err) + } } // ---------------------------------------------------------------------------- diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/possessive_filter_en.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/possessive_filter_en.go index 79c2489e2..42d51f0c9 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/possessive_filter_en.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/possessive_filter_en.go @@ -63,5 +63,8 @@ func PossessiveFilterConstructor(config map[string]interface{}, cache *registry. } func init() { - registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor) + err := registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stemmer_en_snowball.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stemmer_en_snowball.go index ab30b8b19..568a2b6a5 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stemmer_en_snowball.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stemmer_en_snowball.go @@ -45,5 +45,8 @@ func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *regis } func init() { - registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor) + err := registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_filter_en.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_filter_en.go index a3f91d226..0015ad603 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_filter_en.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_filter_en.go @@ -29,5 +29,8 @@ func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.C } func init() { - registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) + err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_words_en.go b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_words_en.go index 9b6ca86a7..d6ff496fa 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_words_en.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/lang/en/stop_words_en.go @@ -340,5 +340,8 @@ func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) ( } func init() { - registry.RegisterTokenMap(StopName, TokenMapConstructor) + err := registry.RegisterTokenMap(StopName, TokenMapConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go b/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go index a1b6dbd05..92b962808 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/lowercase/lowercase.go @@ -47,7 +47,10 @@ func LowerCaseFilterConstructor(config map[string]interface{}, cache 
*registry.C } func init() { - registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor) + err := registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor) + if err != nil { + panic(err) + } } // toLowerDeferredCopy will function exactly like diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go b/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go index 95af0fa72..ed1574bbb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/porter/porter.go @@ -49,5 +49,8 @@ func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cac } func init() { - registry.RegisterTokenFilter(Name, PorterStemmerConstructor) + err := registry.RegisterTokenFilter(Name, PorterStemmerConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go b/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go index bf4b98db1..09f2d1c9c 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/token/stop/stop.go @@ -66,5 +66,8 @@ func StopTokensFilterConstructor(config map[string]interface{}, cache *registry. } func init() { - registry.RegisterTokenFilter(Name, StopTokensFilterConstructor) + err := registry.RegisterTokenFilter(Name, StopTokensFilterConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/single/single.go b/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/single/single.go index a3eac7899..7f3abd2a8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/single/single.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/single/single.go @@ -45,5 +45,8 @@ func SingleTokenTokenizerConstructor(config map[string]interface{}, cache *regis } func init() { - registry.RegisterTokenizer(Name, SingleTokenTokenizerConstructor) + err := registry.RegisterTokenizer(Name, SingleTokenTokenizerConstructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode/unicode.go b/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode/unicode.go index ca3cfe76c..b694a3ee4 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode/unicode.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode/unicode.go @@ -115,7 +115,10 @@ func UnicodeTokenizerConstructor(config map[string]interface{}, cache *registry. 
} func init() { - registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor) + err := registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor) + if err != nil { + panic(err) + } } func convertType(segmentWordType int) analysis.TokenType { diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/type.go b/vendor/github.com/blevesearch/bleve/v2/analysis/type.go index e3a7f201b..f819984b5 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/type.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/type.go @@ -106,6 +106,15 @@ type DateTimeParser interface { ParseDateTime(string) (time.Time, string, error) } +const SynonymSourceType = "synonym" + +type SynonymSourceVisitor func(name string, item SynonymSource) error + +type SynonymSource interface { + Analyzer() string + Collection() string +} + type ByteArrayConverter interface { Convert([]byte) (interface{}, error) } diff --git a/vendor/github.com/blevesearch/bleve/v2/doc.go b/vendor/github.com/blevesearch/bleve/v2/doc.go index b9580cbe8..0c55551fb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/doc.go +++ b/vendor/github.com/blevesearch/bleve/v2/doc.go @@ -19,7 +19,7 @@ Example Opening New Index, Indexing Data message := struct{ Id: "example" - From: "marty.schoch@gmail.com", + From: "xyz@couchbase.com", Body: "bleve indexing is easy", } diff --git a/vendor/github.com/blevesearch/bleve/v2/document/document.go b/vendor/github.com/blevesearch/bleve/v2/document/document.go index 54fd6d442..569d57bd6 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/document.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/document.go @@ -34,6 +34,7 @@ type Document struct { Fields []Field `json:"fields"` CompositeFields []*CompositeField StoredFieldsSize uint64 + indexed bool } func (d *Document) StoredFieldsBytes() uint64 { @@ -48,6 +49,13 @@ func NewDocument(id string) *Document { } } +func NewSynonymDocument(id string) *Document { + return &Document{ + id: id, + Fields: make([]Field, 0), + } +} + func (d *Document) Size() int { sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + len(d.id) @@ -133,3 +141,19 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) { func (d *Document) HasComposite() bool { return len(d.CompositeFields) > 0 } + +func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) { + for _, f := range d.Fields { + if sf, ok := f.(index.SynonymField); ok { + visitor(sf) + } + } +} + +func (d *Document) SetIndexed() { + d.indexed = true +} + +func (d *Document) Indexed() bool { + return d.indexed +} diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go b/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go index 8c2987a7f..fdd14ce99 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_boolean.go @@ -116,13 +116,13 @@ func NewBooleanFieldFromBytes(name string, arrayPositions []uint64, value []byte name: name, arrayPositions: arrayPositions, value: value, - options: DefaultNumericIndexingOptions, + options: DefaultBooleanIndexingOptions, numPlainTextBytes: uint64(len(value)), } } func NewBooleanField(name string, arrayPositions []uint64, b bool) *BooleanField { - return NewBooleanFieldWithIndexingOptions(name, arrayPositions, b, DefaultNumericIndexingOptions) + return NewBooleanFieldWithIndexingOptions(name, arrayPositions, b, DefaultBooleanIndexingOptions) } func NewBooleanFieldWithIndexingOptions(name string, arrayPositions []uint64, b 
bool, options index.FieldIndexingOptions) *BooleanField { diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go index 6bf7b010a..ca671cea7 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go @@ -87,13 +87,6 @@ func (n *GeoShapeField) AnalyzedTokenFrequencies() index.TokenFrequencies { func (n *GeoShapeField) Analyze() { // compute the bytes representation for the coordinates tokens := make(analysis.TokenStream, 0) - tokens = append(tokens, &analysis.Token{ - Start: 0, - End: len(n.encodedValue), - Term: n.encodedValue, - Position: 1, - Type: analysis.AlphaNumeric, - }) rti := geo.GetSpatialAnalyzerPlugin("s2") terms := rti.GetIndexTokens(n.shape) @@ -126,6 +119,10 @@ func (n *GeoShapeField) NumPlainTextBytes() uint64 { return n.numPlainTextBytes } +func (n *GeoShapeField) EncodedShape() []byte { + return n.encodedValue +} + func NewGeoShapeField(name string, arrayPositions []uint64, coordinates [][][][]float64, typ string) *GeoShapeField { return NewGeoShapeFieldWithIndexingOptions(name, arrayPositions, diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go b/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go index 80a353a01..3a5ab3799 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_ip.go @@ -31,7 +31,7 @@ func init() { reflectStaticSizeIPField = int(reflect.TypeOf(f).Size()) } -const DefaultIPIndexingOptions = index.StoreField | index.IndexField | index.DocValues | index.IncludeTermVectors +const DefaultIPIndexingOptions = index.StoreField | index.IndexField | index.DocValues type IPField struct { name string @@ -115,7 +115,7 @@ func NewIPFieldFromBytes(name string, arrayPositions []uint64, value []byte) *IP name: name, arrayPositions: arrayPositions, value: value, - options: DefaultNumericIndexingOptions, + options: DefaultIPIndexingOptions, numPlainTextBytes: uint64(len(value)), } } diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_synonym.go b/vendor/github.com/blevesearch/bleve/v2/document/field_synonym.go new file mode 100644 index 000000000..c34b481dd --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_synonym.go @@ -0,0 +1,149 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package document + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeSynonymField int + +func init() { + var f SynonymField + reflectStaticSizeSynonymField = int(reflect.TypeOf(f).Size()) +} + +const DefaultSynonymIndexingOptions = index.IndexField + +type SynonymField struct { + name string + analyzer analysis.Analyzer + options index.FieldIndexingOptions + input []string + synonyms []string + numPlainTextBytes uint64 + + // populated during analysis + synonymMap map[string][]string +} + +func (s *SynonymField) Size() int { + return reflectStaticSizeSynonymField + size.SizeOfPtr + + len(s.name) +} + +func (s *SynonymField) Name() string { + return s.name +} + +func (s *SynonymField) ArrayPositions() []uint64 { + return nil +} + +func (s *SynonymField) Options() index.FieldIndexingOptions { + return s.options +} + +func (s *SynonymField) NumPlainTextBytes() uint64 { + return s.numPlainTextBytes +} + +func (s *SynonymField) AnalyzedLength() int { + return 0 +} + +func (s *SynonymField) EncodedFieldType() byte { + return 'y' +} + +func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies { + return nil +} + +func (s *SynonymField) Analyze() { + var analyzedInput []string + if len(s.input) > 0 { + analyzedInput = make([]string, 0, len(s.input)) + for _, term := range s.input { + analyzedTerm := analyzeSynonymTerm(term, s.analyzer) + if analyzedTerm != "" { + analyzedInput = append(analyzedInput, analyzedTerm) + } + } + } + analyzedSynonyms := make([]string, 0, len(s.synonyms)) + for _, syn := range s.synonyms { + analyzedTerm := analyzeSynonymTerm(syn, s.analyzer) + if analyzedTerm != "" { + analyzedSynonyms = append(analyzedSynonyms, analyzedTerm) + } + } + s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms) +} + +func (s *SynonymField) Value() []byte { + return nil +} + +func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) { + for term, synonyms := range s.synonymMap { + visitor(term, synonyms) + } +} + +func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField { + return &SynonymField{ + name: name, + analyzer: analyzer, + options: DefaultSynonymIndexingOptions, + input: input, + synonyms: synonyms, + } +} + +func processSynonymData(input []string, synonyms []string) map[string][]string { + var synonymMap map[string][]string + if len(input) > 0 { + // Map each term to the same list of synonyms. + synonymMap = make(map[string][]string, len(input)) + for _, term := range input { + synonymMap[term] = synonyms + } + } else { + synonymMap = make(map[string][]string, len(synonyms)) + // Precompute a map where each synonym points to all other synonyms. 
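+		// For example, synonyms [a b c] yield a -> [b c], b -> [a c], c -> [a b].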
+ for i, elem := range synonyms { + synonymMap[elem] = make([]string, 0, len(synonyms)-1) + for j, otherElem := range synonyms { + if i != j { + synonymMap[elem] = append(synonymMap[elem], otherElem) + } + } + } + } + return synonymMap +} + +func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string { + tokenStream := analyzer.Analyze([]byte(term)) + if len(tokenStream) == 1 { + return string(tokenStream[0].Term) + } + return "" +} diff --git a/vendor/github.com/blevesearch/bleve/v2/error.go b/vendor/github.com/blevesearch/bleve/v2/error.go index 2d2751cd4..b57a61543 100644 --- a/vendor/github.com/blevesearch/bleve/v2/error.go +++ b/vendor/github.com/blevesearch/bleve/v2/error.go @@ -27,6 +27,7 @@ const ( ErrorEmptyID ErrorIndexReadInconsistency ErrorTwoPhaseSearchInconsistency + ErrorSynonymSearchNotSupported ) // Error represents a more strongly typed bleve error for detecting @@ -49,4 +50,5 @@ var errorMessages = map[Error]string{ ErrorEmptyID: "document ID cannot be empty", ErrorIndexReadInconsistency: "index read inconsistency detected", ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change", + ErrorSynonymSearchNotSupported: "synonym search not supported", } diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/README.md b/vendor/github.com/blevesearch/bleve/v2/geo/README.md index 6112ff5da..ceba77efe 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/README.md +++ b/vendor/github.com/blevesearch/bleve/v2/geo/README.md @@ -1,4 +1,4 @@ -# geo support in bleve +# Geo spatial search support in bleve Latest bleve spatial capabilities are powered by spatial hierarchical tokens generated from s2geometry. You can find more details about the [s2geometry basics here](http://s2geometry.io/), and explore the diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/geo.go b/vendor/github.com/blevesearch/bleve/v2/geo/geo.go index 55eace1df..2416c034d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/geo.go +++ b/vendor/github.com/blevesearch/bleve/v2/geo/geo.go @@ -139,7 +139,7 @@ func RectFromPointDistance(lon, lat, dist float64) (float64, float64, float64, f var minLonL, maxLonL float64 if minLatL > minLatRad && maxLatL < maxLatRad { - deltaLon := asin(sin(radDistance) / cos(radLat)) + deltaLon := math.Asin(math.Sin(radDistance) / math.Cos(radLat)) minLonL = radLon - deltaLon if minLonL < minLonRad { minLonL += 2 * math.Pi diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/geo_dist.go b/vendor/github.com/blevesearch/bleve/v2/geo/geo_dist.go index d3ae0ed9e..3e6784fb3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/geo_dist.go +++ b/vendor/github.com/blevesearch/bleve/v2/geo/geo_dist.go @@ -88,11 +88,11 @@ func ParseDistanceUnit(u string) (float64, error) { func Haversin(lon1, lat1, lon2, lat2 float64) float64 { x1 := lat1 * degreesToRadian x2 := lat2 * degreesToRadian - h1 := 1 - cos(x1-x2) - h2 := 1 - cos((lon1-lon2)*degreesToRadian) - h := (h1 + cos(x1)*cos(x2)*h2) / 2 + h1 := 1 - math.Cos(x1-x2) + h2 := 1 - math.Cos((lon1-lon2)*degreesToRadian) + h := (h1 + math.Cos(x1)*math.Cos(x2)*h2) / 2 avgLat := (x1 + x2) / 2 diameter := earthDiameter(avgLat) - return diameter * asin(math.Min(1, math.Sqrt(h))) + return diameter * math.Asin(math.Min(1, math.Sqrt(h))) } diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/parse.go b/vendor/github.com/blevesearch/bleve/v2/geo/parse.go index 34f731a9e..ed1d935be 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/parse.go +++ 
b/vendor/github.com/blevesearch/bleve/v2/geo/parse.go @@ -236,14 +236,19 @@ func extract2DCoordinates(thing interface{}) [][]float64 { func extract3DCoordinates(thing interface{}) (c [][][]float64) { coords := reflect.ValueOf(thing) - for i := 0; i < coords.Len(); i++ { - vals := coords.Index(i) + if !coords.IsValid() { + return nil + } - edges := vals.Interface() - if es, ok := edges.([]interface{}); ok { - loop := extract2DCoordinates(es) - if len(loop) > 0 { - c = append(c, loop) + if coords.Kind() == reflect.Slice { + for i := 0; i < coords.Len(); i++ { + vals := coords.Index(i) + edges := vals.Interface() + if es, ok := edges.([]interface{}); ok { + loop := extract2DCoordinates(es) + if len(loop) > 0 { + c = append(c, loop) + } } } } diff --git a/vendor/github.com/blevesearch/bleve/v2/geo/sloppy.go b/vendor/github.com/blevesearch/bleve/v2/geo/sloppy.go index 0ce646d74..e9de06dc0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/geo/sloppy.go +++ b/vendor/github.com/blevesearch/bleve/v2/geo/sloppy.go @@ -19,104 +19,16 @@ import ( ) var earthDiameterPerLatitude []float64 -var sinTab []float64 -var cosTab []float64 -var asinTab []float64 -var asinDer1DivF1Tab []float64 -var asinDer2DivF2Tab []float64 -var asinDer3DivF3Tab []float64 -var asinDer4DivF4Tab []float64 -const radiusTabsSize = (1 << 10) + 1 -const radiusDelta = (math.Pi / 2) / (radiusTabsSize - 1) -const radiusIndexer = 1 / radiusDelta -const sinCosTabsSize = (1 << 11) + 1 -const asinTabsSize = (1 << 13) + 1 -const oneDivF2 = 1 / 2.0 -const oneDivF3 = 1 / 6.0 -const oneDivF4 = 1 / 24.0 - -// 1.57079632673412561417e+00 first 33 bits of pi/2 -var pio2Hi = math.Float64frombits(0x3FF921FB54400000) - -// 6.07710050650619224932e-11 pi/2 - PIO2_HI -var pio2Lo = math.Float64frombits(0x3DD0B4611A626331) - -var asinPio2Hi = math.Float64frombits(0x3FF921FB54442D18) // 1.57079632679489655800e+00 -var asinPio2Lo = math.Float64frombits(0x3C91A62633145C07) // 6.12323399573676603587e-17 -var asinPs0 = math.Float64frombits(0x3fc5555555555555) // 1.66666666666666657415e-01 -var asinPs1 = math.Float64frombits(0xbfd4d61203eb6f7d) // -3.25565818622400915405e-01 -var asinPs2 = math.Float64frombits(0x3fc9c1550e884455) // 2.01212532134862925881e-01 -var asinPs3 = math.Float64frombits(0xbfa48228b5688f3b) // -4.00555345006794114027e-02 -var asinPs4 = math.Float64frombits(0x3f49efe07501b288) // 7.91534994289814532176e-04 -var asinPs5 = math.Float64frombits(0x3f023de10dfdf709) // 3.47933107596021167570e-05 -var asinQs1 = math.Float64frombits(0xc0033a271c8a2d4b) // -2.40339491173441421878e+00 -var asinQs2 = math.Float64frombits(0x40002ae59c598ac8) // 2.02094576023350569471e+00 -var asinQs3 = math.Float64frombits(0xbfe6066c1b8d0159) // -6.88283971605453293030e-01 -var asinQs4 = math.Float64frombits(0x3fb3b8c5b12e9282) // 7.70381505559019352791e-02 - -var twoPiHi = 4 * pio2Hi -var twoPiLo = 4 * pio2Lo -var sinCosDeltaHi = twoPiHi/sinCosTabsSize - 1 -var sinCosDeltaLo = twoPiLo/sinCosTabsSize - 1 -var sinCosIndexer = 1 / (sinCosDeltaHi + sinCosDeltaLo) -var sinCosMaxValueForIntModulo = ((math.MaxInt64 >> 9) / sinCosIndexer) * 0.99 -var asinMaxValueForTabs = math.Sin(73.0 * degreesToRadian) - -var asinDelta = asinMaxValueForTabs / (asinTabsSize - 1) -var asinIndexer = 1 / asinDelta +const ( + radiusTabsSize = (1 << 10) + 1 + radiusDelta = (math.Pi / 2) / (radiusTabsSize - 1) + radiusIndexer = 1 / radiusDelta +) func init() { // initializes the tables used for the sloppy math functions - // sin and cos - sinTab = make([]float64, sinCosTabsSize) - cosTab = 
make([]float64, sinCosTabsSize) - sinCosPiIndex := (sinCosTabsSize - 1) / 2 - sinCosPiMul2Index := 2 * sinCosPiIndex - sinCosPiMul05Index := sinCosPiIndex / 2 - sinCosPiMul15Index := 3 * sinCosPiIndex / 2 - for i := 0; i < sinCosTabsSize; i++ { - // angle: in [0,2*PI]. - angle := float64(i)*sinCosDeltaHi + float64(i)*sinCosDeltaLo - sinAngle := math.Sin(angle) - cosAngle := math.Cos(angle) - // For indexes corresponding to null cosine or sine, we make sure the value is zero - // and not an epsilon. This allows for a much better accuracy for results close to zero. - if i == sinCosPiIndex { - sinAngle = 0.0 - } else if i == sinCosPiMul2Index { - sinAngle = 0.0 - } else if i == sinCosPiMul05Index { - sinAngle = 0.0 - } else if i == sinCosPiMul15Index { - sinAngle = 0.0 - } - sinTab[i] = sinAngle - cosTab[i] = cosAngle - } - - // asin - asinTab = make([]float64, asinTabsSize) - asinDer1DivF1Tab = make([]float64, asinTabsSize) - asinDer2DivF2Tab = make([]float64, asinTabsSize) - asinDer3DivF3Tab = make([]float64, asinTabsSize) - asinDer4DivF4Tab = make([]float64, asinTabsSize) - for i := 0; i < asinTabsSize; i++ { - // x: in [0,ASIN_MAX_VALUE_FOR_TABS]. - x := float64(i) * asinDelta - asinTab[i] = math.Asin(x) - oneMinusXSqInv := 1.0 / (1 - x*x) - oneMinusXSqInv05 := math.Sqrt(oneMinusXSqInv) - oneMinusXSqInv15 := oneMinusXSqInv05 * oneMinusXSqInv - oneMinusXSqInv25 := oneMinusXSqInv15 * oneMinusXSqInv - oneMinusXSqInv35 := oneMinusXSqInv25 * oneMinusXSqInv - asinDer1DivF1Tab[i] = oneMinusXSqInv05 - asinDer2DivF2Tab[i] = (x * oneMinusXSqInv15) * oneDivF2 - asinDer3DivF3Tab[i] = ((1 + 2*x*x) * oneMinusXSqInv25) * oneDivF3 - asinDer4DivF4Tab[i] = ((5 + 2*x*(2+x*(5-2*x))) * oneMinusXSqInv35) * oneDivF4 - } - // earth radius a := 6378137.0 b := 6356752.31420 @@ -145,68 +57,3 @@ func earthDiameter(lat float64) float64 { } return earthDiameterPerLatitude[int(index)] } - -var pio2 = math.Pi / 2 - -func sin(a float64) float64 { - return cos(a - pio2) -} - -// cos is a sloppy math (faster) implementation of math.Cos -func cos(a float64) float64 { - if a < 0.0 { - a = -a - } - if a > sinCosMaxValueForIntModulo { - return math.Cos(a) - } - // index: possibly outside tables range. - index := int(a*sinCosIndexer + 0.5) - delta := (a - float64(index)*sinCosDeltaHi) - float64(index)*sinCosDeltaLo - // Making sure index is within tables range. - // Last value of each table is the same than first, so we ignore it (tabs size minus one) for modulo. - index &= (sinCosTabsSize - 2) // index % (SIN_COS_TABS_SIZE-1) - indexCos := cosTab[index] - indexSin := sinTab[index] - return indexCos + delta*(-indexSin+delta*(-indexCos*oneDivF2+delta*(indexSin*oneDivF3+delta*indexCos*oneDivF4))) -} - -// asin is a sloppy math (faster) implementation of math.Asin -func asin(a float64) float64 { - var negateResult bool - if a < 0 { - a = -a - negateResult = true - } - if a <= asinMaxValueForTabs { - index := int(a*asinIndexer + 0.5) - delta := a - float64(index)*asinDelta - result := asinTab[index] + delta*(asinDer1DivF1Tab[index]+delta*(asinDer2DivF2Tab[index]+delta*(asinDer3DivF3Tab[index]+delta*asinDer4DivF4Tab[index]))) - if negateResult { - return -result - } - return result - } - // value > ASIN_MAX_VALUE_FOR_TABS, or value is NaN - // This part is derived from fdlibm. 
- if a < 1 { - t := (1.0 - a) * 0.5 - p := t * (asinPs0 + t*(asinPs1+t*(asinPs2+t*(asinPs3+t*(asinPs4+t+asinPs5))))) - q := 1.0 + t*(asinQs1+t*(asinQs2+t*(asinQs3+t*asinQs4))) - s := math.Sqrt(t) - z := s + s*(p/q) - result := asinPio2Hi - ((z + z) - asinPio2Lo) - if negateResult { - return -result - } - return result - } - // value >= 1.0, or value is NaN - if a == 1.0 { - if negateResult { - return -math.Pi / 2 - } - return math.Pi / 2 - } - return math.NaN() -} diff --git a/vendor/github.com/blevesearch/bleve/v2/index.go b/vendor/github.com/blevesearch/bleve/v2/index.go index acbefc695..3d2389884 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index.go @@ -16,6 +16,7 @@ package bleve import ( "context" + "fmt" "github.com/blevesearch/bleve/v2/index/upsidedown" @@ -63,6 +64,36 @@ func (b *Batch) Index(id string, data interface{}) error { return nil } +func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error { + if id == "" { + return ErrorEmptyID + } + if eventIndex, ok := b.index.(index.EventIndex); ok { + eventIndex.FireIndexEvent() + } + synMap, ok := b.index.Mapping().(mapping.SynonymMapping) + if !ok { + return ErrorSynonymSearchNotSupported + } + + if err := definition.Validate(); err != nil { + return err + } + + doc := document.NewSynonymDocument(id) + err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms) + if err != nil { + return err + } + b.internal.Update(doc) + + b.lastDocSize = uint64(doc.Size() + + len(id) + size.SizeOfString) // overhead from internal + b.totalSize += b.lastDocSize + + return nil +} + func (b *Batch) LastDocSize() uint64 { return b.lastDocSize } @@ -323,3 +354,35 @@ type IndexCopyable interface { // FileSystemDirectory is the default implementation for the // index.Directory interface. type FileSystemDirectory string + +// SynonymDefinition represents a synonym mapping in Bleve. +// Each instance associates one or more input terms with a list of synonyms, +// defining how terms are treated as equivalent in searches. +type SynonymDefinition struct { + // Input is an optional list of terms for unidirectional synonym mapping. + // When terms are specified in Input, they will map to the terms in Synonyms, + // making the relationship unidirectional (each Input maps to all Synonyms). + // If Input is omitted, the relationship is bidirectional among all Synonyms. + Input []string `json:"input,omitempty"` + + // Synonyms is a list of terms that are considered equivalent. + // If Input is specified, each term in Input will map to each term in Synonyms. + // If Input is not specified, the Synonyms list will be treated bidirectionally, + // meaning each term in Synonyms is treated as synonymous with all others. + Synonyms []string `json:"synonyms"` +} + +func (sd *SynonymDefinition) Validate() error { + if len(sd.Synonyms) == 0 { + return fmt.Errorf("synonym definition must have at least one synonym") + } + return nil +} + +// SynonymIndex supports indexing synonym definitions alongside regular documents. +// Synonyms, grouped by collection name, define term relationships for query expansion in searches. +type SynonymIndex interface { + Index + // IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection. 
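+	// For example (illustrative):
+	//
+	//	err := idx.IndexSynonym("color-syn", "colors", &SynonymDefinition{
+	//		Synonyms: []string{"red", "crimson", "scarlet"},
+	//	})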
+ IndexSynonym(id string, collection string, definition *SynonymDefinition) error +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go index 04e5bd1b2..d4d8e9c07 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go @@ -19,7 +19,7 @@ import ( "os" "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" bolt "go.etcd.io/bbolt" @@ -303,7 +303,7 @@ func (o *Builder) Close() error { } // fill the root bolt with this fake index snapshot - _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin, nil) + _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin, nil, nil) if err != nil { _ = tx.Rollback() _ = rootBolt.Close() diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go index 2cb1398ec..209da5b8d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go @@ -19,7 +19,7 @@ import ( "path/filepath" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" ) @@ -352,33 +352,39 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { creator: "introduceMerge", } - // iterate through current segments - newSegmentDeleted := roaring.NewBitmap() var running, docsToPersistCount, memSegments, fileSegments uint64 var droppedSegmentFiles []string + newSegmentDeleted := make([]*roaring.Bitmap, len(nextMerge.new)) + for i := range newSegmentDeleted { + // create a bitmaps to track the obsoletes per newly merged segments + newSegmentDeleted[i] = roaring.NewBitmap() + } + + // iterate through current segments for i := range root.segment { segmentID := root.segment[i].id - if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { + if segSnapAtMerge, ok := nextMerge.mergedSegHistory[segmentID]; ok { // this segment is going away, see if anything else was deleted since we started the merge if segSnapAtMerge != nil && root.segment[i].deleted != nil { // assume all these deletes are new deletedSince := root.segment[i].deleted // if we already knew about some of them, remove - if segSnapAtMerge.deleted != nil { - deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted) + if segSnapAtMerge.oldSegment.deleted != nil { + deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.oldSegment.deleted) } deletedSinceItr := deletedSince.Iterator() for deletedSinceItr.HasNext() { oldDocNum := deletedSinceItr.Next() - newDocNum := nextMerge.oldNewDocNums[segmentID][oldDocNum] - newSegmentDeleted.Add(uint32(newDocNum)) + newDocNum := segSnapAtMerge.oldNewDocIDs[oldDocNum] + newSegmentDeleted[segSnapAtMerge.workerID].Add(uint32(newDocNum)) } } + // clean up the old segment map to figure out the // obsolete segments wrt root in meantime, whatever // segments left behind in old map after processing // the root segments would be the obsolete segment set - delete(nextMerge.old, segmentID) + delete(nextMerge.mergedSegHistory, segmentID) } else if root.segment[i].LiveSize() > 0 { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, 
&SegmentSnapshot{ @@ -410,52 +416,59 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // before the newMerge introduction, need to clean the newly // merged segment wrt the current root segments, hence // applying the obsolete segment contents to newly merged segment - for segID, ss := range nextMerge.old { - obsoleted := ss.DocNumbersLive() + for _, ss := range nextMerge.mergedSegHistory { + obsoleted := ss.oldSegment.DocNumbersLive() if obsoleted != nil { obsoletedIter := obsoleted.Iterator() for obsoletedIter.HasNext() { oldDocNum := obsoletedIter.Next() - newDocNum := nextMerge.oldNewDocNums[segID][oldDocNum] - newSegmentDeleted.Add(uint32(newDocNum)) + newDocNum := ss.oldNewDocIDs[oldDocNum] + newSegmentDeleted[ss.workerID].Add(uint32(newDocNum)) } } } - var skipped bool - // In case where all the docs in the newly merged segment getting - // deleted by the time we reach here, can skip the introduction. - if nextMerge.new != nil && - nextMerge.new.Count() > newSegmentDeleted.GetCardinality() { - stats := newFieldStats() - if fsr, ok := nextMerge.new.(segment.FieldStatsReporter); ok { - fsr.UpdateFieldStats(stats) + skipped := true + // make the newly merged segments part of the newSnapshot being constructed + for i, newMergedSegment := range nextMerge.new { + // checking if this newly merged segment is worth keeping based on + // obsoleted doc count since the merge intro started + if newMergedSegment != nil && + newMergedSegment.Count() > newSegmentDeleted[i].GetCardinality() { + stats := newFieldStats() + if fsr, ok := newMergedSegment.(segment.FieldStatsReporter); ok { + fsr.UpdateFieldStats(stats) + } + + // put the merged segment at the end of newSnapshot + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: nextMerge.id[i], + segment: newMergedSegment, // take ownership for nextMerge.new's ref-count + deleted: newSegmentDeleted[i], + stats: stats, + cachedDocs: &cachedDocs{cache: nil}, + cachedMeta: &cachedMeta{meta: nil}, + creator: "introduceMerge", + mmaped: nextMerge.mmaped, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + running += newMergedSegment.Count() + + switch newMergedSegment.(type) { + case segment.PersistedSegment: + fileSegments++ + default: + docsToPersistCount += newMergedSegment.Count() - newSegmentDeleted[i].GetCardinality() + memSegments++ + } + skipped = false } + } - // put new segment at end - newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: nextMerge.id, - segment: nextMerge.new, // take ownership for nextMerge.new's ref-count - deleted: newSegmentDeleted, - stats: stats, - cachedDocs: &cachedDocs{cache: nil}, - cachedMeta: &cachedMeta{meta: nil}, - creator: "introduceMerge", - mmaped: nextMerge.mmaped, - }) - newSnapshot.offsets = append(newSnapshot.offsets, running) - atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) - - switch nextMerge.new.(type) { - case segment.PersistedSegment: - fileSegments++ - default: - docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() - memSegments++ - } - } else { - skipped = true + if skipped { atomic.AddUint64(&s.stats.TotFileMergeIntroductionsObsoleted, 1) + } else { + atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, uint64(len(nextMerge.new))) } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go index b74504ca1..7f787a344 100644 --- 
a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go @@ -19,10 +19,11 @@ import ( "fmt" "os" "strings" + "sync" "sync/atomic" "time" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/index/scorch/mergeplan" "github.com/blevesearch/bleve/v2/util" segment "github.com/blevesearch/scorch_segment_api/v2" @@ -208,6 +209,17 @@ func (s *Scorch) ForceMerge(ctx context.Context, func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, error) { mergePlannerOptions := mergeplan.DefaultMergePlanOptions + + po, err := s.parsePersisterOptions() + if err != nil { + return nil, err + } + // by default use the MaxSizeInMemoryMergePerWorker from the persister option + // as the FloorSegmentFileSize for the merge planner which would be the + // first tier size in the planning. If the value is 0, then we don't use the + // file size in the planning. + mergePlannerOptions.FloorSegmentFileSize = int64(po.MaxSizeInMemoryMergePerWorker) + if v, ok := s.config["scorchMergePlanOptions"]; ok { b, err := util.MarshalJSON(v) if err != nil { @@ -305,14 +317,20 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]segment.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) + mergedSegHistory := make(map[uint64]*mergedSegmentHistory, len(task.Segments)) for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot + mergedSegHistory[segSnapshot.id] = &mergedSegmentHistory{ + workerID: 0, + oldSegment: segSnapshot, + } if persistedSeg, ok := segSnapshot.segment.(segment.PersistedSegment); ok { if segSnapshot.LiveSize() == 0 { atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) oldMap[segSnapshot.id] = nil + delete(mergedSegHistory, segSnapshot.id) } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) @@ -327,7 +345,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, } } - var oldNewDocNums map[uint64][]uint64 var seg segment.Segment var filename string if len(segmentsToMerge) > 0 { @@ -368,21 +385,22 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, totalBytesRead := seg.BytesRead() + prevBytesReadTotal seg.ResetBytesRead(totalBytesRead) - oldNewDocNums = make(map[uint64][]uint64, len(newDocNums)) for i, segNewDocNums := range newDocNums { - oldNewDocNums[task.Segments[i].Id()] = segNewDocNums + if mergedSegHistory[task.Segments[i].Id()] != nil { + mergedSegHistory[task.Segments[i].Id()].oldNewDocIDs = segNewDocNums + } } atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge))) } sm := &segmentMerge{ - id: newSegmentID, - old: oldMap, - oldNewDocNums: oldNewDocNums, - new: seg, - notifyCh: make(chan *mergeTaskIntroStatus), - mmaped: 1, + id: []uint64{newSegmentID}, + mergedSegHistory: mergedSegHistory, + new: []segment.Segment{seg}, + newCount: seg.Count(), + notifyCh: make(chan *mergeTaskIntroStatus), + mmaped: 1, } s.fireEvent(EventKindMergeTaskIntroductionStart, 0) @@ -435,13 +453,22 @@ type mergeTaskIntroStatus struct { skipped bool } +// this is important when it comes to introducing multiple merged segments in a +// single introducer channel push. 
That way there is a check to ensure that the +// file count doesn't explode during the index's lifetime. +type mergedSegmentHistory struct { + workerID uint64 + oldNewDocIDs []uint64 + oldSegment *SegmentSnapshot +} + type segmentMerge struct { - id uint64 - old map[uint64]*SegmentSnapshot - oldNewDocNums map[uint64][]uint64 - new segment.Segment - notifyCh chan *mergeTaskIntroStatus - mmaped uint32 + id []uint64 + new []segment.Segment + mergedSegHistory map[uint64]*mergedSegmentHistory + notifyCh chan *mergeTaskIntroStatus + mmaped uint32 + newCount uint64 } func cumulateBytesRead(sbs []segment.Segment) uint64 { @@ -452,24 +479,85 @@ func cumulateBytesRead(sbs []segment.Segment) uint64 { return rv } -// perform a merging of the given SegmentBase instances into a new, -// persisted segment, and synchronously introduce that new segment -// into the root -func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, - sbs []segment.Segment, sbsDrops []*roaring.Bitmap, - sbsIndexes []int) (*IndexSnapshot, uint64, error) { +func closeNewMergedSegments(segs []segment.Segment) error { + for _, seg := range segs { + if seg != nil { + _ = seg.DecRef() + } + } + return nil +} + +func (s *Scorch) mergeSegmentBasesParallel(snapshot *IndexSnapshot, flushableObjs []*flushable) (*IndexSnapshot, []uint64, error) { atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) memMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) - newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) - filename := zapFileName(newSegmentID) - path := s.path + string(os.PathSeparator) + filename + var wg sync.WaitGroup + // we're tracking the merged segments and their doc numbers per worker + // to be able to introduce them all at once, so the first dimension of the + // slices here corresponds to the workerID + newDocIDsSet := make([][][]uint64, len(flushableObjs)) + newMergedSegments := make([]segment.Segment, len(flushableObjs)) + newMergedSegmentIDs := make([]uint64, len(flushableObjs)) + numFlushes := len(flushableObjs) + var numSegments, newMergedCount uint64 + var em sync.Mutex + var errs []error - newDocNums, _, err := - s.segPlugin.Merge(sbs, sbsDrops, path, s.closeCh, s) + // deploy the workers to merge and flush the batches of segments in parallel + for i := 0; i < numFlushes; i++ { + wg.Add(1) + go func(segsBatch []segment.Segment, dropsBatch []*roaring.Bitmap, id int) { + defer wg.Done() + newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) + filename := zapFileName(newSegmentID) + path := s.path + string(os.PathSeparator) + filename +
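The surrounding worker loop follows Go's usual fan-out/fan-in shape: every goroutine writes only into its own index of the pre-sized result slices (newDocIDsSet, newMergedSegments, newMergedSegmentIDs), so the results themselves need no locking, and only the shared error slice is guarded by the em mutex. A minimal, self-contained sketch of that pattern, with illustrative names and an []int payload standing in for the real merge-and-flush work:

package main

import (
	"fmt"
	"sync"
)

func runWorkers(batches [][]int) ([]int, error) {
	sums := make([]int, len(batches)) // one slot per worker: no races, no locks
	var (
		wg   sync.WaitGroup
		em   sync.Mutex
		errs []error
	)
	for i := range batches {
		wg.Add(1)
		go func(id int, batch []int) {
			defer wg.Done()
			if len(batch) == 0 { // stand-in for a merge or open failure
				em.Lock()
				errs = append(errs, fmt.Errorf("worker %d: empty batch", id))
				em.Unlock()
				return
			}
			for _, v := range batch {
				sums[id] += v // stand-in for the real merge+flush work
			}
		}(i, batches[i])
	}
	wg.Wait() // fan-in: all workers finished before results are read
	if len(errs) > 0 {
		return nil, errs[0]
	}
	return sums, nil
}

func main() {
	sums, err := runWorkers([][]int{{1, 2}, {3, 4, 5}})
	fmt.Println(sums, err) // [3 12] <nil>
}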
+ // the newly merged segment is already flushed out to disk, just needs + // to be opened using mmap. + newDocIDs, _, err := + s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s) + if err != nil { + em.Lock() + errs = append(errs, err) + em.Unlock() + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) + return + } + newMergedSegmentIDs[id] = newSegmentID + newDocIDsSet[id] = newDocIDs + newMergedSegments[id], err = s.segPlugin.Open(path) + if err != nil { + em.Lock() + errs = append(errs, err) + em.Unlock() + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) + return + } + atomic.AddUint64(&newMergedCount, newMergedSegments[id].Count()) + atomic.AddUint64(&numSegments, uint64(len(segsBatch))) + }(flushableObjs[i].segments, flushableObjs[i].drops, i) + } + wg.Wait() + + if errs != nil { + // close the new merged segments + _ = closeNewMergedSegments(newMergedSegments) + var errf error + for _, err := range errs { + if err == segment.ErrClosed { + // the index snapshot was closed, which will be handled gracefully + // by retrying the whole merge+flush operation in a later iteration, + // so it's safe to return this error early. + return nil, nil, err + } + errf = fmt.Errorf("%w; %v", errf, err) + } + return nil, nil, errf + } atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) @@ -479,39 +567,30 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) } - if err != nil { - atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return nil, 0, err - } - - seg, err := s.segPlugin.Open(path) - if err != nil { - atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return nil, 0, err - } - - // update persisted stats - atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) - atomic.AddUint64(&s.stats.TotPersistedSegments, 1) - sm := &segmentMerge{ - id: newSegmentID, - old: make(map[uint64]*SegmentSnapshot, len(sbsIndexes)), - oldNewDocNums: make(map[uint64][]uint64, len(sbsIndexes)), - new: seg, - notifyCh: make(chan *mergeTaskIntroStatus), + id: newMergedSegmentIDs, + new: newMergedSegments, + mergedSegHistory: make(map[uint64]*mergedSegmentHistory, numSegments), + notifyCh: make(chan *mergeTaskIntroStatus), + newCount: newMergedCount, } - for i, idx := range sbsIndexes { - ss := snapshot.segment[idx] - sm.old[ss.id] = ss - sm.oldNewDocNums[ss.id] = newDocNums[i] + for i, flushable := range flushableObjs { + for j, idx := range flushable.sbIdxs { + ss := snapshot.segment[idx] + // oldSegmentSnapshot.id -> {workerID, oldSegmentSnapshot, docIDs} + sm.mergedSegHistory[ss.id] = &mergedSegmentHistory{ + workerID: uint64(i), + oldNewDocIDs: newDocIDsSet[i][j], + oldSegment: ss, + } + } } select { // send to introducer case <-s.closeCh: - _ = seg.DecRef() - return nil, 0, segment.ErrClosed + _ = closeNewMergedSegments(newMergedSegments) + return nil, nil, segment.ErrClosed case s.merges <- sm: } @@ -520,17 +599,17 @@ introStatus := <-sm.notifyCh if introStatus != nil && introStatus.indexSnapshot != nil { newSnapshot = introStatus.indexSnapshot - atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) + atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(numSegments)) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) if introStatus.skipped { // close the segment on skipping introduction.
_ = newSnapshot.DecRef() - _ = seg.Close() + _ = closeNewMergedSegments(newMergedSegments) newSnapshot = nil } } - return newSnapshot, newSegmentID, nil + return newSnapshot, newMergedSegmentIDs, nil } func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go index ac6d8b22b..8ddde74a5 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/mergeplan/merge_plan.go @@ -99,6 +99,10 @@ type MergePlanOptions struct { // of tiny segments from resulting in a long tail in the index. FloorSegmentSize int64 + // Small segments' file sizes are rounded up to this size to prevent lots + // of tiny segments from causing a long tail in the index. + FloorSegmentFileSize int64 + // Controls how aggressively merges that reclaim more deletions // are favored. Higher values will more aggressively target // merges that reclaim deletions, but be careful not to go so high @@ -126,6 +130,13 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { return o.FloorSegmentSize } +func (o *MergePlanOptions) RaiseToFloorSegmentFileSize(s int64) int64 { + if s > o.FloorSegmentFileSize { + return s + } + return o.FloorSegmentFileSize +} + // MaxSegmentSizeLimit represents the maximum size of a segment, // this limit comes with hit-1 optimisation/max encoding limit uint31. const MaxSegmentSizeLimit = 1<<31 - 1 @@ -155,6 +166,7 @@ var SingleSegmentMergePlanOptions = MergePlanOptions{ SegmentsPerMergeTask: 10, FloorSegmentSize: 1 << 30, ReclaimDeletesWeight: 2.0, + FloorSegmentFileSize: 1 << 40, } // ------------------------------------------- @@ -176,12 +188,18 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { var eligibles []Segment var eligiblesLiveSize int64 + var eligiblesFileSize int64 + var minFileSize int64 = math.MaxInt64 for _, segment := range segments { if minLiveSize > segment.LiveSize() { minLiveSize = segment.LiveSize() } + if minFileSize > segment.FileSize() { + minFileSize = segment.FileSize() + } + isEligible := segment.LiveSize() < o.MaxSegmentSize/2 // An eligible segment (based on #documents) may be too large // and thus need a stricter check based on the file size.
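RaiseToFloorSegmentFileSize mirrors the existing RaiseToFloorSegmentSize: anything at or below the floor is costed as if it were floor-sized, so a crowd of tiny segments cannot fragment the planner's first budget tier. A small standalone illustration of the rounding (the 1 MiB floor is an arbitrary example, not a bleve default):

package main

import "fmt"

// raiseToFloor mimics MergePlanOptions.RaiseToFloorSegmentFileSize above
func raiseToFloor(s, floor int64) int64 {
	if s > floor {
		return s
	}
	return floor
}

func main() {
	const floor = int64(1 << 20) // assume a 1 MiB floor for illustration
	for _, sz := range []int64{4 << 10, 512 << 10, 4 << 20} {
		fmt.Printf("%8d -> %8d\n", sz, raiseToFloor(sz, floor))
	}
	// 4 KiB and 512 KiB round up to 1 MiB; 4 MiB is left alone
}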
@@ -195,17 +213,24 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { if isEligible { eligibles = append(eligibles, segment) eligiblesLiveSize += segment.LiveSize() + eligiblesFileSize += segment.FileSize() } } - minLiveSize = o.RaiseToFloorSegmentSize(minLiveSize) - calcBudget := o.CalcBudget if calcBudget == nil { calcBudget = CalcBudget } - budgetNumSegments := calcBudget(eligiblesLiveSize, minLiveSize, o) + var budgetNumSegments int + if o.FloorSegmentFileSize > 0 { + minFileSize = o.RaiseToFloorSegmentFileSize(minFileSize) + budgetNumSegments = calcBudget(eligiblesFileSize, minFileSize, o) + + } else { + minLiveSize = o.RaiseToFloorSegmentSize(minLiveSize) + budgetNumSegments = calcBudget(eligiblesLiveSize, minLiveSize, o) + } scoreSegments := o.ScoreSegments if scoreSegments == nil { diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go index 968a744ac..389d582b7 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go @@ -18,7 +18,7 @@ import ( "fmt" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go index ca179574c..20a5cab30 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go @@ -23,7 +23,6 @@ import ( "sync" "sync/atomic" - "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" @@ -65,11 +64,6 @@ func (o *OptimizeVR) Finish() error { var errorsM sync.Mutex var errors []error - var snapshotGlobalDocNums map[int]*roaring.Bitmap - if o.requiresFiltering { - snapshotGlobalDocNums = o.snapshot.globalDocNums() - } - defer o.invokeSearcherEndCallback() wg := sync.WaitGroup{} @@ -104,27 +98,12 @@ func (o *OptimizeVR) Finish() error { // for each VR, populate postings list and iterators // by passing the obtained vector index and getting similar vectors. - // Only applies to filtered kNN. - if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 { - eligibleVectorInternalIDs := vr.getEligibleDocIDs() - if snapshotGlobalDocNums != nil { - // Only the eligible documents belonging to this segment - // will get filtered out. - // There is no way to determine which doc belongs to which segment - eligibleVectorInternalIDs.And(snapshotGlobalDocNums[index]) - } - - eligibleLocalDocNums := make([]uint64, - eligibleVectorInternalIDs.GetCardinality()) - // get the (segment-)local document numbers - for i, docNum := range eligibleVectorInternalIDs.ToArray() { - localDocNum := o.snapshot.localDocNumFromGlobal(index, - uint64(docNum)) - eligibleLocalDocNums[i] = localDocNum - } - + // check if the vector reader is configured to use a pre-filter + // to filter out ineligible documents before performing + // kNN search. 
+ if vr.eligibleSelector != nil { pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k, - eligibleLocalDocNums, vr.searchParams) + vr.eligibleSelector.SegmentEligibleDocs(index), vr.searchParams) } else { pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams) } @@ -178,7 +157,7 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context, } o.ctx = ctx if !o.requiresFiltering { - o.requiresFiltering = len(s.eligibleDocIDs) > 0 + o.requiresFiltering = s.eligibleSelector != nil } if o.snapshot != s.snapshot { diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go index d59f733df..eb6b70c51 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go @@ -24,13 +24,14 @@ import ( "math" "os" "path/filepath" + "slices" "sort" "strconv" "strings" "sync/atomic" "time" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" @@ -79,6 +80,14 @@ type persisterOptions struct { // for the number of paused application threads. The default value would // be a very high number to always favour the merging of memory segments. MemoryPressurePauseThreshold uint64 + + // NumPersisterWorkers decides the number of parallel workers that will + // perform the in-memory merge of segments followed by a flush operation. + NumPersisterWorkers int + + // MaxSizeInMemoryMergePerWorker is the maximum size of data that a single + // persister worker is allowed to work on. + MaxSizeInMemoryMergePerWorker int } type notificationChan chan struct{} @@ -240,7 +249,8 @@ OUTER: } func notifyMergeWatchers(lastPersistedEpoch uint64, - persistWatchers []*epochWatcher) []*epochWatcher { + persistWatchers []*epochWatcher, +) []*epochWatcher { var watchersNext []*epochWatcher for _, w := range persistWatchers { if w.epoch < lastPersistedEpoch { @@ -254,8 +264,8 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, persistWatchers []*epochWatcher, - po *persisterOptions) (uint64, []*epochWatcher) { - + po *persisterOptions, +) (uint64, []*epochWatcher) { // First, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) @@ -320,9 +330,11 @@ OUTER: func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { po := persisterOptions{ - PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, - PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, - MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, + PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, + PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, + MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, + NumPersisterWorkers: DefaultNumPersisterWorkers, + MaxSizeInMemoryMergePerWorker: DefaultMaxSizeInMemoryMergePerWorker, } if v, ok := s.config["scorchPersisterOptions"]; ok { b, err := util.MarshalJSON(v) if err != nil { @@ -339,12 +351,13 @@ func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { } func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, - po *persisterOptions) error { + po *persisterOptions, +) error { // Perform in-memory segment merging only when the memory pressure is // below the configured threshold, else the
persister performs the // direct persistence of segments. if s.NumEventsBlocking() < po.MemoryPressurePauseThreshold { - persisted, err := s.persistSnapshotMaybeMerge(snapshot) + persisted, err := s.persistSnapshotMaybeMerge(snapshot, po) if err != nil { return err } @@ -353,7 +366,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, } } - return s.persistSnapshotDirect(snapshot) + return s.persistSnapshotDirect(snapshot, nil) } // DefaultMinSegmentsForInMemoryMerge represents the default number of @@ -362,32 +375,118 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, // those segments var DefaultMinSegmentsForInMemoryMerge = 2 +type flushable struct { + segments []segment.Segment + drops []*roaring.Bitmap + sbIdxs []int + totDocs uint64 +} + +// the number of workers which perform, in parallel, an in-memory merge of the +// segments followed by a flush operation. +var DefaultNumPersisterWorkers = 1 + +// the maximum size of data on which a single worker is allowed to perform the +// in-memory merge operation. +var DefaultMaxSizeInMemoryMergePerWorker = 0 + +func legacyFlushBehaviour(maxSizeInMemoryMergePerWorker, numPersisterWorkers int) bool { + // DefaultMaxSizeInMemoryMergePerWorker = 0 is a special value to preserve the legacy + // one-shot in-memory merge + flush behaviour. + return maxSizeInMemoryMergePerWorker == 0 && numPersisterWorkers == 1 +} + // persistSnapshotMaybeMerge examines the snapshot and might merge and // persist the in-memory zap segments if there are enough of them -func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( +func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persisterOptions) ( bool, error) { // collect the in-memory zap segments (SegmentBase instances) var sbs []segment.Segment var sbsDrops []*roaring.Bitmap var sbsIndexes []int + var oldSegIdxs []int - for i, segmentSnapshot := range snapshot.segment { - if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); !ok { - sbs = append(sbs, segmentSnapshot.segment) - sbsDrops = append(sbsDrops, segmentSnapshot.deleted) - sbsIndexes = append(sbsIndexes, i) + flushSet := make([]*flushable, 0) + var totSize int + var numSegsToFlushOut int + var totDocs uint64 + + // legacy behaviour of merge + flush of all in-memory segments in one shot + if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { + val := &flushable{ + segments: make([]segment.Segment, 0), + drops: make([]*roaring.Bitmap, 0), + sbIdxs: make([]int, 0), + totDocs: totDocs, + } + for i, snapshot := range snapshot.segment { + if _, ok := snapshot.segment.(segment.PersistedSegment); !ok { + val.segments = append(val.segments, snapshot.segment) + val.drops = append(val.drops, snapshot.deleted) + val.sbIdxs = append(val.sbIdxs, i) + oldSegIdxs = append(oldSegIdxs, i) + val.totDocs += snapshot.segment.Count() + numSegsToFlushOut++ + } + } + + flushSet = append(flushSet, val) + } else { + // constructs a flushSet where each flushable object contains a set of segments + // to be merged and flushed out to disk. + for i, snapshot := range snapshot.segment { + if totSize >= po.MaxSizeInMemoryMergePerWorker && + len(sbs) >= DefaultMinSegmentsForInMemoryMerge { + numSegsToFlushOut += len(sbs) + val := &flushable{ + segments: slices.Clone(sbs), + drops: slices.Clone(sbsDrops), + sbIdxs: slices.Clone(sbsIndexes), + totDocs: totDocs, + } + flushSet = append(flushSet, val) + oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
+ + sbs, sbsDrops, sbsIndexes = sbs[:0], sbsDrops[:0], sbsIndexes[:0] + totSize, totDocs = 0, 0 + } + + if len(flushSet) >= int(po.NumPersisterWorkers) { + break + } + + if _, ok := snapshot.segment.(segment.PersistedSegment); !ok { + sbs = append(sbs, snapshot.segment) + sbsDrops = append(sbsDrops, snapshot.deleted) + sbsIndexes = append(sbsIndexes, i) + totDocs += snapshot.segment.Count() + totSize += snapshot.segment.Size() + } + } + // if there were too few segments, just merge them all as part of a single worker + if len(flushSet) < po.NumPersisterWorkers { + numSegsToFlushOut += len(sbs) + val := &flushable{ + segments: slices.Clone(sbs), + drops: slices.Clone(sbsDrops), + sbIdxs: slices.Clone(sbsIndexes), + totDocs: totDocs, + } + flushSet = append(flushSet, val) + oldSegIdxs = append(oldSegIdxs, sbsIndexes...) } } - if len(sbs) < DefaultMinSegmentsForInMemoryMerge { + if numSegsToFlushOut < DefaultMinSegmentsForInMemoryMerge { return false, nil } - newSnapshot, newSegmentID, err := s.mergeSegmentBases( - snapshot, sbs, sbsDrops, sbsIndexes) + // drains out (after merging in memory) the segments in the flushSet in parallel + newSnapshot, newSegmentIDs, err := s.mergeSegmentBasesParallel(snapshot, flushSet) if err != nil { return false, err } + if newSnapshot == nil { return false, nil } @@ -397,10 +496,15 @@ }() mergedSegmentIDs := map[uint64]struct{}{} - for _, idx := range sbsIndexes { + for _, idx := range oldSegIdxs { mergedSegmentIDs[snapshot.segment[idx].id] = struct{}{} } + newMergedSegmentIDs := make(map[uint64]struct{}, len(newSegmentIDs)) + for _, id := range newSegmentIDs { + newMergedSegmentIDs[id] = struct{}{} + } + // construct a snapshot that's logically equivalent to the input // snapshot, but with merged segments replaced by the new segment equiv := &IndexSnapshot{ @@ -411,18 +515,25 @@ creator: "persistSnapshotMaybeMerge", }
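The flushSet construction above is a size-capped batching loop: a batch is sealed once it crosses MaxSizeInMemoryMergePerWorker, the number of batches is capped at NumPersisterWorkers, leftovers are folded into a final batch, and anything beyond the cap is deferred to a later persist cycle. A self-contained sketch of that grouping, with plain int sizes standing in for in-memory segments:

package main

import "fmt"

// batchBySize mirrors the flushSet grouping: seal a batch once it reaches
// maxPerBatch, stop opening new batches at maxBatches, and fold whatever
// remains into one final batch
func batchBySize(sizes []int, maxPerBatch, maxBatches int) [][]int {
	var batches [][]int
	var cur []int
	curSize := 0
	for _, s := range sizes {
		if curSize >= maxPerBatch && len(cur) > 0 {
			batches = append(batches, cur)
			cur, curSize = nil, 0
		}
		if len(batches) >= maxBatches {
			break
		}
		cur = append(cur, s)
		curSize += s
	}
	if len(batches) < maxBatches && len(cur) > 0 {
		batches = append(batches, cur)
	}
	return batches
}

func main() {
	fmt.Println(batchBySize([]int{5, 7, 3, 9, 2}, 10, 2)) // [[5 7] [3 9]]
	// the trailing 2 is left behind for a later cycle, just as the vendored
	// loop leaves unbatched in-memory segments for the next persist pass
}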
+ // to track which segments haven't participated in the in-memory merge; + // they won't be flushed out to disk yet, but in the next cycle they will be + // merged in memory and then flushed out. This is to keep the number of + // on-disk files within limits. + exclude := make(map[uint64]struct{}) + // copy to the equiv the segments that weren't replaced for _, segment := range snapshot.segment { if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged { equiv.segment = append(equiv.segment, segment) + exclude[segment.id] = struct{}{} } } // append to the equiv the new segment for _, segment := range newSnapshot.segment { - if segment.id == newSegmentID { + if _, ok := newMergedSegmentIDs[segment.id]; ok { equiv.segment = append(equiv.segment, &SegmentSnapshot{ - id: newSegmentID, + id: segment.id, segment: segment.segment, deleted: nil, // nil since merging handled deletions stats: nil, @@ -431,7 +542,7 @@ } } - err = s.persistSnapshotDirect(equiv) + err = s.persistSnapshotDirect(equiv, exclude) if err != nil { return false, err } @@ -468,7 +579,8 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) { } func persistToDirectory(seg segment.UnpersistedSegment, d index.Directory, - path string) error { + path string, +) error { if d == nil { return seg.Persist(path) } @@ -490,7 +602,7 @@ func persistToDirectory(seg segment.UnpersistedSegment, d index.Directory, } func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, - segPlugin SegmentPlugin, d index.Directory) ( + segPlugin SegmentPlugin, exclude map[uint64]struct{}, d index.Directory) ( []string, map[uint64]string, error) { snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket) if err != nil { @@ -579,19 +691,22 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, } filenames = append(filenames, filename) case segment.UnpersistedSegment: - // need to persist this to disk - filename := zapFileName(segmentSnapshot.id) - path := filepath.Join(path, filename) - err := persistToDirectory(seg, d, path) - if err != nil { - return nil, nil, fmt.Errorf("segment: %s persist err: %v", path, err) + // need to persist this to disk if it's not part of the exclude list (which + // restricts which in-memory segments are persisted to disk) + if _, ok := exclude[segmentSnapshot.id]; !ok { + filename := zapFileName(segmentSnapshot.id) + path := filepath.Join(path, filename) + err := persistToDirectory(seg, d, path) + if err != nil { + return nil, nil, fmt.Errorf("segment: %s persist err: %v", path, err) + } + newSegmentPaths[segmentSnapshot.id] = path + err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + if err != nil { + return nil, nil, err + } + filenames = append(filenames, filename) } - newSegmentPaths[segmentSnapshot.id] = path - err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) - if err != nil { - return nil, nil, err - } - filenames = append(filenames, filename) default: return nil, nil, fmt.Errorf("unknown segment type: %T", seg) } @@ -624,7 +739,7 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, return filenames, newSegmentPaths, nil } -func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { +func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint64]struct{}) (err error) { // start a write transaction tx, err := s.rootBolt.Begin(true) if err != nil { @@ -637,7 +752,7 @@ } }() - filenames, newSegmentPaths, err := prepareBoltSnapshot(snapshot, tx, s.path, s.segPlugin, nil) + filenames, newSegmentPaths, err := prepareBoltSnapshot(snapshot, tx, s.path, s.segPlugin, exclude,
nil) if err != nil { return err } @@ -713,16 +828,18 @@ func zapFileName(epoch uint64) string { // bolt snapshot code -var boltSnapshotsBucket = []byte{'s'} -var boltPathKey = []byte{'p'} -var boltDeletedKey = []byte{'d'} -var boltInternalKey = []byte{'i'} -var boltMetaDataKey = []byte{'m'} -var boltMetaDataSegmentTypeKey = []byte("type") -var boltMetaDataSegmentVersionKey = []byte("version") -var boltMetaDataTimeStamp = []byte("timeStamp") -var boltStatsKey = []byte("stats") -var TotBytesWrittenKey = []byte("TotBytesWritten") +var ( + boltSnapshotsBucket = []byte{'s'} + boltPathKey = []byte{'p'} + boltDeletedKey = []byte{'d'} + boltInternalKey = []byte{'i'} + boltMetaDataKey = []byte{'m'} + boltMetaDataSegmentTypeKey = []byte("type") + boltMetaDataSegmentVersionKey = []byte("version") + boltMetaDataTimeStamp = []byte("timeStamp") + boltStatsKey = []byte("stats") + TotBytesWrittenKey = []byte("TotBytesWritten") +) func (s *Scorch) loadFromBolt() error { return s.rootBolt.View(func(tx *bolt.Tx) error { @@ -800,7 +917,6 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { } func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { - rv := &IndexSnapshot{ parent: s, internal: make(map[string][]byte), @@ -947,7 +1063,8 @@ var RollbackSamplingInterval = 0 * time.Minute var RollbackRetentionFactor = float64(0.5) func getTimeSeriesSnapshots(maxDataPoints int, interval time.Duration, - snapshots []*snapshotMetaData) (int, map[uint64]time.Time) { + snapshots []*snapshotMetaData, +) (int, map[uint64]time.Time) { if interval == 0 { return len(snapshots), map[uint64]time.Time{} } @@ -994,8 +1111,8 @@ func getTimeSeriesSnapshots(maxDataPoints int, interval time.Duration, // by a time duration of RollbackSamplingInterval. 
func getProtectedSnapshots(rollbackSamplingInterval time.Duration, numSnapshotsToKeep int, - persistedSnapshots []*snapshotMetaData) map[uint64]time.Time { - + persistedSnapshots []*snapshotMetaData, +) map[uint64]time.Time { lastPoint, protectedEpochs := getTimeSeriesSnapshots(numSnapshotsToKeep, rollbackSamplingInterval, persistedSnapshots) if len(protectedEpochs) < numSnapshotsToKeep { @@ -1162,7 +1279,8 @@ func (s *Scorch) removeOldZapFiles() error { // Hence we try to retain atleast retentionFactor portion worth of old snapshots // in such a scenario using the following function func getBoundaryCheckPoint(retentionFactor float64, - checkPoints []*snapshotMetaData, timeStamp time.Time) time.Time { + checkPoints []*snapshotMetaData, timeStamp time.Time, +) time.Time { if checkPoints != nil { boundary := checkPoints[int(math.Floor(float64(len(checkPoints))* retentionFactor))] diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go index 429d1daa9..54dcb9274 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go @@ -23,7 +23,7 @@ import ( "sync/atomic" "time" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/registry" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" @@ -110,7 +110,8 @@ type internalStats struct { func NewScorch(storeName string, config map[string]interface{}, - analysisQueue *index.AnalysisQueue) (index.Index, error) { + analysisQueue *index.AnalysisQueue, +) (index.Index, error) { rv := &Scorch{ version: Version, config: config, @@ -137,7 +138,9 @@ func NewScorch(storeName string, typ, ok := config["spatialPlugin"].(string) if ok { - rv.loadSpatialAnalyzerPlugin(typ) + if err := rv.loadSpatialAnalyzerPlugin(typ); err != nil { + return nil, err + } } rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} @@ -157,6 +160,18 @@ func NewScorch(storeName string, if ok { rv.onAsyncError = RegistryAsyncErrorCallbacks[aecbName] } + // validate any custom persister options to + // prevent an async error in the persister routine + _, err = rv.parsePersisterOptions() + if err != nil { + return nil, err + } + // validate any custom merge planner options to + // prevent an async error in the merger routine + _, err = rv.parseMergePlannerOptions() + if err != nil { + return nil, err + } return rv, nil } @@ -230,7 +245,7 @@ func (s *Scorch) openBolt() error { s.unsafeBatch = true } - var rootBoltOpt = *bolt.DefaultOptions + rootBoltOpt := *bolt.DefaultOptions if s.readOnly { rootBoltOpt.ReadOnly = true rootBoltOpt.OpenFile = func(path string, flag int, mode os.FileMode) (*os.File, error) { @@ -244,7 +259,7 @@ } else { if s.path != "" { - err := os.MkdirAll(s.path, 0700) + err := os.MkdirAll(s.path, 0o700) if err != nil { return err } @@ -263,7 +278,7 @@ rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt" var err error if s.path != "" { - s.rootBolt, err = bolt.Open(rootBoltPath, 0600, &rootBoltOpt) + s.rootBolt, err = bolt.Open(rootBoltPath, 0o600, &rootBoltOpt) if err != nil { return err } @@ -325,7 +340,9 @@ func (s *Scorch) openBolt() error { typ, ok := s.config["spatialPlugin"].(string) if ok { - s.loadSpatialAnalyzerPlugin(typ) + if err := s.loadSpatialAnalyzerPlugin(typ); err != nil { + return err + } }
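With the validation above, a malformed configuration now fails NewScorch immediately instead of surfacing later as an async error from the persister or merger routine. The new knobs arrive through the existing "scorchPersisterOptions" config key; a hedged sketch of how such a value decodes into the new fields (the struct copies only the relevant subset of persisterOptions, and the numbers are purely illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

// the relevant subset of scorch's persisterOptions, field names as in the diff
type persisterOptions struct {
	NumPersisterWorkers           int
	MaxSizeInMemoryMergePerWorker int
}

func main() {
	// what a caller might supply under the "scorchPersisterOptions" key;
	// 4 workers at 10 MiB per worker is an arbitrary example, not a default
	raw := []byte(`{"NumPersisterWorkers": 4, "MaxSizeInMemoryMergePerWorker": 10485760}`)

	// defaults from the diff: 1 worker, 0 meaning the legacy one-shot behaviour
	po := persisterOptions{NumPersisterWorkers: 1}
	if err := json.Unmarshal(raw, &po); err != nil {
		panic(err) // NewScorch would return this error instead of panicking
	}
	fmt.Printf("%+v\n", po) // {NumPersisterWorkers:4 MaxSizeInMemoryMergePerWorker:10485760}
}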
return nil @@ -424,6 +441,10 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { for itemsDeQueued < numUpdates { result := <-resultChan resultSize := result.Size() + // check if the document is searchable by the index + if result.Indexed() { + atomic.AddUint64(&s.stats.TotMutationsFiltered, 1) + } atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize)) totalAnalysisSize += resultSize analysisResults[itemsDeQueued] = result @@ -477,8 +498,8 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats) error { - + internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats, +) error { // new introduction introduction := &segmentIntroduction{ id: atomic.AddUint64(&s.nextSegmentID, 1), @@ -572,7 +593,8 @@ func (s *Scorch) BytesReadQueryTime() uint64 { } func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64, - uint64, uint64) { + uint64, uint64, +) { var numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot uint64 if s.path != "" { files, err := os.ReadDir(s.path) @@ -635,6 +657,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["term_searchers_started"] = m["TotTermSearchersStarted"] m["term_searchers_finished"] = m["TotTermSearchersFinished"] m["knn_searches"] = m["TotKNNSearches"] + m["synonym_searches"] = m["TotSynonymSearches"] + m["total_mutations_filtered"] = m["TotMutationsFiltered"] m["num_bytes_read_at_query_time"] = m["TotBytesReadAtQueryTime"] m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] @@ -657,6 +681,17 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] + // the bool stat `index_bgthreads_active` indicates whether the background routines + // (which are responsible for the index to attain a steady state) are still + // doing some work. + if rootEpoch, ok := m["CurRootEpoch"].(uint64); ok { + if lastMergedEpoch, ok := m["LastMergedEpoch"].(uint64); ok { + if lastPersistedEpoch, ok := m["LastPersistedEpoch"].(uint64); ok { + m["index_bgthreads_active"] = !(lastMergedEpoch == rootEpoch && lastPersistedEpoch == rootEpoch) + } + } + } + // calculate the aggregate of all the segment's field stats aggFieldStats := newFieldStats() for _, segmentSnapshot := range indexSnapshot.Segments() { @@ -705,6 +740,27 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { d.VisitComposite(func(cf index.CompositeField) { cf.Compose(field.Name(), field.AnalyzedLength(), field.AnalyzedTokenFrequencies()) }) + // Since the encoded geoShape is only necessary within the doc values + // of the geoShapeField, it has been removed from the field's term dictionary. 
+ // However, '_all' field uses its term dictionary as its docValues, so it + // becomes necessary to add the geoShape into the '_all' field's term dictionary + if f, ok := field.(index.GeoShapeField); ok { + d.VisitComposite(func(cf index.CompositeField) { + geoshape := f.EncodedShape() + cf.Compose(field.Name(), 1, index.TokenFrequencies{ + string(geoshape): &index.TokenFreq{ + Term: geoshape, + Locations: []*index.TokenLocation{ + { + Start: 0, + End: len(geoshape), + Position: 1, + }, + }, + }, + }) + }) + } } } }) @@ -771,7 +827,10 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) { } func init() { - registry.RegisterIndexType(Name, NewScorch) + err := registry.RegisterIndexType(Name, NewScorch) + if err != nil { + panic(err) + } } func parseToTimeDuration(i interface{}) (time.Duration, error) { @@ -812,7 +871,6 @@ func (fs *fieldStats) Store(statName, fieldName string, value uint64) { // Combine the given stats map with the existing map func (fs *fieldStats) Aggregate(stats segment.FieldStats) { - statMap := stats.Fetch() if statMap == nil { return diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go index b3b9ba01f..790a8008a 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go @@ -17,10 +17,9 @@ package scorch import ( "fmt" - "github.com/RoaringBitmap/roaring" - index "github.com/blevesearch/bleve_index_api" - + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/geo" + index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" zapv11 "github.com/blevesearch/zapx/v11" diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go index 51ffc859b..aa17ce1f3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go @@ -26,7 +26,7 @@ import ( "sync" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/document" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" @@ -42,10 +42,11 @@ type asynchSegmentResult struct { dict segment.TermDictionary dictItr segment.DictionaryIterator - index int - docs *roaring.Bitmap + cardinality int + index int + docs *roaring.Bitmap - postings segment.PostingsList + thesItr segment.ThesaurusIterator err error } @@ -133,10 +134,11 @@ func (i *IndexSnapshot) updateSize() { func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator, - randomLookup bool) (*IndexSnapshotFieldDict, error) { - + randomLookup bool, +) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) var totalBytesRead uint64 + var fieldCardinality int64 for _, s := range is.segment { go func(s *SegmentSnapshot) { dict, err := s.segment.Dictionary(field) @@ -146,6 +148,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string, if dictStats, ok := dict.(segment.DiskStatsReporter); ok { atomic.AddUint64(&totalBytesRead, dictStats.BytesRead()) } + atomic.AddInt64(&fieldCardinality, int64(dict.Cardinality())) if randomLookup { results <- &asynchSegmentResult{dict: dict} } else { @@ -160,6 +163,7 @@ func 
(is *IndexSnapshot) newIndexSnapshotFieldDict(field string, snapshot: is, cursors: make([]*segmentDictCursor, 0, len(is.segment)), } + for count := 0; count < len(is.segment); count++ { asr := <-results if asr.err != nil && err == nil { @@ -183,6 +187,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string, } } } + rv.cardinality = int(fieldCardinality) rv.bytesRead = totalBytesRead // after ensuring we've read all items on channel if err != nil { @@ -225,7 +230,8 @@ func calculateExclusiveEndFromInclusiveEnd(inclusiveEnd []byte) []byte { } func (is *IndexSnapshot) FieldDictRange(field string, startTerm []byte, - endTerm []byte) (index.FieldDict, error) { + endTerm []byte, +) (index.FieldDict, error) { return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { endTermExclusive := calculateExclusiveEndFromInclusiveEnd(endTerm) return is.AutomatonIterator(nil, startTerm, endTermExclusive) @@ -251,7 +257,8 @@ func calculateExclusiveEndFromPrefix(in []byte) []byte { } func (is *IndexSnapshot) FieldDictPrefix(field string, - termPrefix []byte) (index.FieldDict, error) { + termPrefix []byte, +) (index.FieldDict, error) { termPrefixEnd := calculateExclusiveEndFromPrefix(termPrefix) return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { return is.AutomatonIterator(nil, termPrefix, termPrefixEnd) @@ -259,22 +266,41 @@ func (is *IndexSnapshot) FieldDictPrefix(field string, } func (is *IndexSnapshot) FieldDictRegexp(field string, - termRegex string) (index.FieldDict, error) { + termRegex string, +) (index.FieldDict, error) { + fd, _, err := is.FieldDictRegexpAutomaton(field, termRegex) + return fd, err +} + +func (is *IndexSnapshot) FieldDictRegexpAutomaton(field string, + termRegex string, +) (index.FieldDict, index.RegexAutomaton, error) { + return is.fieldDictRegexp(field, termRegex) +} + +func (is *IndexSnapshot) fieldDictRegexp(field string, + termRegex string, +) (index.FieldDict, index.RegexAutomaton, error) { // TODO: potential optimization where the literal prefix represents the, // entire regexp, allowing us to use PrefixIterator(prefixTerm)? 
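FieldDictRegexp is now a thin wrapper over FieldDictRegexpAutomaton, which additionally returns the automaton so a caller can keep matching terms against it without recompiling the pattern. A sketch of that two-entry-point shape, using the standard library's regexp as a stand-in for the vellum automaton (the function names here are illustrative, not bleve APIs):

package main

import (
	"fmt"
	"regexp"
)

// filter returns just the matches, discarding the matcher
func filter(words []string, expr string) ([]string, error) {
	out, _, err := filterWithMatcher(words, expr)
	return out, err
}

// filterWithMatcher also hands back the compiled matcher for reuse
func filterWithMatcher(words []string, expr string) ([]string, *regexp.Regexp, error) {
	re, err := regexp.Compile(expr) // stand-in for parseRegexp's automaton
	if err != nil {
		return nil, nil, err
	}
	var out []string
	for _, w := range words {
		if re.MatchString(w) {
			out = append(out, w)
		}
	}
	return out, re, nil
}

func main() {
	simple, _ := filter([]string{"cat", "car", "dog"}, `^ca.$`)
	matches, re, _ := filterWithMatcher([]string{"cat", "car", "dog"}, `^ca.$`)
	fmt.Println(simple, matches, re.MatchString("cab")) // [cat car] [cat car] true
}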
a, prefixBeg, prefixEnd, err := parseRegexp(termRegex) if err != nil { - return nil, err + return nil, nil, err } - return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { + fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { return is.AutomatonIterator(a, prefixBeg, prefixEnd) }, false) + if err != nil { + return nil, nil, err + } + return fd, a, nil } func (is *IndexSnapshot) getLevAutomaton(term string, - fuzziness uint8) (vellum.Automaton, error) { + fuzziness uint8, +) (vellum.Automaton, error) { if fuzziness == 1 { return lb1.BuildDfa(term, fuzziness) } else if fuzziness == 2 { @@ -284,21 +310,41 @@ func (is *IndexSnapshot) getLevAutomaton(term string, } func (is *IndexSnapshot) FieldDictFuzzy(field string, - term string, fuzziness int, prefix string) (index.FieldDict, error) { + term string, fuzziness int, prefix string, +) (index.FieldDict, error) { + fd, _, err := is.FieldDictFuzzyAutomaton(field, term, fuzziness, prefix) + return fd, err +} + +func (is *IndexSnapshot) FieldDictFuzzyAutomaton(field string, + term string, fuzziness int, prefix string, +) (index.FieldDict, index.FuzzyAutomaton, error) { + return is.fieldDictFuzzy(field, term, fuzziness, prefix) +} + +func (is *IndexSnapshot) fieldDictFuzzy(field string, + term string, fuzziness int, prefix string, +) (index.FieldDict, index.FuzzyAutomaton, error) { a, err := is.getLevAutomaton(term, uint8(fuzziness)) if err != nil { - return nil, err + return nil, nil, err + } + var fa index.FuzzyAutomaton + if vfa, ok := a.(vellum.FuzzyAutomaton); ok { + fa = vfa } - var prefixBeg, prefixEnd []byte if prefix != "" { prefixBeg = []byte(prefix) prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg) } - - return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { + fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { return is.AutomatonIterator(a, prefixBeg, prefixEnd) }, false) + if err != nil { + return nil, nil, err + } + return fd, fa, nil } func (is *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) { @@ -394,7 +440,7 @@ func (is *IndexSnapshot) DocCount() (uint64, error) { func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) { // FIXME could be done more efficiently directly, but reusing for simplicity - tfr, err := is.TermFieldReader(nil, []byte(id), "_id", false, false, false) + tfr, err := is.TermFieldReader(context.TODO(), []byte(id), "_id", false, false, false) if err != nil { return nil, err } @@ -473,31 +519,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in return is.offsets[x] > docNum }) - 1 - localDocNum := is.localDocNumFromGlobal(segmentIndex, docNum) - return int(segmentIndex), localDocNum -} - -// This function returns the local docnum, given the segment index and global docnum -func (is *IndexSnapshot) localDocNumFromGlobal(segmentIndex int, docNum uint64) uint64 { - return docNum - is.offsets[segmentIndex] -} - -// Function to return a mapping of the segment index to the live global doc nums -// in the segment of the specified index snapshot. 
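getLevAutomaton above only accepts fuzziness 1 or 2 because the vellum builders (lb1, lb2) are precomputed for those edit distances. The predicate the resulting DFA answers per dictionary term is ordinary Levenshtein distance; a naive dynamic-programming version of that check, for intuition only (the DFA reaches the same answer in a single pass over the candidate):

package main

import "fmt"

// levenshtein computes the edit distance between a and b with the classic
// two-row dynamic program; the vellum DFA answers "distance <= fuzziness?"
// without ever materializing this table
func levenshtein(a, b string) int {
	prev := make([]int, len(b)+1)
	curr := make([]int, len(b)+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= len(a); i++ {
		curr[0] = i
		for j := 1; j <= len(b); j++ {
			cost := 1
			if a[i-1] == b[j-1] {
				cost = 0
			}
			curr[j] = min(prev[j]+1, min(curr[j-1]+1, prev[j-1]+cost))
		}
		prev, curr = curr, prev
	}
	return prev[len(b)]
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func main() {
	fmt.Println(levenshtein("smith", "smyth") <= 1) // true: one substitution
}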
-func (is *IndexSnapshot) globalDocNums() map[int]*roaring.Bitmap { - if len(is.segment) == 0 { - return nil - } - - segmentIndexGlobalDocNums := make(map[int]*roaring.Bitmap) - - for i := range is.segment { - segmentIndexGlobalDocNums[i] = roaring.NewBitmap() - for _, localDocNum := range is.segment[i].DocNumbersLive().ToArray() { - segmentIndexGlobalDocNums[i].Add(localDocNum + uint32(is.offsets[i])) - } - } - return segmentIndexGlobalDocNums + return int(segmentIndex), docNum - is.offsets[segmentIndex] } func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { @@ -518,9 +540,18 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { return string(v), nil } +func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) (int, uint64, error) { + docNum, err := docInternalToNumber(id) + if err != nil { + return 0, 0, err + } + segIdx, localDocNum := is.segmentIndexAndLocalDocNumFromGlobal(docNum) + return segIdx, localDocNum, nil +} + func (is *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { // FIXME could be done more efficiently directly, but reusing for simplicity - tfr, err := is.TermFieldReader(nil, []byte(id), "_id", false, false, false) + tfr, err := is.TermFieldReader(context.TODO(), []byte(id), "_id", false, false, false) if err != nil { return nil, err } @@ -539,7 +570,8 @@ func (is *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err er } func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, - includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { + includeNorm, includeTermVectors bool, +) (index.TermFieldReader, error) { rv := is.allocTermFieldReaderDicts(field) rv.ctx = ctx @@ -706,7 +738,8 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) { func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( segmentIndex int, localDocNum uint64, fields []string, cFields []string, visitor index.DocValueVisitor, dvs segment.DocVisitState) ( - cFieldsOut []string, dvsOut segment.DocVisitState, err error) { + cFieldsOut []string, dvsOut segment.DocVisitState, err error, +) { ss := is.segment[segmentIndex] var vFields []string // fields that are visitable via the segment @@ -763,7 +796,8 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( } func (is *IndexSnapshot) DocValueReader(fields []string) ( - index.DocValueReader, error) { + index.DocValueReader, error, +) { return &DocValueReader{i: is, fields: fields, currSegmentIndex: -1}, nil } @@ -784,7 +818,8 @@ func (dvr *DocValueReader) BytesRead() uint64 { } func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, - visitor index.DocValueVisitor) (err error) { + visitor index.DocValueVisitor, +) (err error) { docNum, err := docInternalToNumber(id) if err != nil { return err @@ -892,7 +927,7 @@ func (is *IndexSnapshot) CopyTo(d index.Directory) error { return fmt.Errorf("invalid root.bolt file found") } - copyBolt, err := bolt.Open(rootFile.Name(), 0600, nil) + copyBolt, err := bolt.Open(rootFile.Name(), 0o600, nil) if err != nil { return err } @@ -909,7 +944,7 @@ func (is *IndexSnapshot) CopyTo(d index.Directory) error { return err } - _, _, err = prepareBoltSnapshot(is, tx, "", is.parent.segPlugin, d) + _, _, err = prepareBoltSnapshot(is, tx, "", is.parent.segPlugin, nil, d) if err != nil { _ = tx.Rollback() return fmt.Errorf("error backing up index snapshot: %v", err) @@ -929,7 +964,8 @@ func (is *IndexSnapshot) 
UpdateIOStats(val uint64) { } func (is *IndexSnapshot) GetSpatialAnalyzerPlugin(typ string) ( - index.SpatialAnalyzerPlugin, error) { + index.SpatialAnalyzerPlugin, error, +) { var rv index.SpatialAnalyzerPlugin is.m.Lock() rv = is.parent.spatialPlugin @@ -963,3 +999,144 @@ func (is *IndexSnapshot) CloseCopyReader() error { // close the index snapshot normally return is.Close() } + +func (is *IndexSnapshot) ThesaurusTermReader(ctx context.Context, thesaurusName string, term []byte) (index.ThesaurusTermReader, error) { + rv := &IndexSnapshotThesaurusTermReader{} + rv.name = thesaurusName + rv.snapshot = is + if rv.postings == nil { + rv.postings = make([]segment.SynonymsList, len(is.segment)) + } + if rv.iterators == nil { + rv.iterators = make([]segment.SynonymsIterator, len(is.segment)) + } + rv.segmentOffset = 0 + + if rv.thesauri == nil { + rv.thesauri = make([]segment.Thesaurus, len(is.segment)) + for i, s := range is.segment { + if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok { + thes, err := synSeg.Thesaurus(thesaurusName) + if err != nil { + return nil, err + } + rv.thesauri[i] = thes + } + } + } + + for i, s := range is.segment { + if _, ok := s.segment.(segment.ThesaurusSegment); ok { + pl, err := rv.thesauri[i].SynonymsList(term, s.deleted, rv.postings[i]) + if err != nil { + return nil, err + } + rv.postings[i] = pl + + rv.iterators[i] = pl.Iterator(rv.iterators[i]) + } + } + return rv, nil +} + +func (is *IndexSnapshot) newIndexSnapshotThesaurusKeys(name string, + makeItr func(i segment.Thesaurus) segment.ThesaurusIterator, +) (*IndexSnapshotThesaurusKeys, error) { + results := make(chan *asynchSegmentResult, len(is.segment)) + var wg sync.WaitGroup + wg.Add(len(is.segment)) + for _, s := range is.segment { + go func(s *SegmentSnapshot) { + defer wg.Done() + if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok { + thes, err := synSeg.Thesaurus(name) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{thesItr: makeItr(thes)} + } + } + }(s) + } + // Close the channel after all goroutines complete + go func() { + wg.Wait() + close(results) + }() + + var err error + rv := &IndexSnapshotThesaurusKeys{ + snapshot: is, + cursors: make([]*segmentThesCursor, 0, len(is.segment)), + } + for asr := range results { + if asr.err != nil && err == nil { + err = asr.err + } else { + next, err2 := asr.thesItr.Next() + if err2 != nil && err == nil { + err = err2 + } + if next != nil { + rv.cursors = append(rv.cursors, &segmentThesCursor{ + itr: asr.thesItr, + curr: *next, + }) + } + } + } + // after ensuring we've read all items on channel + if err != nil { + return nil, err + } + + return rv, nil +} + +func (is *IndexSnapshot) ThesaurusKeys(name string) (index.ThesaurusKeys, error) { + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(nil, nil, nil) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysFuzzy(name string, + term string, fuzziness int, prefix string, +) (index.ThesaurusKeys, error) { + a, err := is.getLevAutomaton(term, uint8(fuzziness)) + if err != nil { + return nil, err + } + var prefixBeg, prefixEnd []byte + if prefix != "" { + prefixBeg = []byte(prefix) + prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg) + } + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysPrefix(name 
string, + termPrefix []byte, +) (index.ThesaurusKeys, error) { + termPrefixEnd := calculateExclusiveEndFromPrefix(termPrefix) + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(nil, termPrefix, termPrefixEnd) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysRegexp(name string, + termRegex string, +) (index.ThesaurusKeys, error) { + a, prefixBeg, prefixEnd, err := parseRegexp(termRegex) + if err != nil { + return nil, err + } + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} + +func (is *IndexSnapshot) UpdateSynonymSearchCount(delta uint64) { + atomic.AddUint64(&is.parent.stats.TotSynonymSearches, delta) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_dict.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_dict.go index 658aa8148..2ae789c6b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_dict.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_dict.go @@ -28,10 +28,12 @@ type segmentDictCursor struct { } type IndexSnapshotFieldDict struct { - snapshot *IndexSnapshot - cursors []*segmentDictCursor - entry index.DictEntry - bytesRead uint64 + cardinality int + bytesRead uint64 + + snapshot *IndexSnapshot + cursors []*segmentDictCursor + entry index.DictEntry } func (i *IndexSnapshotFieldDict) BytesRead() uint64 { @@ -94,6 +96,10 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { return &i.entry, nil } +func (i *IndexSnapshotFieldDict) Cardinality() int { + return i.cardinality +} + func (i *IndexSnapshotFieldDict) Close() error { return nil } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go index fe174e7e3..0a979bfb5 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go @@ -18,7 +18,7 @@ import ( "bytes" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" ) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_str.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_str.go new file mode 100644 index 000000000..d6281e832 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_str.go @@ -0,0 +1,75 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/size" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizeIndexSnapshotThesaurusTermReader int + +func init() { + var istr IndexSnapshotThesaurusTermReader + reflectStaticSizeIndexSnapshotThesaurusTermReader = int(reflect.TypeOf(istr).Size()) +} + +type IndexSnapshotThesaurusTermReader struct { + name string + snapshot *IndexSnapshot + thesauri []segment.Thesaurus + postings []segment.SynonymsList + iterators []segment.SynonymsIterator + segmentOffset int +} + +func (i *IndexSnapshotThesaurusTermReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotThesaurusTermReader + size.SizeOfPtr + + len(i.name) + size.SizeOfString + + for _, postings := range i.postings { + sizeInBytes += postings.Size() + } + + for _, iterator := range i.iterators { + sizeInBytes += iterator.Size() + } + + return sizeInBytes +} + +func (i *IndexSnapshotThesaurusTermReader) Next() (string, error) { + // find the next hit + for i.segmentOffset < len(i.iterators) { + if i.iterators[i.segmentOffset] != nil { + next, err := i.iterators[i.segmentOffset].Next() + if err != nil { + return "", err + } + if next != nil { + synTerm := next.Term() + return synTerm, nil + } + i.segmentOffset++ + } + } + return "", nil +} + +func (i *IndexSnapshotThesaurusTermReader) Close() error { + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go index 9f0315fa8..48ba35682 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go @@ -146,7 +146,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { - i2, err := i.snapshot.TermFieldReader(nil, i.term, i.field, + i2, err := i.snapshot.TermFieldReader(context.TODO(), i.term, i.field, i.includeFreq, i.includeNorm, i.includeTermVectors) if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_thes.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_thes.go new file mode 100644 index 000000000..6f3aae818 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_thes.go @@ -0,0 +1,107 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
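The Next() method of IndexSnapshotThesaurusKeys in the new file below performs a k-way merge over per-segment thesaurus iterators with container/heap: emit the smallest current term, advance that cursor, then fix or pop the heap. A compact standalone illustration of the same pattern, with sorted string slices standing in for thesaurus cursors (types here are illustrative):

package main

import (
	"container/heap"
	"fmt"
)

type cursor struct {
	items []string
	pos   int
}

// cursorHeap orders cursors by their current item, smallest first
type cursorHeap []*cursor

func (h cursorHeap) Len() int            { return len(h) }
func (h cursorHeap) Less(a, b int) bool  { return h[a].items[h[a].pos] < h[b].items[h[b].pos] }
func (h cursorHeap) Swap(a, b int)       { h[a], h[b] = h[b], h[a] }
func (h *cursorHeap) Push(x interface{}) { *h = append(*h, x.(*cursor)) }
func (h *cursorHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

func merged(streams [][]string) []string {
	h := &cursorHeap{}
	for _, s := range streams {
		if len(s) > 0 {
			*h = append(*h, &cursor{items: s})
		}
	}
	heap.Init(h)
	var out []string
	for h.Len() > 0 {
		c := (*h)[0]
		out = append(out, c.items[c.pos])
		c.pos++
		if c.pos == len(c.items) {
			heap.Pop(h) // cursor exhausted: drop it
		} else {
			heap.Fix(h, 0) // head advanced: restore heap order
		}
	}
	return out
}

func main() {
	fmt.Println(merged([][]string{{"a", "c"}, {"b", "d"}})) // [a b c d]
	// unlike this sketch, the vendored Next() also skips repeats of the
	// same term arriving from different segments
}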
+ +package scorch + +import ( + "container/heap" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +type segmentThesCursor struct { + thes segment.Thesaurus + itr segment.ThesaurusIterator + curr index.ThesaurusEntry +} + +type IndexSnapshotThesaurusKeys struct { + snapshot *IndexSnapshot + cursors []*segmentThesCursor + entry index.ThesaurusEntry +} + +func (i *IndexSnapshotThesaurusKeys) Len() int { return len(i.cursors) } +func (i *IndexSnapshotThesaurusKeys) Less(a, b int) bool { + return i.cursors[a].curr.Term < i.cursors[b].curr.Term +} +func (i *IndexSnapshotThesaurusKeys) Swap(a, b int) { + i.cursors[a], i.cursors[b] = i.cursors[b], i.cursors[a] +} + +func (i *IndexSnapshotThesaurusKeys) Push(x interface{}) { + i.cursors = append(i.cursors, x.(*segmentThesCursor)) +} + +func (i *IndexSnapshotThesaurusKeys) Pop() interface{} { + n := len(i.cursors) + x := i.cursors[n-1] + i.cursors = i.cursors[0 : n-1] + return x +} + +func (i *IndexSnapshotThesaurusKeys) Next() (*index.ThesaurusEntry, error) { + if len(i.cursors) == 0 { + return nil, nil + } + i.entry = i.cursors[0].curr + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = *next + heap.Fix(i, 0) + } + // look for any other entries with the exact same term + for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = *next + heap.Fix(i, 0) + } + } + + return &i.entry, nil +} + +func (i *IndexSnapshotThesaurusKeys) Close() error { + return nil +} + +func (i *IndexSnapshotThesaurusKeys) Contains(key []byte) (bool, error) { + if len(i.cursors) == 0 { + return false, nil + } + + for _, cursor := range i.cursors { + if found, _ := cursor.thes.Contains(key); found { + return true, nil + } + } + + return false, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go index 320364bc7..7c6741125 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go @@ -24,7 +24,6 @@ import ( "fmt" "reflect" - "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" @@ -51,32 +50,8 @@ type IndexSnapshotVectorReader struct { currID index.IndexInternalID ctx context.Context - searchParams json.RawMessage - - // The following fields are only applicable for vector readers which will - // process pre-filtered kNN queries. - eligibleDocIDs []index.IndexInternalID -} - -// Function to convert the internal IDs of the eligible documents to a type suitable -// for addition to a bitmap. -// Useful to have the eligible doc IDs in a bitmap to leverage the fast intersection -// (AND) operations. Eg. finding the eligible doc IDs present in a segment. 
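
The comment above, on the helper this patch removes, explains why eligible doc IDs were collected into a roaring bitmap: intersection with a segment's doc numbers becomes a single fast AND. A small runnable sketch using the roaring/v2 API this patch migrates to (the values are made up for illustration):

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring/v2"
)

func main() {
	eligible := roaring.New()
	eligible.AddMany([]uint32{2, 5, 9}) // doc numbers that passed the pre-filter
	segmentDocs := roaring.BitmapOf(1, 2, 3, 9) // doc numbers present in one segment
	eligible.And(segmentDocs) // one fast intersection instead of per-ID lookups
	fmt.Println(eligible.ToArray()) // prints [2 9]
}
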
-func (i *IndexSnapshotVectorReader) getEligibleDocIDs() *roaring.Bitmap { - res := roaring.NewBitmap() - if len(i.eligibleDocIDs) > 0 { - internalDocIDs := make([]uint32, 0, len(i.eligibleDocIDs)) - // converts the doc IDs to uint32 and returns - for _, eligibleDocInternalID := range i.eligibleDocIDs { - internalDocID, err := docInternalToNumber(index.IndexInternalID(eligibleDocInternalID)) - if err != nil { - continue - } - internalDocIDs = append(internalDocIDs, uint32(internalDocID)) - } - res.AddMany(internalDocIDs) - } - return res + searchParams json.RawMessage + eligibleSelector index.EligibleDocumentSelector } func (i *IndexSnapshotVectorReader) Size() int { @@ -134,17 +109,8 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, preAlloced *index.VectorDoc) (*index.VectorDoc, error) { if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { - var i2 index.VectorReader - var err error - - if len(i.eligibleDocIDs) > 0 { - i2, err = i.snapshot.VectorReaderWithFilter(i.ctx, i.vector, i.field, - i.k, i.searchParams, i.eligibleDocIDs) - } else { - i2, err = i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, - i.searchParams) - } - + i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, + i.searchParams, i.eligibleSelector) if err != nil { return nil, err } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go index 96e59a31d..ec65bf800 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go @@ -20,7 +20,7 @@ import ( "sync" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go index bcb05024d..db5e06745 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go @@ -20,47 +20,23 @@ package scorch import ( "context" "encoding/json" + "fmt" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" ) func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32, - field string, k int64, searchParams json.RawMessage) ( - index.VectorReader, error) { - - rv := &IndexSnapshotVectorReader{ - vector: vector, - field: field, - k: k, - snapshot: is, - searchParams: searchParams, - } - - if rv.postings == nil { - rv.postings = make([]segment_api.VecPostingsList, len(is.segment)) - } - if rv.iterators == nil { - rv.iterators = make([]segment_api.VecPostingsIterator, len(is.segment)) - } - - // initialize postings and iterators within the OptimizeVR's Finish() - - return rv, nil -} - -func (is *IndexSnapshot) VectorReaderWithFilter(ctx context.Context, vector []float32, field string, k int64, searchParams json.RawMessage, - filterIDs []index.IndexInternalID) ( + eligibleSelector index.EligibleDocumentSelector) ( index.VectorReader, error) { - rv := &IndexSnapshotVectorReader{ - vector: vector, - field: field, - k: k, - snapshot: is, - searchParams: searchParams, - eligibleDocIDs: filterIDs, + vector: vector, + field: field, + k: k, + 
snapshot:     is,
+		searchParams:     searchParams,
+		eligibleSelector: eligibleSelector,
 	}
 
 	if rv.postings == nil {
@@ -69,8 +45,41 @@ func (is *IndexSnapshot) VectorReaderWithFilter(ctx context.Context, vector []fl
 	if rv.iterators == nil {
 		rv.iterators = make([]segment_api.VecPostingsIterator, len(is.segment))
 	}
-
 	// initialize postings and iterators within the OptimizeVR's Finish()
-
 	return rv, nil
 }
+
+// eligibleDocumentSelector is used to identify the documents that are eligible
+// for the KNN search, as determined by a pre-filter query.
+type eligibleDocumentSelector struct {
+	// segment ID -> segment local doc nums
+	eligibleDocNums map[int][]uint64
+	is              *IndexSnapshot
+}
+
+// SegmentEligibleDocs returns the list of eligible local doc numbers for the given segment.
+func (eds *eligibleDocumentSelector) SegmentEligibleDocs(segmentID int) []uint64 {
+	return eds.eligibleDocNums[segmentID]
+}
+
+// AddEligibleDocumentMatch adds a document match to the list of eligible documents.
+func (eds *eligibleDocumentSelector) AddEligibleDocumentMatch(id index.IndexInternalID) error {
+	if eds.is == nil {
+		return fmt.Errorf("eligibleDocumentSelector is not initialized with IndexSnapshot")
+	}
+	// Get the segment number and the local doc number for this document.
+	segIdx, docNum, err := eds.is.segmentIndexAndLocalDocNum(id)
+	if err != nil {
+		return err
+	}
+	// Add the local doc number to the list of eligible doc numbers for this segment.
+	eds.eligibleDocNums[segIdx] = append(eds.eligibleDocNums[segIdx], docNum)
+	return nil
+}
+
+func (is *IndexSnapshot) NewEligibleDocumentSelector() index.EligibleDocumentSelector {
+	return &eligibleDocumentSelector{
+		eligibleDocNums: map[int][]uint64{},
+		is:              is,
+	}
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
index 269ae2f63..9abc8ba96 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go
@@ -51,7 +51,8 @@ type Stats struct {
 	TotTermSearchersStarted  uint64
 	TotTermSearchersFinished uint64
 
-	TotKNNSearches uint64
+	TotKNNSearches     uint64
+	TotSynonymSearches uint64
 
 	TotEventTriggerStarted   uint64
 	TotEventTriggerCompleted uint64
@@ -80,6 +81,7 @@ type Stats struct {
 	TotPersistedItems    uint64
 	TotItemsToPersist    uint64
 	TotPersistedSegments uint64
+	TotMutationsFiltered uint64
 
 	TotPersisterSlowMergerPause  uint64
 	TotPersisterSlowMergerResume uint64
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go
index 8221b23e3..411ef2a35 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go
@@ -18,7 +18,7 @@ import (
 	"math"
 	"reflect"
 
-	"github.com/RoaringBitmap/roaring"
+	"github.com/RoaringBitmap/roaring/v2"
 	segment "github.com/blevesearch/scorch_segment_api/v2"
 )
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/field_dict.go b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/field_dict.go
index 4875680c9..c990fd47b 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/field_dict.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/field_dict.go
@@ -77,6 +77,10 @@ func (r *UpsideDownCouchFieldDict) Next() (*index.DictEntry, error) {
 
 }
 
+func (r *UpsideDownCouchFieldDict) Cardinality() int {
+	return 0
+}
+
 func (r *UpsideDownCouchFieldDict) Close() error {
 	return
r.iterator.Close() } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/row.go b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/row.go index fff6d0673..622db46c1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/row.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/row.go @@ -26,8 +26,10 @@ import ( "github.com/golang/protobuf/proto" ) -var reflectStaticSizeTermFrequencyRow int -var reflectStaticSizeTermVector int +var ( + reflectStaticSizeTermFrequencyRow int + reflectStaticSizeTermVector int +) func init() { var tfr TermFrequencyRow @@ -322,7 +324,6 @@ func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) { return nil, err } return rv, nil - } func NewDictionaryRowK(key []byte) (*DictionaryRow, error) { @@ -642,7 +643,7 @@ func (tfr *TermFrequencyRow) parseV(value []byte, includeTermVectors bool) error } currOffset += bytesRead - var arrayPositionsLen uint64 = 0 + var arrayPositionsLen uint64 arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:]) if bytesRead <= 0 { return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen") @@ -682,7 +683,6 @@ func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) { return nil, err } return rv, nil - } type BackIndexRow struct { @@ -1029,7 +1029,7 @@ func visitBackIndexRow(data []byte, callback backIndexFieldTermVisitor) error { return io.ErrUnexpectedEOF } // don't track unrecognized data - //m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) + // m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) iNdEx += skippy } } @@ -1109,7 +1109,7 @@ func visitBackIndexRowFieldTerms(data []byte, callback backIndexFieldTermVisitor if postIndex > l { return io.ErrUnexpectedEOF } - //m.Terms = append(m.Terms, string(data[iNdEx:postIndex])) + // m.Terms = append(m.Terms, string(data[iNdEx:postIndex])) callback(theField, data[iNdEx:postIndex]) iNdEx = postIndex default: @@ -1132,7 +1132,7 @@ func visitBackIndexRowFieldTerms(data []byte, callback backIndexFieldTermVisitor if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } - //m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) + // m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) 
iNdEx += skippy
 		}
 	}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/boltdb/store.go b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/boltdb/store.go
index bc99275e1..2ebe9d2ba 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/boltdb/store.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/boltdb/store.go
@@ -177,5 +177,8 @@ func (bs *Store) Compact() error {
 }
 
 func init() {
-	registry.RegisterKVStore(Name, New)
+	err := registry.RegisterKVStore(Name, New)
+	if err != nil {
+		panic(err)
+	}
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/gtreap/store.go b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/gtreap/store.go
index 3cc7eb9ae..8050e4d91 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/gtreap/store.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/store/gtreap/store.go
@@ -26,7 +26,7 @@ import (
 	"github.com/blevesearch/bleve/v2/registry"
 	"github.com/blevesearch/gtreap"
 
-	"github.com/blevesearch/upsidedown_store_api"
+	store "github.com/blevesearch/upsidedown_store_api"
 )
 
 const Name = "gtreap"
@@ -78,5 +78,8 @@ func (s *Store) Writer() (store.KVWriter, error) {
 }
 
 func init() {
-	registry.RegisterKVStore(Name, New)
+	err := registry.RegisterKVStore(Name, New)
+	if err != nil {
+		panic(err)
+	}
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/upsidedown.go b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/upsidedown.go
index 3756422da..2400776d7 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/upsidedown.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/upsidedown.go
@@ -1042,7 +1042,10 @@ func (udc *UpsideDownCouch) fieldIndexOrNewRow(name string) (uint16, *FieldRow)
 }
 
 func init() {
-	registry.RegisterIndexType(Name, NewUpsideDownCouch)
+	err := registry.RegisterIndexType(Name, NewUpsideDownCouch)
+	if err != nil {
+		panic(err)
+	}
 }
 
 func backIndexRowForDoc(kvreader store.KVReader, docID index.IndexInternalID) (*BackIndexRow, error) {
diff --git a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
index 3c7cdcd32..a4f724e34 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go
@@ -16,6 +16,7 @@ package bleve
 
 import (
 	"context"
+	"fmt"
 	"sync"
 	"time"
 
@@ -31,6 +32,10 @@ type indexAliasImpl struct {
 	indexes []Index
 	mutex   sync.RWMutex
 	open    bool
+	// if all the indexes in the alias have the same mapping
+	// then the user can set the mapping here to avoid
+	// checking the mapping of each index in the alias
+	mapping mapping.IndexMapping
 }
 
 // NewIndexAlias creates a new IndexAlias over the provided
@@ -78,6 +83,25 @@ func (i *indexAliasImpl) Index(id string, data interface{}) error {
 	return i.indexes[0].Index(id, data)
 }
 
+func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+
+	err := i.isAliasToSingleIndex()
+	if err != nil {
+		return err
+	}
+
+	if si, ok := i.indexes[0].(SynonymIndex); ok {
+		return si.IndexSynonym(id, collection, definition)
+	}
+	return ErrorSynonymSearchNotSupported
+}
+
 func (i *indexAliasImpl) Delete(id string) error {
 	i.mutex.RLock()
 	defer i.mutex.RUnlock()
@@ -168,7 +192,13 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req
*SearchRequest // indicates that this index alias is set as an Index // in another alias, so we need to do a preSearch search // and NOT a real search - return preSearchDataSearch(ctx, req, i.indexes...) + bm25PreSearch := isBM25Enabled(i.mapping) + flags := &preSearchFlags{ + knn: requestHasKNN(req), + synonyms: !isMatchNoneQuery(req.Query), + bm25: bm25PreSearch, + } + return preSearchDataSearch(ctx, req, flags, i.indexes...) } // at this point we know we are doing a real search @@ -182,12 +212,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // if necessary var preSearchData map[string]map[string]interface{} if req.PreSearchData != nil { - if requestHasKNN(req) { - var err error - preSearchData, err = redistributeKNNPreSearchData(req, i.indexes) - if err != nil { - return nil, err - } + var err error + preSearchData, err = redistributePreSearchData(req, i.indexes) + if err != nil { + return nil, err } } @@ -208,12 +236,17 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // - the request requires preSearch var preSearchDuration time.Duration var sr *SearchResult - if req.PreSearchData == nil && preSearchRequired(req) { + flags, err := preSearchRequired(ctx, req, i.mapping) + if err != nil { + return nil, err + } + if req.PreSearchData == nil && flags != nil { searchStart := time.Now() - preSearchResult, err := preSearch(ctx, req, i.indexes...) + preSearchResult, err := preSearch(ctx, req, flags, i.indexes...) if err != nil { return nil, err } + // check if the preSearch result has any errors and if so // return the search result as is without executing the query // so that the errors are not lost @@ -221,17 +254,17 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest return preSearchResult, nil } // finalize the preSearch result now - finalizePreSearchResult(req, preSearchResult) + finalizePreSearchResult(req, flags, preSearchResult) // if there are no errors, then merge the data in the preSearch result // and construct the preSearchData to be used in the actual search // if the request is satisfied by the preSearch result, then we can // directly return the preSearch result as the final result - if requestSatisfiedByPreSearch(req) { + if requestSatisfiedByPreSearch(req, flags) { sr = finalizeSearchResult(req, preSearchResult) // no need to run the 2nd phase MultiSearch(..) } else { - preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes) + preSearchData, err = constructPreSearchData(req, flags, preSearchResult, i.indexes) if err != nil { return nil, err } @@ -352,6 +385,20 @@ func (i *indexAliasImpl) Close() error { return nil } +// SetIndexMapping sets the mapping for the alias and must be used +// ONLY when all the indexes in the alias have the same mapping. +// This is to avoid checking the mapping of each index in the alias +// when executing a search request. 
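
A hedged usage sketch for the SetIndexMapping method that follows, assuming the IndexAlias interface exposes it in this release; idxA and idxB are hypothetical indexes known to share the mapping m:

func newUniformAlias(m mapping.IndexMapping, idxA, idxB bleve.Index) (bleve.IndexAlias, error) {
	alias := bleve.NewIndexAlias(idxA, idxB)
	// safe only because both members are assumed to share the same mapping;
	// the call fails with ErrorIndexClosed once the alias has been closed
	if err := alias.SetIndexMapping(m); err != nil {
		return nil, err
	}
	return alias, nil // Mapping() and preSearch decisions now skip member lookups
}
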
+func (i *indexAliasImpl) SetIndexMapping(m mapping.IndexMapping) error {
+	i.mutex.Lock()
+	defer i.mutex.Unlock()
+	if !i.open {
+		return ErrorIndexClosed
+	}
+	i.mapping = m
+	return nil
+}
+
 func (i *indexAliasImpl) Mapping() mapping.IndexMapping {
 	i.mutex.RLock()
 	defer i.mutex.RUnlock()
@@ -360,6 +407,11 @@ func (i *indexAliasImpl) Mapping() mapping.IndexMapping {
 		return nil
 	}
 
+	// if the mapping is already set, return it
+	if i.mapping != nil {
+		return i.mapping
+	}
+
 	err := i.isAliasToSingleIndex()
 	if err != nil {
 		return nil
@@ -520,21 +572,82 @@ type asyncSearchResult struct {
 	Err error
 }
 
-func preSearchRequired(req *SearchRequest) bool {
-	return requestHasKNN(req)
+// preSearchFlags is a struct to hold flags indicating why preSearch is required
+type preSearchFlags struct {
+	knn      bool
+	synonyms bool
+	bm25     bool // global BM25 stats must be gathered in a preSearch phase
}
 
-func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
+func isBM25Enabled(m mapping.IndexMapping) bool {
+	var rv bool
+	if m, ok := m.(*mapping.IndexMappingImpl); ok {
+		rv = m.ScoringModel == index.BM25Scoring
+	}
+	return rv
+}
+
+// preSearchRequired checks whether a preSearch phase is needed and, if so,
+// returns the preSearchFlags indicating which kinds of preSearch to run
+func preSearchRequired(ctx context.Context, req *SearchRequest, m mapping.IndexMapping) (*preSearchFlags, error) {
+	// Check for KNN query
+	knn := requestHasKNN(req)
+	var synonyms bool
+	if !isMatchNoneQuery(req.Query) {
+		// Check if synonyms are defined in the mapping
+		if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 {
+			// check if any of the fields queried have a synonym source
+			// in the index mapping, to prevent unnecessary preSearch
+			fs, err := query.ExtractFields(req.Query, m, nil)
+			if err != nil {
+				return nil, err
+			}
+			for field := range fs {
+				if sm.SynonymSourceForPath(field) != "" {
+					synonyms = true
+					break
+				}
+			}
+		}
+	}
+	var bm25 bool
+	if !isMatchNoneQuery(req.Query) {
+		if ctx != nil {
+			if searchType := ctx.Value(search.SearchTypeKey); searchType != nil {
+				if searchType.(string) == search.GlobalScoring {
+					bm25 = isBM25Enabled(m)
+				}
+			}
+		}
+	}
+
+	if knn || synonyms || bm25 {
+		return &preSearchFlags{
+			knn:      knn,
+			synonyms: synonyms,
+			bm25:     bm25,
+		}, nil
+	}
+	return nil, nil
+}
+
+func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
 	// create a dummy request with a match none query
 	// since we only care about the preSearchData in PreSearch
+	dummyQuery := req.Query
+	if !flags.bm25 && !flags.synonyms {
+		// only the preSearchData is needed, so a match none query suffices
+		dummyQuery = query.NewMatchNoneQuery()
+	}
 	dummyRequest := &SearchRequest{
-		Query: query.NewMatchNoneQuery(),
+		Query: dummyQuery,
 	}
 	newCtx := context.WithValue(ctx, search.PreSearchKey, true)
-	if requestHasKNN(req) {
+	if flags.knn {
 		addKnnToDummyRequest(dummyRequest, req)
 	}
-	return preSearchDataSearch(newCtx, dummyRequest, indexes...)
+	return preSearchDataSearch(newCtx, dummyRequest, flags, indexes...)
} // if the request is satisfied by just the preSearch result, @@ -585,29 +698,67 @@ func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *Se return preSearchResult } -func requestSatisfiedByPreSearch(req *SearchRequest) bool { - if requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req) { +func requestSatisfiedByPreSearch(req *SearchRequest, flags *preSearchFlags) bool { + if flags == nil { + return false + } + // if the synonyms presearch flag is set the request can never be satisfied by + // the preSearch result as synonyms are not part of the preSearch result + if flags.synonyms { + return false + } + if flags.knn && isKNNrequestSatisfiedByPreSearch(req) { return true } return false } -func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { +func constructSynonymPreSearchData(rv map[string]map[string]interface{}, sr *SearchResult, indexes []Index) map[string]map[string]interface{} { + for _, index := range indexes { + rv[index.Name()][search.SynonymPreSearchDataKey] = sr.SynonymResult + } + return rv +} + +func constructBM25PreSearchData(rv map[string]map[string]interface{}, sr *SearchResult, indexes []Index) map[string]map[string]interface{} { + bmStats := sr.BM25Stats + if bmStats != nil { + for _, index := range indexes { + rv[index.Name()][search.BM25PreSearchDataKey] = &search.BM25Stats{ + DocCount: bmStats.DocCount, + FieldCardinality: bmStats.FieldCardinality, + } + } + } + return rv +} + +func constructPreSearchData(req *SearchRequest, flags *preSearchFlags, + preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { + if flags == nil || preSearchResult == nil { + return nil, fmt.Errorf("invalid input, flags: %v, preSearchResult: %v", flags, preSearchResult) + } mergedOut := make(map[string]map[string]interface{}, len(indexes)) for _, index := range indexes { mergedOut[index.Name()] = make(map[string]interface{}) } var err error - if requestHasKNN(req) { + if flags.knn { mergedOut, err = constructKnnPreSearchData(mergedOut, preSearchResult, indexes) if err != nil { return nil, err } } + if flags.synonyms { + mergedOut = constructSynonymPreSearchData(mergedOut, preSearchResult, indexes) + } + if flags.bm25 { + mergedOut = constructBM25PreSearchData(mergedOut, preSearchResult, indexes) + } return mergedOut, nil } -func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { +func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) { asyncResults := make(chan *asyncSearchResult, len(indexes)) // run search on each index in separate go routine var waitGroup sync.WaitGroup @@ -638,7 +789,7 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind if prp == nil { // first valid preSearch result // create a new preSearch result processor - prp = createPreSearchResultProcessor(req) + prp = createPreSearchResultProcessor(req, flags) } prp.add(asr.Result, asr.Name) if sr == nil { @@ -684,6 +835,56 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind return sr, nil } +// redistributePreSearchData redistributes the preSearchData sent in the search request to an index alias +// which would happen in the case of an alias tree and depending on the level of the tree, the preSearchData +// needs to be redistributed to the indexes at that level +func 
redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+	rv := make(map[string]map[string]interface{})
+	for _, index := range indexes {
+		rv[index.Name()] = make(map[string]interface{})
+	}
+	if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok {
+		// the preSearchData for KNN is a list of DocumentMatch objects
+		// that need to be redistributed to the right index.
+		// This is used only in the case of an alias tree, where the indexes
+		// are at the leaves of the tree, and the top-level alias is at the root.
+		// At each level of the tree, the preSearchData needs to be redistributed
+		// to the indexes/aliases at that level, because the preSearchData is
+		// specific to each final index at the leaf.
+		segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
+		if err != nil {
+			return nil, err
+		}
+		for _, index := range indexes {
+			rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()]
+		}
+	}
+	if fts, ok := req.PreSearchData[search.SynonymPreSearchDataKey].(search.FieldTermSynonymMap); ok {
+		for _, index := range indexes {
+			rv[index.Name()][search.SynonymPreSearchDataKey] = fts
+		}
+	}
+
+	if bm25Data, ok := req.PreSearchData[search.BM25PreSearchDataKey].(*search.BM25Stats); ok {
+		for _, index := range indexes {
+			rv[index.Name()][search.BM25PreSearchDataKey] = bm25Data
+		}
+	}
+	return rv, nil
+}
+
+// finalizePreSearchResult finalizes the preSearch result by applying the finalization steps
+// specific to the preSearch flags
+func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) {
+	// if flags is nil then return
+	if flags == nil {
+		return
+	}
+	if flags.knn {
+		preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
+	}
+}
+
 // hitsInCurrentPage returns the hits in the current page
 // using the From and Size parameters in the request
 func hitsInCurrentPage(req *SearchRequest, hits []*search.DocumentMatch) []*search.DocumentMatch {
@@ -856,3 +1057,7 @@ func (f *indexAliasImplFieldDict) Close() error {
 	defer f.index.mutex.RUnlock()
 	return f.fieldDict.Close()
 }
+
+func (f *indexAliasImplFieldDict) Cardinality() int {
+	return f.fieldDict.Cardinality()
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/index_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_impl.go
index e6debf17a..c04f20e5d 100644
--- a/vendor/github.com/blevesearch/bleve/v2/index_impl.go
+++ b/vendor/github.com/blevesearch/bleve/v2/index_impl.go
@@ -38,6 +38,7 @@ import (
 	"github.com/blevesearch/bleve/v2/search/collector"
 	"github.com/blevesearch/bleve/v2/search/facet"
 	"github.com/blevesearch/bleve/v2/search/highlight"
+	"github.com/blevesearch/bleve/v2/search/query"
 	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
 	"github.com/blevesearch/geo/s2"
@@ -267,6 +268,40 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) {
 	return
 }
 
+// IndexSynonym indexes a synonym definition with the specified id, associating it with the specified collection.
+// A synonym definition describes term relationships that are used for query expansion at search time.
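
A hedged usage sketch for the IndexSynonym method that follows; the id "syn-1" and the collection "en-thesaurus" are illustrative, and the collection must match a synonym source defined in the index mapping (see the synonym source changes later in this patch):

func indexQuickSynonyms(idx bleve.Index) error {
	def := &bleve.SynonymDefinition{
		Input:    []string{"quick"},
		Synonyms: []string{"fast", "speedy", "rapid"},
	}
	si, ok := idx.(bleve.SynonymIndex)
	if !ok {
		return bleve.ErrorSynonymSearchNotSupported // mapping has no synonym support
	}
	// "syn-1" is this definition's id; "en-thesaurus" must match a synonym
	// source's collection in the index mapping
	return si.IndexSynonym("syn-1", "en-thesaurus", def)
}
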
+func (i *indexImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	if id == "" {
+		return ErrorEmptyID
+	}
+
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+
+	i.FireIndexEvent()
+
+	synMap, ok := i.m.(mapping.SynonymMapping)
+	if !ok {
+		return ErrorSynonymSearchNotSupported
+	}
+
+	if err := definition.Validate(); err != nil {
+		return err
+	}
+
+	doc := document.NewSynonymDocument(id)
+	err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms)
+	if err != nil {
+		return err
+	}
+	err = i.i.Update(doc)
+	return err
+}
+
 // IndexAdvanced takes a document.Document object
 // skips the mapping and indexes it.
 func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) {
@@ -449,12 +484,51 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in
 		}
 	}
 
+	var fts search.FieldTermSynonymMap
+	var count uint64
+	var fieldCardinality map[string]int
+	if !isMatchNoneQuery(req.Query) {
+		if synMap, ok := i.m.(mapping.SynonymMapping); ok {
+			if synReader, ok := reader.(index.ThesaurusReader); ok {
+				fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+		if ok := isBM25Enabled(i.m); ok {
+			fieldCardinality = make(map[string]int)
+			count, err = reader.DocCount()
+			if err != nil {
+				return nil, err
+			}
+
+			fs := make(query.FieldSet)
+			fs, err := query.ExtractFields(req.Query, i.m, fs)
+			if err != nil {
+				return nil, err
+			}
+			for field := range fs {
+				dict, err := reader.FieldDict(field)
+				if err != nil {
+					return nil, err
+				}
+				fieldCardinality[field] = dict.Cardinality()
+			}
+		}
+	}
+
 	return &SearchResult{
 		Status: &SearchStatus{
 			Total:      1,
 			Successful: 1,
 		},
-		Hits: knnHits,
+		Hits:          knnHits,
+		SynonymResult: fts,
+		BM25Stats: &search.BM25Stats{
+			DocCount:         float64(count),
+			FieldCardinality: fieldCardinality,
+		},
 	}, nil
 }
 
@@ -505,8 +579,13 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
 	}
 
 	var knnHits []*search.DocumentMatch
+	var skipKNNCollector bool
+
+	var fts search.FieldTermSynonymMap
+	var skipSynonymCollector bool
+
+	var bm25Data *search.BM25Stats
 	var ok bool
-	var skipKnnCollector bool
 	if req.PreSearchData != nil {
 		for k, v := range req.PreSearchData {
 			switch k {
@@ -516,20 +595,68 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
 				if !ok {
 					return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch")
 				}
+				skipKNNCollector = true
+			}
+		case search.SynonymPreSearchDataKey:
+			if v != nil {
+				fts, ok = v.(search.FieldTermSynonymMap)
+				if !ok {
+					return nil, fmt.Errorf("synonym preSearchData must be of type search.FieldTermSynonymMap")
+				}
+				skipSynonymCollector = true
+			}
+		case search.BM25PreSearchDataKey:
+			if v != nil {
+				bm25Data, ok = v.(*search.BM25Stats)
+				if !ok {
+					return nil, fmt.Errorf("bm25 preSearchData must be of type *search.BM25Stats")
+				}
 			}
-				skipKnnCollector = true
 		}
 	}
 }
-	if !skipKnnCollector && requestHasKNN(req) {
+	if !skipKNNCollector && requestHasKNN(req) {
 		knnHits, err = i.runKnnCollector(ctx, req, indexReader, false)
 		if err != nil {
 			return nil, err
 		}
 	}
 
+	if !skipSynonymCollector {
+		if synMap, ok := i.m.(mapping.SynonymMapping); ok && synMap.SynonymCount() > 0 {
+			if synReader, ok := indexReader.(index.ThesaurusReader); ok {
+				fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+	}
+
setKnnHitsInCollector(knnHits, req, coll) + if fts != nil { + if is, ok := indexReader.(*scorch.IndexSnapshot); ok { + is.UpdateSynonymSearchCount(1) + } + ctx = context.WithValue(ctx, search.FieldTermSynonymMapKey, fts) + } + + scoringModelCallback := func() string { + if isBM25Enabled(i.m) { + return index.BM25Scoring + } + return index.DefaultScoringModel + } + ctx = context.WithValue(ctx, search.GetScoringModelCallbackKey, + search.GetScoringModelCallbackFn(scoringModelCallback)) + + // set the bm25 presearch data (stats important for consistent scoring) in + // the context object + if bm25Data != nil { + ctx = context.WithValue(ctx, search.BM25PreSearchDataKey, bm25Data) + } + // This callback and variable handles the tracking of bytes read // 1. as part of creation of tfr and its Next() calls which is // accounted by invoking this callback when the TFR is closed. @@ -786,6 +913,11 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, if err == nil { value = v } + case index.IPField: + ip, err := docF.IP() + if err == nil { + value = ip.String() + } } if value != nil { @@ -1032,6 +1164,10 @@ func (f *indexImplFieldDict) Close() error { return f.indexReader.Close() } +func (f *indexImplFieldDict) Cardinality() int { + return f.fieldDict.Cardinality() +} + // helper function to remove duplicate entries from slice of strings func deDuplicate(fields []string) []string { entries := make(map[string]struct{}) diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/analysis.go b/vendor/github.com/blevesearch/bleve/v2/mapping/analysis.go index 03e3cd01b..311e97232 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/analysis.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/analysis.go @@ -21,6 +21,7 @@ type customAnalysis struct { TokenFilters map[string]map[string]interface{} `json:"token_filters,omitempty"` Analyzers map[string]map[string]interface{} `json:"analyzers,omitempty"` DateTimeParsers map[string]map[string]interface{} `json:"date_time_parsers,omitempty"` + SynonymSources map[string]map[string]interface{} `json:"synonym_sources,omitempty"` } func (c *customAnalysis) registerAll(i *IndexMappingImpl) error { @@ -83,6 +84,12 @@ func (c *customAnalysis) registerAll(i *IndexMappingImpl) error { return err } } + for name, config := range c.SynonymSources { + _, err := i.cache.DefineSynonymSource(name, config) + if err != nil { + return err + } + } return nil } @@ -94,6 +101,7 @@ func newCustomAnalysis() *customAnalysis { TokenFilters: make(map[string]map[string]interface{}), Analyzers: make(map[string]map[string]interface{}), DateTimeParsers: make(map[string]map[string]interface{}), + SynonymSources: make(map[string]map[string]interface{}), } return &rv } diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go index 847326e41..e89e66979 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go @@ -40,11 +40,12 @@ import ( // are used. To disable this automatic handling, set // Dynamic to false. 
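
Before the DocumentMapping definition that follows: a hedged sketch of how the new synonym-source knobs compose, with "en-thesaurus" and "titles-thesaurus" as illustrative source names assumed to be registered on the index mapping. A field-level SynonymSource overrides the document-level DefaultSynonymSource, which in turn overrides the index-wide default:

dm := bleve.NewDocumentMapping()
dm.DefaultSynonymSource = "en-thesaurus" // inherited by fields without their own setting

desc := bleve.NewTextFieldMapping()
dm.AddFieldMappingsAt("description", desc) // no SynonymSource: falls back to "en-thesaurus"

title := bleve.NewTextFieldMapping()
title.SynonymSource = "titles-thesaurus" // field-level setting wins
dm.AddFieldMappingsAt("title", title)
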
type DocumentMapping struct {
-	Enabled         bool                        `json:"enabled"`
-	Dynamic         bool                        `json:"dynamic"`
-	Properties      map[string]*DocumentMapping `json:"properties,omitempty"`
-	Fields          []*FieldMapping             `json:"fields,omitempty"`
-	DefaultAnalyzer string                      `json:"default_analyzer,omitempty"`
+	Enabled              bool                        `json:"enabled"`
+	Dynamic              bool                        `json:"dynamic"`
+	Properties           map[string]*DocumentMapping `json:"properties,omitempty"`
+	Fields               []*FieldMapping             `json:"fields,omitempty"`
+	DefaultAnalyzer      string                      `json:"default_analyzer,omitempty"`
+	DefaultSynonymSource string                      `json:"default_synonym_source,omitempty"`
 
 	// StructTagKey overrides "json" when looking for field names in struct tags
 	StructTagKey string `json:"struct_tag_key,omitempty"`
@@ -59,6 +60,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
 			return err
 		}
 	}
+	if dm.DefaultSynonymSource != "" {
+		_, err := cache.SynonymSourceNamed(dm.DefaultSynonymSource)
+		if err != nil {
+			return err
+		}
+	}
 	for propertyName, property := range dm.Properties {
 		newParent := propertyName
 		if parentName != "" {
@@ -82,7 +89,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
 				return err
 			}
 		}
-
+		if field.SynonymSource != "" {
+			_, err = cache.SynonymSourceNamed(field.SynonymSource)
+			if err != nil {
+				return err
+			}
+		}
 		err := validateFieldMapping(field, parentName, fieldAliasCtx)
 		if err != nil {
 			return err
@@ -112,6 +124,17 @@ func (dm *DocumentMapping) analyzerNameForPath(path string) string {
 	return ""
 }
 
+// synonymSourceForPath attempts to first find the field
+// described by this path, then returns the synonym source
+// configured for that field
+func (dm *DocumentMapping) synonymSourceForPath(path string) string {
+	field := dm.fieldDescribedByPath(path)
+	if field != nil {
+		return field.SynonymSource
+	}
+	return ""
+}
+
 func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping {
 	pathElements := decodePath(path)
 	if len(pathElements) > 1 {
@@ -295,6 +318,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return err
 			}
+		case "default_synonym_source":
+			err := util.UnmarshalJSON(v, &dm.DefaultSynonymSource)
+			if err != nil {
+				return err
+			}
 		case "properties":
 			err := util.UnmarshalJSON(v, &dm.Properties)
 			if err != nil {
@@ -338,6 +366,22 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
 	return rv
 }
 
+func (dm *DocumentMapping) defaultSynonymSource(path []string) string {
+	current := dm
+	rv := current.DefaultSynonymSource
+	for _, pathElement := range path {
+		var ok bool
+		current, ok = current.Properties[pathElement]
+		if !ok {
+			break
+		}
+		if current.DefaultSynonymSource != "" {
+			rv = current.DefaultSynonymSource
+		}
+	}
+	return rv
+}
+
 func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
 	// allow default "json" tag to be overridden
 	structTagKey := dm.StructTagKey
diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
index 5c064fddd..cfb390b40 100644
--- a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
+++ b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go
@@ -74,12 +74,14 @@ type FieldMapping struct {
 	Dims int `json:"dims,omitempty"`
 
 	// Similarity is the similarity algorithm used for scoring
-	// vector fields.
-	// See: index.DefaultSimilarityMetric & index.SupportedSimilarityMetrics
+	// field's content while performing search on it.
+ // See: index.SimilarityModels Similarity string `json:"similarity,omitempty"` // Applicable to vector fields only - optimization string VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"` + + SynonymSource string `json:"synonym_source,omitempty"` } // NewTextFieldMapping returns a default field mapping for text @@ -460,17 +462,22 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { return err } case "dims": - err := json.Unmarshal(v, &fm.Dims) + err := util.UnmarshalJSON(v, &fm.Dims) if err != nil { return err } case "similarity": - err := json.Unmarshal(v, &fm.Similarity) + err := util.UnmarshalJSON(v, &fm.Similarity) if err != nil { return err } case "vector_index_optimized_for": - err := json.Unmarshal(v, &fm.VectorIndexOptimizedFor) + err := util.UnmarshalJSON(v, &fm.VectorIndexOptimizedFor) + if err != nil { + return err + } + case "synonym_source": + err := util.UnmarshalJSON(v, &fm.SynonymSource) if err != nil { return err } diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go index fe8c96713..a40feb470 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go @@ -49,6 +49,8 @@ type IndexMappingImpl struct { DefaultType string `json:"default_type"` DefaultAnalyzer string `json:"default_analyzer"` DefaultDateTimeParser string `json:"default_datetime_parser"` + DefaultSynonymSource string `json:"default_synonym_source,omitempty"` + ScoringModel string `json:"scoring_model,omitempty"` DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` @@ -145,6 +147,15 @@ func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[stri return nil } +func (im *IndexMappingImpl) AddSynonymSource(name string, config map[string]interface{}) error { + _, err := im.cache.DefineSynonymSource(name, config) + if err != nil { + return err + } + im.CustomAnalysis.SynonymSources[name] = config + return nil +} + // NewIndexMapping creates a new IndexMapping that will use all the default indexing rules func NewIndexMapping() *IndexMappingImpl { return &IndexMappingImpl{ @@ -174,7 +185,12 @@ func (im *IndexMappingImpl) Validate() error { if err != nil { return err } - + if im.DefaultSynonymSource != "" { + _, err = im.cache.SynonymSourceNamed(im.DefaultSynonymSource) + if err != nil { + return err + } + } fieldAliasCtx := make(map[string]*FieldMapping) err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { @@ -186,6 +202,11 @@ func (im *IndexMappingImpl) Validate() error { return err } } + + if _, ok := index.SupportedScoringModels[im.ScoringModel]; !ok && im.ScoringModel != "" { + return fmt.Errorf("unsupported scoring model: %s", im.ScoringModel) + } + return nil } @@ -253,6 +274,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "default_synonym_source": + err := util.UnmarshalJSON(v, &im.DefaultSynonymSource) + if err != nil { + return err + } case "default_field": err := util.UnmarshalJSON(v, &im.DefaultField) if err != nil { @@ -283,6 +309,12 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "scoring_model": + err := util.UnmarshalJSON(v, &im.ScoringModel) + if err != nil { + return err + } + default: invalidKeys = append(invalidKeys, k) } @@ -334,11 +366,30 @@ func (im *IndexMappingImpl) MapDocument(doc 
*document.Document, data interface{} field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, index.IndexField|index.IncludeTermVectors) doc.AddField(field) } + doc.SetIndexed() } return nil } +func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error { + // determine all the synonym sources with the given collection + // and create a synonym field for each + err := im.SynonymSourceVisitor(func(name string, item analysis.SynonymSource) error { + if item.Collection() == collection { + // create a new field with the name of the synonym source + analyzer := im.AnalyzerNamed(item.Analyzer()) + if analyzer == nil { + return fmt.Errorf("unknown analyzer named: %s", item.Analyzer()) + } + field := document.NewSynonymField(name, analyzer, input, synonyms) + doc.AddField(field) + } + return nil + }) + return err +} + type walkContext struct { doc *document.Document im *IndexMappingImpl @@ -457,3 +508,66 @@ func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping { func (im *IndexMappingImpl) DefaultSearchField() string { return im.DefaultField } + +func (im *IndexMappingImpl) SynonymSourceNamed(name string) analysis.SynonymSource { + syn, err := im.cache.SynonymSourceNamed(name) + if err != nil { + logger.Printf("error using synonym source named: %s", name) + return nil + } + return syn +} + +func (im *IndexMappingImpl) SynonymSourceForPath(path string) string { + // first we look for explicit mapping on the field + for _, docMapping := range im.TypeMapping { + synonymSource := docMapping.synonymSourceForPath(path) + if synonymSource != "" { + return synonymSource + } + } + + // now try the default mapping + pathMapping, _ := im.DefaultMapping.documentMappingForPath(path) + if pathMapping != nil { + if len(pathMapping.Fields) > 0 { + if pathMapping.Fields[0].SynonymSource != "" { + return pathMapping.Fields[0].SynonymSource + } + } + } + + // next we will try default synonym sources for the path + pathDecoded := decodePath(path) + for _, docMapping := range im.TypeMapping { + if docMapping.Enabled { + rv := docMapping.defaultSynonymSource(pathDecoded) + if rv != "" { + return rv + } + } + } + // now the default analyzer for the default mapping + if im.DefaultMapping.Enabled { + rv := im.DefaultMapping.defaultSynonymSource(pathDecoded) + if rv != "" { + return rv + } + } + + return im.DefaultSynonymSource +} + +// SynonymCount() returns the number of synonym sources defined in the mapping +func (im *IndexMappingImpl) SynonymCount() int { + return len(im.CustomAnalysis.SynonymSources) +} + +// SynonymSourceVisitor() allows a visitor to iterate over all synonym sources +func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error { + err := im.cache.SynonymSources.VisitSynonymSources(visitor) + if err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go index cbfc98faa..a6c1591b8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go @@ -58,3 +58,19 @@ type IndexMapping interface { FieldMappingForPath(path string) FieldMapping } + +// A SynonymMapping extends the IndexMapping interface to provide +// additional methods for working with synonyms. 
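
A hedged sketch of defining a synonym source through the AddSynonymSource method added above; the source name and collection are illustrative, and the config keys follow SynonymSourceConstructor later in this patch ("collection" groups synonym documents, "analyzer" normalizes their terms):

func newSynonymMapping() (*mapping.IndexMappingImpl, error) {
	im := bleve.NewIndexMapping()
	err := im.AddSynonymSource("en-thesaurus", map[string]interface{}{
		"collection": "en-collection", // synonym definitions indexed under this collection
		"analyzer":   "standard",      // used to analyze synonym terms
	})
	if err != nil {
		return nil, err // e.g. the named analyzer does not exist
	}
	im.DefaultSynonymSource = "en-thesaurus" // optional index-wide default
	return im, nil
}
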
+type SynonymMapping interface { + IndexMapping + + MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error + + SynonymSourceForPath(path string) string + + SynonymSourceNamed(name string) analysis.SynonymSource + + SynonymCount() int + + SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error +} diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go index dbfde1fb0..20cbac6a8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go @@ -204,7 +204,7 @@ func validateVectorFieldAlias(field *FieldMapping, parentName string, } if field.Similarity == "" { - field.Similarity = index.DefaultSimilarityMetric + field.Similarity = index.DefaultVectorSimilarityMetric } if field.VectorIndexOptimizedFor == "" { @@ -249,10 +249,10 @@ func validateVectorFieldAlias(field *FieldMapping, parentName string, MinVectorDims, MaxVectorDims) } - if _, ok := index.SupportedSimilarityMetrics[field.Similarity]; !ok { + if _, ok := index.SupportedVectorSimilarityMetrics[field.Similarity]; !ok { return fmt.Errorf("field: '%s', invalid similarity "+ "metric: '%s', valid metrics are: %+v", field.Name, field.Similarity, - reflect.ValueOf(index.SupportedSimilarityMetrics).MapKeys()) + reflect.ValueOf(index.SupportedVectorSimilarityMetrics).MapKeys()) } if fieldAliasCtx != nil { // writing to a nil map is unsafe diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/synonym.go b/vendor/github.com/blevesearch/bleve/v2/mapping/synonym.go new file mode 100644 index 000000000..198282194 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/synonym.go @@ -0,0 +1,71 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
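
The SynonymMapping interface above is consumed elsewhere in this patch by feature detection, so mappings without synonym support keep working unchanged. A minimal sketch of that pattern (the helper name is illustrative):

func synonymSourceFor(m mapping.IndexMapping, field string) string {
	// feature-detect synonym support via type assertion
	if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 {
		return sm.SynonymSourceForPath(field)
	}
	return "" // no synonym support, or none configured for this field
}
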
+
+package mapping
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+type SynonymSource struct {
+	CollectionName string `json:"collection"`
+	AnalyzerName   string `json:"analyzer"`
+}
+
+func NewSynonymSource(collection, analyzer string) *SynonymSource {
+	return &SynonymSource{
+		CollectionName: collection,
+		AnalyzerName:   analyzer,
+	}
+}
+
+func (s *SynonymSource) Collection() string {
+	return s.CollectionName
+}
+
+func (s *SynonymSource) Analyzer() string {
+	return s.AnalyzerName
+}
+
+func (s *SynonymSource) SetCollection(c string) {
+	s.CollectionName = c
+}
+
+func (s *SynonymSource) SetAnalyzer(a string) {
+	s.AnalyzerName = a
+}
+
+func SynonymSourceConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.SynonymSource, error) {
+	collection, ok := config["collection"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify collection")
+	}
+	analyzer, ok := config["analyzer"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify analyzer")
+	}
+	if _, err := cache.AnalyzerNamed(analyzer); err != nil {
+		return nil, fmt.Errorf("analyzer named '%s' not found", analyzer)
+	}
+	return NewSynonymSource(collection, analyzer), nil
+}
+
+func init() {
+	err := registry.RegisterSynonymSource(analysis.SynonymSourceType, SynonymSourceConstructor)
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/pre_search.go b/vendor/github.com/blevesearch/bleve/v2/pre_search.go
index c8c55bfbc..3dd7e0fe3 100644
--- a/vendor/github.com/blevesearch/bleve/v2/pre_search.go
+++ b/vendor/github.com/blevesearch/bleve/v2/pre_search.go
@@ -14,6 +14,10 @@
 
 package bleve
 
+import (
+	"github.com/blevesearch/bleve/v2/search"
+)
+
 // A preSearchResultProcessor processes the data in
 // the preSearch result from multiple
 // indexes in an alias and merges them together to
@@ -26,6 +30,8 @@ type preSearchResultProcessor interface {
 	finalize(*SearchResult)
 }
 
+// -----------------------------------------------------------------------------
+// KNN preSearchResultProcessor for handling KNN presearch results
 type knnPreSearchResultProcessor struct {
 	addFn      func(sr *SearchResult, indexName string)
 	finalizeFn func(sr *SearchResult)
@@ -44,16 +50,121 @@ func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) {
 }
 
 // -----------------------------------------------------------------------------
+// Synonym preSearchResultProcessor for handling Synonym presearch results
+type synonymPreSearchResultProcessor struct {
+	finalizedFts search.FieldTermSynonymMap
+}
 
-func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) {
-	if requestHasKNN(req) {
-		preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
+func newSynonymPreSearchResultProcessor() *synonymPreSearchResultProcessor {
+	return &synonymPreSearchResultProcessor{}
+}
+
+func (s *synonymPreSearchResultProcessor) add(sr *SearchResult, indexName string) {
+	// nothing to merge if this result carries no synonym data
+	if sr.SynonymResult == nil {
+		return
+	}
+
+	// merge with finalizedFts, or initialize it if this is the first result
+	if s.finalizedFts == nil {
+		s.finalizedFts = sr.SynonymResult
+	} else {
+		s.finalizedFts.MergeWith(sr.SynonymResult)
 	}
 }
 
-func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor {
-	if requestHasKNN(req) {
-		return newKnnPreSearchResultProcessor(req)
+func (s *synonymPreSearchResultProcessor) finalize(sr
*SearchResult) {
+	// Set the finalized synonym data to the PreSearchResults
+	if s.finalizedFts != nil {
+		sr.SynonymResult = s.finalizedFts
+	}
+}
+
+type bm25PreSearchResultProcessor struct {
+	docCount         float64 // bm25 specific stats
+	fieldCardinality map[string]int
+}
+
+func newBM25PreSearchResultProcessor() *bm25PreSearchResultProcessor {
+	return &bm25PreSearchResultProcessor{
+		fieldCardinality: make(map[string]int),
+	}
+}
+
+// TODO How will this work for queries other than term queries?
+func (b *bm25PreSearchResultProcessor) add(sr *SearchResult, indexName string) {
+	if sr.BM25Stats != nil {
+		b.docCount += sr.BM25Stats.DocCount
+		for field, cardinality := range sr.BM25Stats.FieldCardinality {
+			b.fieldCardinality[field] += cardinality
+		}
+	}
+}
+
+func (b *bm25PreSearchResultProcessor) finalize(sr *SearchResult) {
+	sr.BM25Stats = &search.BM25Stats{
+		DocCount:         b.docCount,
+		FieldCardinality: b.fieldCardinality,
+	}
+}
+
+// -----------------------------------------------------------------------------
+// Composite struct that can hold any number of preSearch result processors
+type compositePreSearchResultProcessor struct {
+	presearchResultProcessors []preSearchResultProcessor
+}
+
+// Implements the add method, which forwards to all the internal processors
+func (m *compositePreSearchResultProcessor) add(sr *SearchResult, indexName string) {
+	for _, p := range m.presearchResultProcessors {
+		p.add(sr, indexName)
+	}
+}
+
+// Implements the finalize method, which forwards to all the internal processors
+func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) {
+	for _, p := range m.presearchResultProcessors {
+		p.finalize(sr)
+	}
+}
+
+// -----------------------------------------------------------------------------
+// Function to create the appropriate preSearchResultProcessor(s)
+func createPreSearchResultProcessor(req *SearchRequest, flags *preSearchFlags) preSearchResultProcessor {
+	// return nil for invalid input
+	if flags == nil || req == nil {
+		return nil
+	}
+	var processors []preSearchResultProcessor
+	// Add the KNN processor if the request has KNN
+	if flags.knn {
+		if knnProcessor := newKnnPreSearchResultProcessor(req); knnProcessor != nil {
+			processors = append(processors, knnProcessor)
+		}
+	}
+	// Add the synonym processor if synonym expansion is required
+	if flags.synonyms {
+		if synonymProcessor := newSynonymPreSearchResultProcessor(); synonymProcessor != nil {
+			processors = append(processors, synonymProcessor)
+		}
+	}
+	if flags.bm25 {
+		if bm25Processor := newBM25PreSearchResultProcessor(); bm25Processor != nil {
+			processors = append(processors, bm25Processor)
+		}
+	}
+	// Return based on the number of processors, optimizing for the common case of 1 processor
+	// If there are no processors, return nil
+	switch len(processors) {
+	case 0:
+		return nil
+	case 1:
+		return processors[0]
+	default:
+		return &compositePreSearchResultProcessor{
+			presearchResultProcessors: processors,
+		}
 	}
-	return &knnPreSearchResultProcessor{} // equivalent to nil
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/analyzer.go b/vendor/github.com/blevesearch/bleve/v2/registry/analyzer.go
index f4753bc1c..af95b885d 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/analyzer.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/analyzer.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis"
 )
 
-func RegisterAnalyzer(name string, constructor AnalyzerConstructor) {
+func RegisterAnalyzer(name string, constructor
AnalyzerConstructor) error { _, exists := analyzers[name] if exists { - panic(fmt.Errorf("attempted to register duplicate analyzer named '%s'", name)) + return fmt.Errorf("attempted to register duplicate analyzer named '%s'", name) } analyzers[name] = constructor + return nil } type AnalyzerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Analyzer, error) diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/char_filter.go b/vendor/github.com/blevesearch/bleve/v2/registry/char_filter.go index aa400be68..e888dac4a 100644 --- a/vendor/github.com/blevesearch/bleve/v2/registry/char_filter.go +++ b/vendor/github.com/blevesearch/bleve/v2/registry/char_filter.go @@ -20,12 +20,13 @@ import ( "github.com/blevesearch/bleve/v2/analysis" ) -func RegisterCharFilter(name string, constructor CharFilterConstructor) { +func RegisterCharFilter(name string, constructor CharFilterConstructor) error { _, exists := charFilters[name] if exists { - panic(fmt.Errorf("attempted to register duplicate char filter named '%s'", name)) + return fmt.Errorf("attempted to register duplicate char filter named '%s'", name) } charFilters[name] = constructor + return nil } type CharFilterConstructor func(config map[string]interface{}, cache *Cache) (analysis.CharFilter, error) diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/datetime_parser.go b/vendor/github.com/blevesearch/bleve/v2/registry/datetime_parser.go index a2d8ac24a..ff9a80cb5 100644 --- a/vendor/github.com/blevesearch/bleve/v2/registry/datetime_parser.go +++ b/vendor/github.com/blevesearch/bleve/v2/registry/datetime_parser.go @@ -20,12 +20,13 @@ import ( "github.com/blevesearch/bleve/v2/analysis" ) -func RegisterDateTimeParser(name string, constructor DateTimeParserConstructor) { +func RegisterDateTimeParser(name string, constructor DateTimeParserConstructor) error { _, exists := dateTimeParsers[name] if exists { - panic(fmt.Errorf("attempted to register duplicate date time parser named '%s'", name)) + return fmt.Errorf("attempted to register duplicate date time parser named '%s'", name) } dateTimeParsers[name] = constructor + return nil } type DateTimeParserConstructor func(config map[string]interface{}, cache *Cache) (analysis.DateTimeParser, error) diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/fragment_formatter.go b/vendor/github.com/blevesearch/bleve/v2/registry/fragment_formatter.go index 6699f53ba..f32c5571d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/registry/fragment_formatter.go +++ b/vendor/github.com/blevesearch/bleve/v2/registry/fragment_formatter.go @@ -20,12 +20,13 @@ import ( "github.com/blevesearch/bleve/v2/search/highlight" ) -func RegisterFragmentFormatter(name string, constructor FragmentFormatterConstructor) { +func RegisterFragmentFormatter(name string, constructor FragmentFormatterConstructor) error { _, exists := fragmentFormatters[name] if exists { - panic(fmt.Errorf("attempted to register duplicate fragment formatter named '%s'", name)) + return fmt.Errorf("attempted to register duplicate fragment formatter named '%s'", name) } fragmentFormatters[name] = constructor + return nil } type FragmentFormatterConstructor func(config map[string]interface{}, cache *Cache) (highlight.FragmentFormatter, error) diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go b/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go index cd1e32d28..da2a7b5c1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go +++ 
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go b/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go
index cd1e32d28..da2a7b5c1 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/fragmenter.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/search/highlight"
 )
 
-func RegisterFragmenter(name string, constructor FragmenterConstructor) {
+func RegisterFragmenter(name string, constructor FragmenterConstructor) error {
 	_, exists := fragmenters[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate fragmenter named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate fragmenter named '%s'", name)
 	}
 	fragmenters[name] = constructor
+	return nil
 }
 
 type FragmenterConstructor func(config map[string]interface{}, cache *Cache) (highlight.Fragmenter, error)
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/highlighter.go b/vendor/github.com/blevesearch/bleve/v2/registry/highlighter.go
index 8eb210fb3..75de25482 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/highlighter.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/highlighter.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/search/highlight"
 )
 
-func RegisterHighlighter(name string, constructor HighlighterConstructor) {
+func RegisterHighlighter(name string, constructor HighlighterConstructor) error {
 	_, exists := highlighters[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate highlighter named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate highlighter named '%s'", name)
 	}
 	highlighters[name] = constructor
+	return nil
 }
 
 type HighlighterConstructor func(config map[string]interface{}, cache *Cache) (highlight.Highlighter, error)
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/index_type.go b/vendor/github.com/blevesearch/bleve/v2/registry/index_type.go
index 67938c4af..0c2c87f46 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/index_type.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/index_type.go
@@ -20,12 +20,13 @@ import (
 	index "github.com/blevesearch/bleve_index_api"
 )
 
-func RegisterIndexType(name string, constructor IndexTypeConstructor) {
+func RegisterIndexType(name string, constructor IndexTypeConstructor) error {
 	_, exists := indexTypes[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate index encoding named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate index encoding named '%s'", name)
 	}
 	indexTypes[name] = constructor
+	return nil
 }
 
 type IndexTypeConstructor func(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error)
@@ -38,7 +39,7 @@ func IndexTypeConstructorByName(name string) IndexTypeConstructor {
 func IndexTypesAndInstances() ([]string, []string) {
 	var types []string
 	var instances []string
-	for name := range stores {
+	for name := range indexTypes {
 		types = append(types, name)
 	}
 	return types, instances
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/registry.go b/vendor/github.com/blevesearch/bleve/v2/registry/registry.go
index 1954d0896..69ee8dd86 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/registry.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/registry.go
@@ -36,6 +36,7 @@ var tokenMaps = make(TokenMapRegistry, 0)
 var tokenFilters = make(TokenFilterRegistry, 0)
 var analyzers = make(AnalyzerRegistry, 0)
 var dateTimeParsers = make(DateTimeParserRegistry, 0)
+var synonymSources = make(SynonymSourceRegistry, 0)
 
 type Cache struct {
 	CharFilters *CharFilterCache
@@ -47,6 +48,7 @@ type Cache struct {
 	FragmentFormatters *FragmentFormatterCache
 	Fragmenters        *FragmenterCache
 	Highlighters       *HighlighterCache
+	SynonymSources     *SynonymSourceCache
 }
 
 func NewCache() *Cache {
@@ -60,6 +62,7 @@ func NewCache() *Cache {
 		FragmentFormatters: NewFragmentFormatterCache(),
 		Fragmenters:        NewFragmenterCache(),
 		Highlighters:       NewHighlighterCache(),
+		SynonymSources:     NewSynonymSourceCache(),
 	}
 }
 
@@ -147,6 +150,14 @@ func (c *Cache) DefineDateTimeParser(name string, config map[string]interface{})
 	return c.DateTimeParsers.DefineDateTimeParser(name, typ, config, c)
 }
 
+func (c *Cache) SynonymSourceNamed(name string) (analysis.SynonymSource, error) {
+	return c.SynonymSources.SynonymSourceNamed(name, c)
+}
+
+func (c *Cache) DefineSynonymSource(name string, config map[string]interface{}) (analysis.SynonymSource, error) {
+	return c.SynonymSources.DefineSynonymSource(name, analysis.SynonymSourceType, config, c)
+}
+
 func (c *Cache) FragmentFormatterNamed(name string) (highlight.FragmentFormatter, error) {
 	return c.FragmentFormatters.FragmentFormatterNamed(name, c)
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/store.go b/vendor/github.com/blevesearch/bleve/v2/registry/store.go
index 02ebd888c..56840836b 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/store.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/store.go
@@ -17,15 +17,16 @@ package registry
 import (
 	"fmt"
 
-	"github.com/blevesearch/upsidedown_store_api"
+	store "github.com/blevesearch/upsidedown_store_api"
 )
 
-func RegisterKVStore(name string, constructor KVStoreConstructor) {
+func RegisterKVStore(name string, constructor KVStoreConstructor) error {
 	_, exists := stores[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate store named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate store named '%s'", name)
 	}
 	stores[name] = constructor
+	return nil
 }
 
 // KVStoreConstructor is used to build a KVStore of a specific type when
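
A sketch of driving the new synonym-source methods added to registry.Cache above. The source name and config key are illustrative only, and a matching source type must already have been registered via RegisterSynonymSource for the definition to succeed:

    package main

    import (
        "fmt"

        "github.com/blevesearch/bleve/v2/registry"
    )

    func main() {
        c := registry.NewCache()
        // "collection" is a hypothetical config key for illustration.
        if _, err := c.DefineSynonymSource("my-source", map[string]interface{}{
            "collection": "en-syn",
        }); err != nil {
            fmt.Println("define:", err) // duplicate names report "already defined"
        }
        // Later lookups resolve the same cached instance.
        if _, err := c.SynonymSourceNamed("my-source"); err != nil {
            fmt.Println("lookup:", err)
        }
    }
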
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/synonym_source.go b/vendor/github.com/blevesearch/bleve/v2/registry/synonym_source.go
new file mode 100644
index 000000000..f1836f8ae
--- /dev/null
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/synonym_source.go
@@ -0,0 +1,86 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package registry
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+)
+
+func RegisterSynonymSource(typ string, constructor SynonymSourceConstructor) error {
+	_, exists := synonymSources[typ]
+	if exists {
+		return fmt.Errorf("attempted to register duplicate synonym source with type '%s'", typ)
+	}
+	synonymSources[typ] = constructor
+	return nil
+}
+
+type SynonymSourceCache struct {
+	*ConcurrentCache
+}
+
+func NewSynonymSourceCache() *SynonymSourceCache {
+	return &SynonymSourceCache{
+		NewConcurrentCache(),
+	}
+}
+
+type SynonymSourceConstructor func(config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error)
+type SynonymSourceRegistry map[string]SynonymSourceConstructor
+
+func SynonymSourceBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) {
+	cons, registered := synonymSources[name]
+	if !registered {
+		return nil, fmt.Errorf("no synonym source with name '%s' registered", name)
+	}
+	synonymSource, err := cons(config, cache)
+	if err != nil {
+		return nil, fmt.Errorf("error building synonym source: %v", err)
+	}
+	return synonymSource, nil
+}
+
+func (c *SynonymSourceCache) SynonymSourceNamed(name string, cache *Cache) (analysis.SynonymSource, error) {
+	item, err := c.ItemNamed(name, cache, SynonymSourceBuild)
+	if err != nil {
+		return nil, err
+	}
+	return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) DefineSynonymSource(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error) {
+	item, err := c.DefineItem(name, typ, config, cache, SynonymSourceBuild)
+	if err != nil {
+		if err == ErrAlreadyDefined {
+			return nil, fmt.Errorf("synonym source named '%s' already defined", name)
+		}
+		return nil, err
+	}
+	return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) VisitSynonymSources(visitor analysis.SynonymSourceVisitor) error {
+	c.mutex.RLock()
+	defer c.mutex.RUnlock()
+	for k, v := range c.data {
+		err := visitor(k, v.(analysis.SynonymSource))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/token_filter.go b/vendor/github.com/blevesearch/bleve/v2/registry/token_filter.go
index df39411ae..533a1030f 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/token_filter.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/token_filter.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis"
 )
 
-func RegisterTokenFilter(name string, constructor TokenFilterConstructor) {
+func RegisterTokenFilter(name string, constructor TokenFilterConstructor) error {
 	_, exists := tokenFilters[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate token filter named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate token filter named '%s'", name)
 	}
 	tokenFilters[name] = constructor
+	return nil
 }
 
 type TokenFilterConstructor func(config map[string]interface{}, cache *Cache) (analysis.TokenFilter, error)
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/token_maps.go b/vendor/github.com/blevesearch/bleve/v2/registry/token_maps.go
index 08c9956eb..7fd7886bf 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/token_maps.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/token_maps.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis"
 )
 
-func RegisterTokenMap(name string, constructor TokenMapConstructor) {
+func RegisterTokenMap(name string, constructor TokenMapConstructor) error {
 	_, exists := tokenMaps[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate token map named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate token map named '%s'", name)
 	}
 	tokenMaps[name] = constructor
+	return nil
 }
 
 type TokenMapConstructor func(config map[string]interface{}, cache *Cache) (analysis.TokenMap, error)
diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/tokenizer.go b/vendor/github.com/blevesearch/bleve/v2/registry/tokenizer.go
index eb954287c..81222b8f5 100644
--- a/vendor/github.com/blevesearch/bleve/v2/registry/tokenizer.go
+++ b/vendor/github.com/blevesearch/bleve/v2/registry/tokenizer.go
@@ -20,12 +20,13 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis"
 )
 
-func RegisterTokenizer(name string, constructor TokenizerConstructor) {
+func RegisterTokenizer(name string, constructor TokenizerConstructor) error {
 	_, exists := tokenizers[name]
 	if exists {
-		panic(fmt.Errorf("attempted to register duplicate tokenizer named '%s'", name))
+		return fmt.Errorf("attempted to register duplicate tokenizer named '%s'", name)
 	}
 	tokenizers[name] = constructor
+	return nil
 }
 
 type TokenizerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Tokenizer, error)
diff --git a/vendor/github.com/blevesearch/bleve/v2/search.go b/vendor/github.com/blevesearch/bleve/v2/search.go
index 7861d24b8..2c25e0551 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search.go
@@ -31,8 +31,10 @@ import (
 	"github.com/blevesearch/bleve/v2/util"
 )
 
-var reflectStaticSizeSearchResult int
-var reflectStaticSizeSearchStatus int
+var (
+	reflectStaticSizeSearchResult int
+	reflectStaticSizeSearchStatus int
+)
 
 func init() {
 	var sr SearchResult
@@ -444,6 +446,12 @@ type SearchResult struct {
 	MaxScore float64             `json:"max_score"`
 	Took     time.Duration       `json:"took"`
 	Facets   search.FacetResults `json:"facets"`
+	// special fields that are applicable only for search
+	// results that are obtained from a presearch
+	SynonymResult search.FieldTermSynonymMap `json:"synonym_result,omitempty"`
+
+	// The following fields are applicable to BM25 preSearch
+	BM25Stats *search.BM25Stats `json:"bm25_stats,omitempty"`
 }
 
 func (sr *SearchResult) Size() int {
@@ -491,7 +499,7 @@ func (sr *SearchResult) String() string {
 		rv = "No matches"
 	}
 	if len(sr.Facets) > 0 {
-		rv += fmt.Sprintf("Facets:\n")
+		rv += "Facets:\n"
 		for fn, f := range sr.Facets {
 			rv += fmt.Sprintf("%s(%d)\n", fn, f.Total)
 			for _, t := range f.Terms.Terms() {
@@ -589,3 +597,13 @@ func (r *SearchRequest) SortFunc() func(data sort.Interface) {
 
 	return sort.Sort
 }
+
+func isMatchNoneQuery(q query.Query) bool {
+	_, ok := q.(*query.MatchNoneQuery)
+	return ok
+}
+
+func isMatchAllQuery(q query.Query) bool {
+	_, ok := q.(*query.MatchAllQuery)
+	return ok
+}
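
A sketch of where the new presearch-only fields surface on a SearchResult; inspectPreSearch is a hypothetical helper, not part of the API:

    package main

    import (
        "fmt"

        "github.com/blevesearch/bleve/v2"
    )

    // inspectPreSearch reads the fields that are only populated on results
    // produced by a presearch phase.
    func inspectPreSearch(sr *bleve.SearchResult) {
        if sr.BM25Stats != nil {
            fmt.Printf("global docs: %.0f, fields with stats: %d\n",
                sr.BM25Stats.DocCount, len(sr.BM25Stats.FieldCardinality))
        }
        // FieldTermSynonymMap is field -> term -> synonyms
        for field, terms := range sr.SynonymResult {
            fmt.Printf("field %q has synonyms for %d terms\n", field, len(terms))
        }
    }

    func main() {
        inspectPreSearch(&bleve.SearchResult{})
    }
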
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/eligible.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/eligible.go
index 5590290b0..49e044812 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/collector/eligible.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/eligible.go
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//go:build vectors
+// +build vectors
+
 package collector
 
 import (
@@ -24,12 +27,10 @@ import (
 )
 
 type EligibleCollector struct {
-	size    int
-	total   uint64
-	took    time.Duration
-	results search.DocumentMatchCollection
-
-	ids []index.IndexInternalID
+	size             int
+	total            uint64
+	took             time.Duration
+	eligibleSelector index.EligibleDocumentSelector
 }
 
 func NewEligibleCollector(size int) *EligibleCollector {
@@ -38,28 +39,33 @@ func NewEligibleCollector(size int) *EligibleCollector {
 
 func newEligibleCollector(size int) *EligibleCollector {
 	// No sort order & skip always 0 since this is only to filter eligible docs.
-	ec := &EligibleCollector{size: size,
-		ids: make([]index.IndexInternalID, 0, size),
+	ec := &EligibleCollector{
+		size: size,
 	}
 	return ec
 }
 
-func makeEligibleDocumentMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHandler, error) {
+func makeEligibleDocumentMatchHandler(ctx *search.SearchContext, reader index.IndexReader) (search.DocumentMatchHandler, error) {
 	if ec, ok := ctx.Collector.(*EligibleCollector); ok {
-		return func(d *search.DocumentMatch) error {
-			if d == nil {
+		if vr, ok := reader.(index.VectorIndexReader); ok {
+			// create a new eligible document selector to add eligible document matches
+			ec.eligibleSelector = vr.NewEligibleDocumentSelector()
+			// return a document match handler that adds eligible document matches
+			// to the eligible document selector
+			return func(d *search.DocumentMatch) error {
+				if d == nil {
+					return nil
+				}
+				err := ec.eligibleSelector.AddEligibleDocumentMatch(d.IndexInternalID)
+				if err != nil {
+					return err
+				}
+				// recycle the DocumentMatch
+				ctx.DocumentMatchPool.Put(d)
 				return nil
-			}
-
-			copyOfID := make([]byte, len(d.IndexInternalID))
-			copy(copyOfID, d.IndexInternalID)
-			ec.ids = append(ec.ids, copyOfID)
-
-			// recycle the DocumentMatch
-			ctx.DocumentMatchPool.Put(d)
-
-			return nil
-		}, nil
+			}, nil
+		}
+		return nil, fmt.Errorf("reader is not a VectorIndexReader")
 	}
 
 	return nil, fmt.Errorf("eligiblity collector not available")
@@ -80,7 +86,7 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search
 		IndexReader: reader,
 	}
 
-	dmHandler, err := makeEligibleDocumentMatchHandler(searchContext)
+	dmHandler, err := makeEligibleDocumentMatchHandler(searchContext, reader)
 	if err != nil {
 		return err
 	}
@@ -126,12 +132,21 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search
 	return nil
 }
 
+// The eligible collector does not return any document matches and hence
+// this method is a no-op that returns nil, to conform to the
+// search.Collector interface.
 func (ec *EligibleCollector) Results() search.DocumentMatchCollection {
 	return nil
 }
 
-func (ec *EligibleCollector) IDs() []index.IndexInternalID {
-	return ec.ids
+// EligibleSelector returns the eligible document selector, which can be used
+// to retrieve the list of eligible documents from this collector.
+// If the collector has no results, it returns nil.
+func (ec *EligibleCollector) EligibleSelector() index.EligibleDocumentSelector {
+	if ec.total == 0 {
+		return nil
+	}
+	return ec.eligibleSelector
}
 
 func (ec *EligibleCollector) Total() uint64 {
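
Under the vectors build tag, the revised flow above replaces the collected ID slice with an index-provided selector. A hedged sketch of running a pre-filter through the collector; the searcher and reader are assumed to come from an open vector-capable index:

    package main

    import (
        "context"

        "github.com/blevesearch/bleve/v2/search"
        "github.com/blevesearch/bleve/v2/search/collector"
        index "github.com/blevesearch/bleve_index_api"
    )

    // eligibleDocs runs a pre-filter searcher through the EligibleCollector
    // and returns the selector the KNN search will consume.
    func eligibleDocs(ctx context.Context, filter search.Searcher, r index.IndexReader) (index.EligibleDocumentSelector, error) {
        coll := collector.NewEligibleCollector(100)
        if err := coll.Collect(ctx, filter, r); err != nil {
            return nil, err
        }
        return coll.EligibleSelector(), nil // nil when nothing matched
    }
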
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go
index fc338f54e..e3ea9d7d0 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go
@@ -20,6 +20,7 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/blevesearch/bleve/v2/numeric"
 	"github.com/blevesearch/bleve/v2/search"
 	"github.com/blevesearch/bleve/v2/size"
 	index "github.com/blevesearch/bleve_index_api"
@@ -500,7 +501,23 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
 		doc.Complete(nil)
 		return nil
 	})
+	if err != nil {
+		return err
+	}
 
+	// Decode geo sort keys back to their distance values
+	for i, so := range hc.sort {
+		if _, ok := so.(*search.SortGeoDistance); ok {
+			for _, dm := range hc.results {
+				// The string is an int64 bit representation of a float64 distance
+				distInt, err := numeric.PrefixCoded(dm.Sort[i]).Int64()
+				if err != nil {
+					return err
+				}
+				dm.Sort[i] = strconv.FormatFloat(numeric.Int64ToFloat64(distInt), 'f', -1, 64)
+			}
+		}
+	}
 	return err
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/explanation.go b/vendor/github.com/blevesearch/bleve/v2/search/explanation.go
index b1ac29aa8..924050016 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/explanation.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/explanation.go
@@ -30,9 +30,10 @@ func init() {
 }
 
 type Explanation struct {
-	Value    float64        `json:"value"`
-	Message  string         `json:"message"`
-	Children []*Explanation `json:"children,omitempty"`
+	Value        float64        `json:"value"`
+	Message      string         `json:"message"`
+	PartialMatch bool           `json:"partial_match,omitempty"`
+	Children     []*Explanation `json:"children,omitempty"`
 }
 
 func (expl *Explanation) String() string {
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go
index ff5167f21..9fe4cf4ca 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_datetime.go
@@ -24,8 +24,10 @@ import (
 	"github.com/blevesearch/bleve/v2/size"
 )
 
-var reflectStaticSizeDateTimeFacetBuilder int
-var reflectStaticSizedateTimeRange int
+var (
+	reflectStaticSizeDateTimeFacetBuilder int
+	reflectStaticSizedateTimeRange        int
+)
 
 func init() {
 	var dtfb DateTimeFacetBuilder
@@ -62,12 +64,12 @@ func (fb *DateTimeFacetBuilder) Size() int {
 	sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr +
 		len(fb.field)
 
-	for k, _ := range fb.termsCount {
+	for k := range fb.termsCount {
 		sizeInBytes += size.SizeOfString + len(k) +
 			size.SizeOfInt
 	}
 
-	for k, _ := range fb.ranges {
+	for k := range fb.ranges {
 		sizeInBytes += size.SizeOfString + len(k) +
 			size.SizeOfPtr + reflectStaticSizedateTimeRange
 	}
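
The TopN change above recovers a geo distance from its prefix-coded sort key. A round-trip sketch of that decode step, using an invented distance value:

    package main

    import (
        "fmt"
        "strconv"

        "github.com/blevesearch/bleve/v2/numeric"
    )

    func main() {
        dist := 1234.5 // metres; value invented for illustration
        // Encode the way the sort key carries it: float64 -> int64 bits -> prefix-coded bytes.
        key := numeric.MustNewPrefixCodedInt64(numeric.Float64ToInt64(dist), 0)
        // Decode it back, mirroring finalizeResults above.
        distInt, err := key.Int64()
        if err != nil {
            panic(err)
        }
        fmt.Println(strconv.FormatFloat(numeric.Int64ToFloat64(distInt), 'f', -1, 64))
    }
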
"github.com/blevesearch/bleve/v2/size" ) -var reflectStaticSizeNumericFacetBuilder int -var reflectStaticSizenumericRange int +var ( + reflectStaticSizeNumericFacetBuilder int + reflectStaticSizenumericRange int +) func init() { var nfb NumericFacetBuilder @@ -61,12 +63,12 @@ func (fb *NumericFacetBuilder) Size() int { sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + len(fb.field) - for k, _ := range fb.termsCount { + for k := range fb.termsCount { sizeInBytes += size.SizeOfString + len(k) + size.SizeOfInt } - for k, _ := range fb.ranges { + for k := range fb.ranges { sizeInBytes += size.SizeOfString + len(k) + size.SizeOfPtr + reflectStaticSizenumericRange } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_terms.go b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_terms.go index c5a1c8318..ad1825c83 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_terms.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/facet/facet_builder_terms.go @@ -50,7 +50,7 @@ func (fb *TermsFacetBuilder) Size() int { sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + len(fb.field) - for k, _ := range fb.termsCount { + for k := range fb.termsCount { sizeInBytes += size.SizeOfString + len(k) + size.SizeOfInt } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/format/html/html.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/format/html/html.go index a0658d9c7..92b6f612d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/highlight/format/html/html.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/highlight/format/html/html.go @@ -87,5 +87,8 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (highligh } func init() { - registry.RegisterFragmentFormatter(Name, Constructor) + err := registry.RegisterFragmentFormatter(Name, Constructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/fragmenter/simple/simple.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/fragmenter/simple/simple.go index 34e5c9597..1c34b010e 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/highlight/fragmenter/simple/simple.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/highlight/fragmenter/simple/simple.go @@ -149,5 +149,8 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (highligh } func init() { - registry.RegisterFragmenter(Name, Constructor) + err := registry.RegisterFragmenter(Name, Constructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/html/html.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/html/html.go index ceb686dce..02eca0a59 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/html/html.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/html/html.go @@ -46,5 +46,8 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (highligh } func init() { - registry.RegisterHighlighter(Name, Constructor) + err := registry.RegisterHighlighter(Name, Constructor) + if err != nil { + panic(err) + } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go index 19949687d..e898a1e61 100644 --- 
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go
index 19949687d..e898a1e61 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go
@@ -17,6 +17,7 @@ package simple
 import (
 	"container/heap"
 	"fmt"
+	index "github.com/blevesearch/bleve_index_api"
 
 	"github.com/blevesearch/bleve/v2/registry"
@@ -217,5 +218,8 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (highligh
 }
 
 func init() {
-	registry.RegisterHighlighter(Name, Constructor)
+	err := registry.RegisterHighlighter(Name, Constructor)
+	if err != nil {
+		panic(err)
+	}
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/levenshtein.go b/vendor/github.com/blevesearch/bleve/v2/search/levenshtein.go
index 687608d3f..dadab2521 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/levenshtein.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/levenshtein.go
@@ -68,6 +68,10 @@ func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool,
 	ld := int(math.Abs(float64(la - lb)))
 	if ld > max {
 		return max, true, d
+	} else if la == 0 || lb == 0 {
+		// if one of the two strings is empty, then ld is
+		// the length of the other string and as such is <= max
+		return ld, false, d
 	}
 
 	if cap(d) < la+1 {
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go
index 0565e18f7..a2043720a 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go
@@ -49,9 +49,7 @@ func (q *ConjunctionQuery) Boost() float64 {
 }
 
 func (q *ConjunctionQuery) AddQuery(aq ...Query) {
-	for _, aaq := range aq {
-		q.Conjuncts = append(q.Conjuncts, aaq)
-	}
+	q.Conjuncts = append(q.Conjuncts, aq...)
 }
 
 func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go
index b307865f3..da46478b3 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/disjunction.go
@@ -56,9 +56,7 @@ func (q *DisjunctionQuery) Boost() float64 {
 }
 
 func (q *DisjunctionQuery) AddQuery(aq ...Query) {
-	for _, aaq := range aq {
-		q.Disjuncts = append(q.Disjuncts, aaq)
-	}
+	q.Disjuncts = append(q.Disjuncts, aq...)
 }
 
 func (q *DisjunctionQuery) SetMin(m float64) {
@@ -66,7 +64,8 @@ func (q *DisjunctionQuery) SetMin(m float64) {
 }
 
 func (q *DisjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping,
-	options search.SearcherOptions) (search.Searcher, error) {
+	options search.SearcherOptions,
+) (search.Searcher, error) {
 	ss := make([]search.Searcher, 0, len(q.Disjuncts))
 	for _, disjunct := range q.Disjuncts {
 		sr, err := disjunct.Searcher(ctx, i, m, options)
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/fuzzy.go b/vendor/github.com/blevesearch/bleve/v2/search/query/fuzzy.go
index f24eb0c20..72d7c0ea6 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/fuzzy.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/fuzzy.go
@@ -20,6 +20,7 @@ import (
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search"
 	"github.com/blevesearch/bleve/v2/search/searcher"
+	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
 )
 
@@ -29,6 +30,7 @@ type FuzzyQuery struct {
 	Fuzziness int    `json:"fuzziness"`
 	FieldVal  string `json:"field,omitempty"`
 	BoostVal  *Boost `json:"boost,omitempty"`
+	autoFuzzy bool
 }
 
 // NewFuzzyQuery creates a new Query which finds
@@ -66,6 +68,10 @@ func (q *FuzzyQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
 
+func (q *FuzzyQuery) SetAutoFuzziness(a bool) {
+	q.autoFuzzy = a
+}
+
 func (q *FuzzyQuery) SetPrefix(p int) {
 	q.Prefix = p
 }
@@ -75,5 +81,54 @@
 	if q.FieldVal == "" {
 		field = m.DefaultSearchField()
 	}
+	if q.autoFuzzy {
+		return searcher.NewAutoFuzzySearcher(ctx, i, q.Term, q.Prefix, field, q.BoostVal.Value(), options)
+	}
 	return searcher.NewFuzzySearcher(ctx, i, q.Term, q.Prefix, q.Fuzziness, field, q.BoostVal.Value(), options)
 }
+
+func (q *FuzzyQuery) UnmarshalJSON(data []byte) error {
+	type Alias FuzzyQuery
+	aux := &struct {
+		Fuzziness interface{} `json:"fuzziness"`
+		*Alias
+	}{
+		Alias: (*Alias)(q),
+	}
+	if err := util.UnmarshalJSON(data, &aux); err != nil {
+		return err
+	}
+	switch v := aux.Fuzziness.(type) {
+	case float64:
+		q.Fuzziness = int(v)
+	case string:
+		if v == "auto" {
+			q.autoFuzzy = true
+		}
+	}
+	return nil
+}
+
+func (f *FuzzyQuery) MarshalJSON() ([]byte, error) {
+	var fuzzyValue interface{}
+	if f.autoFuzzy {
+		fuzzyValue = "auto"
+	} else {
+		fuzzyValue = f.Fuzziness
+	}
+	type fuzzyQuery struct {
+		Term      string      `json:"term"`
+		Prefix    int         `json:"prefix_length"`
+		Fuzziness interface{} `json:"fuzziness"`
+		FieldVal  string      `json:"field,omitempty"`
+		BoostVal  *Boost      `json:"boost,omitempty"`
+	}
+	aux := fuzzyQuery{
+		Term:      f.Term,
+		Prefix:    f.Prefix,
+		Fuzziness: fuzzyValue,
+		FieldVal:  f.FieldVal,
+		BoostVal:  f.BoostVal,
+	}
+	return util.MarshalJSON(aux)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/geo_boundingbox.go b/vendor/github.com/blevesearch/bleve/v2/search/query/geo_boundingbox.go
index feb45d314..1653e6ed1 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/geo_boundingbox.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/geo_boundingbox.go
@@ -85,6 +85,9 @@ func (q *GeoBoundingBoxQuery) Searcher(ctx context.Context, i index.IndexReader,
 }
 
 func (q *GeoBoundingBoxQuery) Validate() error {
+	if q.TopLeft[1] < q.BottomRight[1] {
+		return fmt.Errorf("geo bounding box top left should be higher than bottom right")
+	}
 	return nil
 }
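
A sketch of the new "auto" fuzziness round-trip on FuzzyQuery, grounded in the UnmarshalJSON/MarshalJSON pair above: the string form in JSON maps onto the unexported autoFuzzy flag, and marshalling re-emits it:

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/blevesearch/bleve/v2/search/query"
    )

    func main() {
        var q query.FuzzyQuery
        // "auto" selects the edit distance from the term length at search time.
        if err := json.Unmarshal([]byte(`{"term":"summer","fuzziness":"auto"}`), &q); err != nil {
            panic(err)
        }
        out, err := json.Marshal(&q)
        if err != nil {
            panic(err)
        }
        fmt.Println(string(out)) // fuzziness is written back as "auto"
    }
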
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/ip_range.go b/vendor/github.com/blevesearch/bleve/v2/search/query/ip_range.go
index ba46f0b25..6c447c2cd 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/ip_range.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/ip_range.go
@@ -26,7 +26,7 @@ import (
 )
 
 type IPRangeQuery struct {
-	CIDR     string `json:"cidr, omitempty"`
+	CIDR     string `json:"cidr,omitempty"`
 	FieldVal string `json:"field,omitempty"`
 	BoostVal *Boost `json:"boost,omitempty"`
 }
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
index 4d105d943..831d08f5b 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go
@@ -35,9 +35,11 @@ type KNNQuery struct {
 	BoostVal *Boost `json:"boost,omitempty"`
 
 	// see KNNRequest.Params for description
-	Params        json.RawMessage         `json:"params"`
-	FilterQuery   Query                   `json:"filter,omitempty"`
-	filterResults []index.IndexInternalID
+	Params      json.RawMessage `json:"params"`
+	FilterQuery Query           `json:"filter,omitempty"`
+	// eligibleSelector is used to restrict the KNN search to the
+	// documents deemed eligible by a pre-filter query.
+	eligibleSelector index.EligibleDocumentSelector
 }
 
 func NewKNNQuery(vector []float32) *KNNQuery {
@@ -69,12 +71,8 @@ func (q *KNNQuery) SetParams(params json.RawMessage) {
 	q.Params = params
 }
 
-func (q *KNNQuery) SetFilterQuery(f Query) {
-	q.FilterQuery = f
-}
-
-func (q *KNNQuery) SetFilterResults(results []index.IndexInternalID) {
-	q.filterResults = results
+func (q *KNNQuery) SetEligibleSelector(eligibleSelector index.EligibleDocumentSelector) {
+	q.eligibleSelector = eligibleSelector
 }
 
 func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
@@ -82,7 +80,7 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
 	fieldMapping := m.FieldMappingForPath(q.VectorField)
 	similarityMetric := fieldMapping.Similarity
 	if similarityMetric == "" {
-		similarityMetric = index.DefaultSimilarityMetric
+		similarityMetric = index.DefaultVectorSimilarityMetric
 	}
 	if q.K <= 0 || len(q.Vector) == 0 {
 		return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty")
@@ -94,5 +92,5 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
 	return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField,
 		q.Vector, q.K, q.BoostVal.Value(), similarityMetric, q.Params,
-		q.filterResults)
+		q.eligibleSelector)
 }
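
A sketch of wiring a pre-filter outcome into the KNN query under the revised API (vectors build tag): the query now carries an eligible-document selector rather than a slice of internal IDs. The selector is assumed to come from an EligibleCollector as sketched earlier, and the exported VectorField/K fields are assumed from their JSON tags:

    package main

    import (
        "github.com/blevesearch/bleve/v2/search/query"
        index "github.com/blevesearch/bleve_index_api"
    )

    // buildKNN attaches the pre-filter's selector to a KNN query.
    func buildKNN(selector index.EligibleDocumentSelector) *query.KNNQuery {
        knn := query.NewKNNQuery([]float32{0.12, 0.34, 0.56})
        knn.VectorField = "embedding" // exported fields assumed per their JSON tags
        knn.K = 10
        knn.SetEligibleSelector(selector)
        return knn
    }

    func main() {
        _ = buildKNN(nil) // nil selector means no pre-filter restriction
    }
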
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/match.go b/vendor/github.com/blevesearch/bleve/v2/search/query/match.go
index 074d11d34..ba84d9243 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/match.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/match.go
@@ -32,6 +32,7 @@ type MatchQuery struct {
 	Prefix    int                `json:"prefix_length"`
 	Fuzziness int                `json:"fuzziness"`
 	Operator  MatchQueryOperator `json:"operator,omitempty"`
+	autoFuzzy bool
 }
 
 type MatchQueryOperator int
@@ -107,6 +108,10 @@ func (q *MatchQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
 
+func (q *MatchQuery) SetAutoFuzziness(auto bool) {
+	q.autoFuzzy = auto
+}
+
 func (q *MatchQuery) SetPrefix(p int) {
 	q.Prefix = p
 }
@@ -138,10 +143,14 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin
 
 	if len(tokens) > 0 {
 		tqs := make([]Query, len(tokens))
-		if q.Fuzziness != 0 {
+		if q.Fuzziness != 0 || q.autoFuzzy {
 			for i, token := range tokens {
 				query := NewFuzzyQuery(string(token.Term))
-				query.SetFuzziness(q.Fuzziness)
+				if q.autoFuzzy {
+					query.SetAutoFuzziness(true)
+				} else {
+					query.SetFuzziness(q.Fuzziness)
+				}
 				query.SetPrefix(q.Prefix)
 				query.SetField(field)
 				query.SetBoost(q.BoostVal.Value())
@@ -175,3 +184,53 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin
 	noneQuery := NewMatchNoneQuery()
 	return noneQuery.Searcher(ctx, i, m, options)
 }
+
+func (q *MatchQuery) UnmarshalJSON(data []byte) error {
+	type Alias MatchQuery
+	aux := &struct {
+		Fuzziness interface{} `json:"fuzziness"`
+		*Alias
+	}{
+		Alias: (*Alias)(q),
+	}
+	if err := util.UnmarshalJSON(data, &aux); err != nil {
+		return err
+	}
+	switch v := aux.Fuzziness.(type) {
+	case float64:
+		q.Fuzziness = int(v)
+	case string:
+		if v == "auto" {
+			q.autoFuzzy = true
+		}
+	}
+	return nil
+}
+
+func (f *MatchQuery) MarshalJSON() ([]byte, error) {
+	var fuzzyValue interface{}
+	if f.autoFuzzy {
+		fuzzyValue = "auto"
+	} else {
+		fuzzyValue = f.Fuzziness
+	}
+	type match struct {
+		Match     string             `json:"match"`
+		FieldVal  string             `json:"field,omitempty"`
+		Analyzer  string             `json:"analyzer,omitempty"`
+		BoostVal  *Boost             `json:"boost,omitempty"`
+		Prefix    int                `json:"prefix_length"`
+		Fuzziness interface{}        `json:"fuzziness"`
+		Operator  MatchQueryOperator `json:"operator,omitempty"`
+	}
+	aux := match{
+		Match:     f.Match,
+		FieldVal:  f.FieldVal,
+		Analyzer:  f.Analyzer,
+		BoostVal:  f.BoostVal,
+		Prefix:    f.Prefix,
+		Fuzziness: fuzzyValue,
+		Operator:  f.Operator,
+	}
+	return util.MarshalJSON(aux)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/match_phrase.go b/vendor/github.com/blevesearch/bleve/v2/search/query/match_phrase.go
index 63a16a534..12a839657 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/match_phrase.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/match_phrase.go
@@ -21,6 +21,7 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search"
+	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
 )
 
@@ -30,6 +31,7 @@ type MatchPhraseQuery struct {
 	Analyzer  string `json:"analyzer,omitempty"`
 	BoostVal  *Boost `json:"boost,omitempty"`
 	Fuzziness int    `json:"fuzziness"`
+	autoFuzzy bool
 }
 
 // NewMatchPhraseQuery creates a new Query object
@@ -63,6 +65,10 @@ func (q *MatchPhraseQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
 
+func (q *MatchPhraseQuery) SetAutoFuzziness(auto bool) {
+	q.autoFuzzy = auto
+}
+
 func (q *MatchPhraseQuery) Field() string {
 	return q.FieldVal
 }
@@ -89,7 +95,11 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m
 		phrase := tokenStreamToPhrase(tokens)
 		phraseQuery := NewMultiPhraseQuery(phrase, field)
 		phraseQuery.SetBoost(q.BoostVal.Value())
-		phraseQuery.SetFuzziness(q.Fuzziness)
+		if q.autoFuzzy {
+			phraseQuery.SetAutoFuzziness(true)
+		} else {
+			phraseQuery.SetFuzziness(q.Fuzziness)
+		}
 		return phraseQuery.Searcher(ctx, i, m, options)
 	}
 	noneQuery := NewMatchNoneQuery()
@@ -118,3 +128,49 @@ func tokenStreamToPhrase(tokens analysis.TokenStream) [][]string {
 	}
 	return nil
 }
+
+func (q *MatchPhraseQuery) UnmarshalJSON(data []byte) error {
+	type Alias MatchPhraseQuery
+	aux := &struct {
+		Fuzziness interface{} `json:"fuzziness"`
+		*Alias
+	}{
+		Alias: (*Alias)(q),
+	}
+	if err := util.UnmarshalJSON(data, &aux); err != nil {
+		return err
+	}
+	switch v := aux.Fuzziness.(type) {
+	case float64:
+		q.Fuzziness = int(v)
+	case string:
+		if v == "auto" {
+			q.autoFuzzy = true
+		}
+	}
+	return nil
+}
+
+func (f *MatchPhraseQuery) MarshalJSON() ([]byte, error) {
+	var fuzzyValue interface{}
+	if f.autoFuzzy {
+		fuzzyValue = "auto"
+	} else {
+		fuzzyValue = f.Fuzziness
+	}
+	type matchPhrase struct {
+		MatchPhrase string      `json:"match_phrase"`
+		FieldVal    string      `json:"field,omitempty"`
+		Analyzer    string      `json:"analyzer,omitempty"`
+		BoostVal    *Boost      `json:"boost,omitempty"`
+		Fuzziness   interface{} `json:"fuzziness"`
+	}
+	aux := matchPhrase{
+		MatchPhrase: f.MatchPhrase,
+		FieldVal:    f.FieldVal,
+		Analyzer:    f.Analyzer,
+		BoostVal:    f.BoostVal,
+		Fuzziness:   fuzzyValue,
+	}
+	return util.MarshalJSON(aux)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/multi_phrase.go b/vendor/github.com/blevesearch/bleve/v2/search/query/multi_phrase.go
index d1144d908..aa2cc0450 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/multi_phrase.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/multi_phrase.go
@@ -27,9 +27,10 @@ import (
 
 type MultiPhraseQuery struct {
 	Terms     [][]string `json:"terms"`
-	Field     string     `json:"field,omitempty"`
+	FieldVal  string     `json:"field,omitempty"`
 	BoostVal  *Boost     `json:"boost,omitempty"`
 	Fuzziness int        `json:"fuzziness"`
+	autoFuzzy bool
 }
 
 // NewMultiPhraseQuery creates a new Query for finding
@@ -43,8 +44,8 @@ type MultiPhraseQuery struct {
 // IncludeTermVectors set to true.
 func NewMultiPhraseQuery(terms [][]string, field string) *MultiPhraseQuery {
 	return &MultiPhraseQuery{
-		Terms: terms,
-		Field: field,
+		Terms:    terms,
+		FieldVal: field,
 	}
 }
 
@@ -52,6 +53,10 @@ func (q *MultiPhraseQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
 
+func (q *MultiPhraseQuery) SetAutoFuzziness(auto bool) {
+	q.autoFuzzy = auto
+}
+
 func (q *MultiPhraseQuery) SetBoost(b float64) {
 	boost := Boost(b)
 	q.BoostVal = &boost
@@ -61,8 +66,16 @@ func (q *MultiPhraseQuery) Boost() float64 {
 	return q.BoostVal.Value()
 }
 
+func (q *MultiPhraseQuery) Field() string {
+	return q.FieldVal
+}
+
+func (q *MultiPhraseQuery) SetField(f string) {
+	q.FieldVal = f
+}
+
 func (q *MultiPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
-	return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
+	return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.autoFuzzy, q.FieldVal, q.BoostVal.Value(), options)
 }
 
 func (q *MultiPhraseQuery) Validate() error {
@@ -73,15 +86,45 @@ func (q *MultiPhraseQuery) Validate() error {
 }
 
 func (q *MultiPhraseQuery) UnmarshalJSON(data []byte) error {
-	type _mphraseQuery MultiPhraseQuery
-	tmp := _mphraseQuery{}
-	err := util.UnmarshalJSON(data, &tmp)
-	if err != nil {
+	type Alias MultiPhraseQuery
+	aux := &struct {
+		Fuzziness interface{} `json:"fuzziness"`
+		*Alias
+	}{
+		Alias: (*Alias)(q),
+	}
+	if err := util.UnmarshalJSON(data, &aux); err != nil {
 		return err
 	}
-	q.Terms = tmp.Terms
-	q.Field = tmp.Field
-	q.BoostVal = tmp.BoostVal
-	q.Fuzziness = tmp.Fuzziness
+	switch v := aux.Fuzziness.(type) {
+	case float64:
+		q.Fuzziness = int(v)
+	case string:
+		if v == "auto" {
+			q.autoFuzzy = true
+		}
+	}
 	return nil
 }
+
+func (f *MultiPhraseQuery) MarshalJSON() ([]byte, error) {
+	var fuzzyValue interface{}
+	if f.autoFuzzy {
+		fuzzyValue = "auto"
+	} else {
+		fuzzyValue = f.Fuzziness
+	}
+	type multiPhraseQuery struct {
+		Terms     [][]string  `json:"terms"`
+		FieldVal  string      `json:"field,omitempty"`
+		BoostVal  *Boost      `json:"boost,omitempty"`
+		Fuzziness interface{} `json:"fuzziness"`
+	}
+	aux := multiPhraseQuery{
+		Terms:     f.Terms,
+		FieldVal:  f.FieldVal,
+		BoostVal:  f.BoostVal,
+		Fuzziness: fuzzyValue,
+	}
+	return util.MarshalJSON(aux)
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/phrase.go b/vendor/github.com/blevesearch/bleve/v2/search/query/phrase.go
index 9092e72d0..96bc1b758 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/phrase.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/phrase.go
@@ -27,9 +27,10 @@ import (
 
 type PhraseQuery struct {
 	Terms     []string `json:"terms"`
-	Field     string   `json:"field,omitempty"`
+	FieldVal  string   `json:"field,omitempty"`
 	BoostVal  *Boost   `json:"boost,omitempty"`
 	Fuzziness int      `json:"fuzziness"`
+	autoFuzzy bool
 }
 
 // NewPhraseQuery creates a new Query for finding
@@ -40,8 +41,8 @@ type PhraseQuery struct {
 // IncludeTermVectors set to true.
 func NewPhraseQuery(terms []string, field string) *PhraseQuery {
 	return &PhraseQuery{
-		Terms: terms,
-		Field: field,
+		Terms:    terms,
+		FieldVal: field,
 	}
 }
 
@@ -54,12 +55,24 @@ func (q *PhraseQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
 
+func (q *PhraseQuery) SetAutoFuzziness(auto bool) {
+	q.autoFuzzy = auto
+}
+
 func (q *PhraseQuery) Boost() float64 {
 	return q.BoostVal.Value()
 }
 
+func (q *PhraseQuery) SetField(f string) {
+	q.FieldVal = f
+}
+
+func (q *PhraseQuery) Field() string {
+	return q.FieldVal
+}
+
 func (q *PhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
-	return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
+	return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.autoFuzzy, q.FieldVal, q.BoostVal.Value(), options)
 }
 
 func (q *PhraseQuery) Validate() error {
@@ -70,15 +83,45 @@ func (q *PhraseQuery) Validate() error {
 }
 
 func (q *PhraseQuery) UnmarshalJSON(data []byte) error {
-	type _phraseQuery PhraseQuery
-	tmp := _phraseQuery{}
-	err := util.UnmarshalJSON(data, &tmp)
-	if err != nil {
+	type Alias PhraseQuery
+	aux := &struct {
+		Fuzziness interface{} `json:"fuzziness"`
+		*Alias
+	}{
+		Alias: (*Alias)(q),
+	}
+	if err := util.UnmarshalJSON(data, &aux); err != nil {
 		return err
 	}
-	q.Terms = tmp.Terms
-	q.Field = tmp.Field
-	q.BoostVal = tmp.BoostVal
-	q.Fuzziness = tmp.Fuzziness
+	switch v := aux.Fuzziness.(type) {
+	case float64:
+		q.Fuzziness = int(v)
+	case string:
+		if v == "auto" {
+			q.autoFuzzy = true
+		}
+	}
 	return nil
 }
+
+func (f *PhraseQuery) MarshalJSON() ([]byte, error) {
+	var fuzzyValue interface{}
+	if f.autoFuzzy {
+		fuzzyValue = "auto"
+	} else {
+		fuzzyValue = f.Fuzziness
+	}
+	type phraseQuery struct {
+		Terms     []string    `json:"terms"`
+		FieldVal  string      `json:"field,omitempty"`
+		BoostVal  *Boost      `json:"boost,omitempty"`
+		Fuzziness interface{} `json:"fuzziness"`
+	}
+	aux := phraseQuery{
+		Terms:     f.Terms,
+		FieldVal:  f.FieldVal,
+		BoostVal:  f.BoostVal,
+		Fuzziness: fuzzyValue,
+	}
+	return util.MarshalJSON(aux)
+}
"github.com/blevesearch/bleve_index_api" ) @@ -90,6 +93,31 @@ func ParsePreSearchData(input []byte) (map[string]interface{}, error) { rv = make(map[string]interface{}) } rv[search.KnnPreSearchDataKey] = value + case search.SynonymPreSearchDataKey: + var value search.FieldTermSynonymMap + if v != nil { + err := util.UnmarshalJSON(v, &value) + if err != nil { + return nil, err + } + } + if rv == nil { + rv = make(map[string]interface{}) + } + rv[search.SynonymPreSearchDataKey] = value + case search.BM25PreSearchDataKey: + var value *search.BM25Stats + if v != nil { + err := util.UnmarshalJSON(v, &value) + if err != nil { + return nil, err + } + } + if rv == nil { + rv = make(map[string]interface{}) + } + rv[search.BM25PreSearchDataKey] = value + } } return rv, nil @@ -354,9 +382,7 @@ func ParseQuery(input []byte) (Query, error) { // reference queries from the input tree or new queries. func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { var expand func(query Query) (Query, error) - var expandSlice func(queries []Query) ([]Query, error) - - expandSlice = func(queries []Query) ([]Query, error) { + var expandSlice func(queries []Query) ([]Query, error) = func(queries []Query) ([]Query, error) { expanded := []Query{} for _, q := range queries { exp, err := expand(q) @@ -423,3 +449,335 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) { data, err := json.MarshalIndent(q, "", " ") return string(data), err } + +// FieldSet represents a set of queried fields. +type FieldSet map[string]struct{} + +// ExtractFields returns a set of fields referenced by the query. +// The returned set may be nil if the query does not explicitly reference any field +// and the DefaultSearchField is unset in the index mapping. +func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) { + if q == nil || m == nil { + return fs, nil + } + var err error + switch q := q.(type) { + case FieldableQuery: + f := q.Field() + if f == "" { + f = m.DefaultSearchField() + } + if f != "" { + if fs == nil { + fs = make(FieldSet) + } + fs[f] = struct{}{} + } + case *QueryStringQuery: + var expandedQuery Query + expandedQuery, err = expandQuery(m, q) + if err == nil { + fs, err = ExtractFields(expandedQuery, m, fs) + } + case *BooleanQuery: + for _, subq := range []Query{q.Must, q.Should, q.MustNot} { + fs, err = ExtractFields(subq, m, fs) + if err != nil { + break + } + } + case *ConjunctionQuery: + for _, subq := range q.Conjuncts { + fs, err = ExtractFields(subq, m, fs) + if err != nil { + break + } + } + case *DisjunctionQuery: + for _, subq := range q.Disjuncts { + fs, err = ExtractFields(subq, m, fs) + if err != nil { + break + } + } + } + return fs, err +} + +const ( + FuzzyMatchType = iota + RegexpMatchType + PrefixMatchType +) + +// ExtractSynonyms extracts synonyms from the query tree and returns a map of +// field-term pairs to their synonyms. The input query tree is traversed and +// for each term query, the synonyms are extracted from the synonym source +// associated with the field. The synonyms are then added to the provided map. +// The map is returned and may be nil if no synonyms were found. 
+
+// ExtractSynonyms extracts synonyms from the query tree and returns a map of
+// field-term pairs to their synonyms. The input query tree is traversed and
+// for each term query, the synonyms are extracted from the synonym source
+// associated with the field. The synonyms are then added to the provided map.
+// The map is returned and may be nil if no synonyms were found.
+func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.ThesaurusReader,
+	query Query, rv search.FieldTermSynonymMap,
+) (search.FieldTermSynonymMap, error) {
+	if r == nil || m == nil || query == nil {
+		return rv, nil
+	}
+	var err error
+	resolveFieldAndSource := func(field string) (string, string) {
+		if field == "" {
+			field = m.DefaultSearchField()
+		}
+		return field, m.SynonymSourceForPath(field)
+	}
+	handleAnalyzer := func(analyzerName, field string) (analysis.Analyzer, error) {
+		if analyzerName == "" {
+			analyzerName = m.AnalyzerNameForPath(field)
+		}
+		analyzer := m.AnalyzerNamed(analyzerName)
+		if analyzer == nil {
+			return nil, fmt.Errorf("no analyzer named '%s' registered", analyzerName)
+		}
+		return analyzer, nil
+	}
+	switch q := query.(type) {
+	case *BooleanQuery:
+		rv, err = ExtractSynonyms(ctx, m, r, q.Must, rv)
+		if err != nil {
+			return nil, err
+		}
+		rv, err = ExtractSynonyms(ctx, m, r, q.Should, rv)
+		if err != nil {
+			return nil, err
+		}
+		rv, err = ExtractSynonyms(ctx, m, r, q.MustNot, rv)
+		if err != nil {
+			return nil, err
+		}
+	case *ConjunctionQuery:
+		for _, child := range q.Conjuncts {
+			rv, err = ExtractSynonyms(ctx, m, r, child, rv)
+			if err != nil {
+				return nil, err
+			}
+		}
+	case *DisjunctionQuery:
+		for _, child := range q.Disjuncts {
+			rv, err = ExtractSynonyms(ctx, m, r, child, rv)
+			if err != nil {
+				return nil, err
+			}
+		}
+	case *FuzzyQuery:
+		field, source := resolveFieldAndSource(q.FieldVal)
+		if source != "" {
+			fuzziness := q.Fuzziness
+			if q.autoFuzzy {
+				fuzziness = searcher.GetAutoFuzziness(q.Term)
+			}
+			rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, q.Term, fuzziness, q.Prefix, r, rv)
+			if err != nil {
+				return nil, err
+			}
+		}
+	case *MatchQuery, *MatchPhraseQuery:
+		var analyzerName, matchString, fieldVal string
+		var fuzziness, prefix int
+		var autoFuzzy bool
+		if mq, ok := q.(*MatchQuery); ok {
+			analyzerName, fieldVal, matchString, fuzziness, prefix, autoFuzzy = mq.Analyzer, mq.FieldVal, mq.Match, mq.Fuzziness, mq.Prefix, mq.autoFuzzy
+		} else if mpq, ok := q.(*MatchPhraseQuery); ok {
+			analyzerName, fieldVal, matchString, fuzziness, autoFuzzy = mpq.Analyzer, mpq.FieldVal, mpq.MatchPhrase, mpq.Fuzziness, mpq.autoFuzzy
+		}
+		field, source := resolveFieldAndSource(fieldVal)
+		if source != "" {
+			analyzer, err := handleAnalyzer(analyzerName, field)
+			if err != nil {
+				return nil, err
+			}
+			tokens := analyzer.Analyze([]byte(matchString))
+			for _, token := range tokens {
+				if autoFuzzy {
+					fuzziness = searcher.GetAutoFuzziness(string(token.Term))
+				}
+				rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, string(token.Term), fuzziness, prefix, r, rv)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+	case *MultiPhraseQuery, *PhraseQuery:
+		var fieldVal string
+		var fuzziness int
+		var autoFuzzy bool
+		if mpq, ok := q.(*MultiPhraseQuery); ok {
+			fieldVal, fuzziness, autoFuzzy = mpq.FieldVal, mpq.Fuzziness, mpq.autoFuzzy
+		} else if pq, ok := q.(*PhraseQuery); ok {
+			fieldVal, fuzziness, autoFuzzy = pq.FieldVal, pq.Fuzziness, pq.autoFuzzy
+		}
+		field, source := resolveFieldAndSource(fieldVal)
+		if source != "" {
+			var terms []string
+			if mpq, ok := q.(*MultiPhraseQuery); ok {
+				for _, termGroup := range mpq.Terms {
+					terms = append(terms, termGroup...)
+ } + } else if pq, ok := q.(*PhraseQuery); ok { + terms = pq.Terms + } + for _, term := range terms { + if autoFuzzy { + fuzziness = searcher.GetAutoFuzziness(term) + } + rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, term, fuzziness, 0, r, rv) + if err != nil { + return nil, err + } + } + } + case *PrefixQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + rv, err = addSynonymsForTermWithMatchType(ctx, PrefixMatchType, source, field, q.Prefix, 0, 0, r, rv) + if err != nil { + return nil, err + } + } + case *QueryStringQuery: + expanded, err := expandQuery(m, q) + if err != nil { + return nil, err + } + rv, err = ExtractSynonyms(ctx, m, r, expanded, rv) + if err != nil { + return nil, err + } + case *TermQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + rv, err = addSynonymsForTerm(ctx, source, field, q.Term, r, rv) + if err != nil { + return nil, err + } + } + case *RegexpQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, strings.TrimPrefix(q.Regexp, "^"), 0, 0, r, rv) + if err != nil { + return nil, err + } + } + case *WildcardQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, wildcardRegexpReplacer.Replace(q.Wildcard), 0, 0, r, rv) + if err != nil { + return nil, err + } + } + } + return rv, nil +} + +// addFuzzySynonymsForTerm finds all terms that match the given term with the +// given fuzziness and adds their synonyms to the provided map. +func addSynonymsForTermWithMatchType(ctx context.Context, matchType int, src, field, term string, fuzziness, prefix int, + r index.ThesaurusReader, rv search.FieldTermSynonymMap, +) (search.FieldTermSynonymMap, error) { + // Determine the terms based on the match type (fuzzy, prefix, or regexp) + var thesKeys index.ThesaurusKeys + var err error + var terms []string + switch matchType { + case FuzzyMatchType: + // Ensure valid fuzziness + if fuzziness == 0 { + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) + if err != nil { + return nil, err + } + return rv, nil + } + if fuzziness > searcher.MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) + } + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + // Handle fuzzy match + prefixTerm := "" + for i, r := range term { + if i < prefix { + prefixTerm += string(r) + } else { + break + } + } + thesKeys, err = r.ThesaurusKeysFuzzy(src, term, fuzziness, prefixTerm) + case RegexpMatchType: + // Handle regexp match + thesKeys, err = r.ThesaurusKeysRegexp(src, term) + case PrefixMatchType: + // Handle prefix match + thesKeys, err = r.ThesaurusKeysPrefix(src, []byte(term)) + default: + return nil, fmt.Errorf("invalid match type: %d", matchType) + } + if err != nil { + return nil, err + } + defer func() { + if cerr := thesKeys.Close(); cerr != nil && err == nil { + err = cerr + } + }() + // Collect the matching terms + terms = []string{} + tfd, err := thesKeys.Next() + for err == nil && tfd != nil { + terms = append(terms, tfd.Term) + tfd, err = thesKeys.Next() + } + if err != nil { + return nil, err + } + for _, synTerm := range terms { + rv, err = addSynonymsForTerm(ctx, src, field, synTerm, r, rv) + if err != nil { + return nil, err + } + } + return rv, nil +} + +func addSynonymsForTerm(ctx context.Context, src, field, 
+func addSynonymsForTerm(ctx context.Context, src, field, term string,
+	r index.ThesaurusReader, rv search.FieldTermSynonymMap,
+) (search.FieldTermSynonymMap, error) {
+	termReader, err := r.ThesaurusTermReader(ctx, src, []byte(term))
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if cerr := termReader.Close(); cerr != nil && err == nil {
+			err = cerr
+		}
+	}()
+	var synonyms []string
+	synonym, err := termReader.Next()
+	for err == nil && synonym != "" {
+		synonyms = append(synonyms, synonym)
+		synonym, err = termReader.Next()
+	}
+	if err != nil {
+		return nil, err
+	}
+	if len(synonyms) > 0 {
+		if rv == nil {
+			rv = make(search.FieldTermSynonymMap)
+		}
+		if _, exists := rv[field]; !exists {
+			rv[field] = make(map[string][]string)
+		}
+		rv[field][term] = synonyms
+	}
+	return rv, nil
+}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/regexp.go b/vendor/github.com/blevesearch/bleve/v2/search/query/regexp.go
index 6b3da9554..189fd5f34 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/query/regexp.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/query/regexp.go
@@ -69,12 +69,9 @@ func (q *RegexpQuery) Searcher(ctx context.Context, i index.IndexReader, m mappi
 	// known to interfere with LiteralPrefix() the way ^ does
 	// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
 	actualRegexp := q.Regexp
-	if strings.HasPrefix(actualRegexp, "^") {
-		actualRegexp = actualRegexp[1:] // remove leading ^
-	}
+	actualRegexp = strings.TrimPrefix(actualRegexp, "^") // remove leading ^ if it exists
 
-	return searcher.NewRegexpStringSearcher(ctx, i, actualRegexp, field,
-		q.BoostVal.Value(), options)
+	return searcher.NewRegexpStringSearcher(ctx, i, actualRegexp, field, q.BoostVal.Value(), options)
 }
 
 func (q *RegexpQuery) Validate() error {
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go
index 10190bd85..c030b8564 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_constant.go
@@ -103,7 +103,7 @@ func (s *ConstantScorer) Score(ctx *search.SearchContext, id index.IndexInternal
 	if s.options.Explain {
 		scoreExplanation = &search.Explanation{
 			Value:   score,
-			Message: fmt.Sprintf("ConstantScore()"),
+			Message: "ConstantScore()",
 		}
 	}
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go
index fe319bbeb..b3e96ddc7 100644
--- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go
+++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_disjunction.go
@@ -69,7 +69,7 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
 		ce := make([]*search.Explanation, 2)
 		ce[0] = rawExpl
 		ce[1] = &search.Explanation{Value: coord, Message: fmt.Sprintf("coord(%d/%d)", countMatch, countTotal)}
-		newExpl = &search.Explanation{Value: newScore, Message: "product of:", Children: ce}
+		newExpl = &search.Explanation{Value: newScore, Message: "product of:", Children: ce, PartialMatch: countMatch != countTotal}
 	}
 
 	// reuse constituents[0] as the return value
b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go @@ -128,7 +128,7 @@ func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext, } func (sqs *KNNQueryScorer) Weight() float64 { - return sqs.queryBoost * sqs.queryBoost + return 1.0 } func (sqs *KNNQueryScorer) SetQueryNorm(qnorm float64) { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go index 7b60eda4e..f5f8ec935 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go @@ -35,8 +35,9 @@ type TermQueryScorer struct { queryTerm string queryField string queryBoost float64 - docTerm uint64 - docTotal uint64 + docTerm uint64 // number of documents containing the term + docTotal uint64 // total number of documents in the index + avgDocLength float64 idf float64 options search.SearcherOptions idfExplanation *search.Explanation @@ -61,19 +62,43 @@ func (s *TermQueryScorer) Size() int { return sizeInBytes } -func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { +func (s *TermQueryScorer) computeIDF(avgDocLength float64, docTotal, docTerm uint64) float64 { + var rv float64 + if avgDocLength > 0 { + // avgDocLength is set only for bm25 scoring + rv = math.Log(1 + (float64(docTotal)-float64(docTerm)+0.5)/ + (float64(docTerm)+0.5)) + } else { + rv = 1.0 + math.Log(float64(docTotal)/ + float64(docTerm+1.0)) + } + + return rv +} + +// queryTerm - the specific term being scored by this scorer object +// queryField - the field in which the term is being searched +// queryBoost - the boost value for the query term +// docTotal - total number of documents in the index +// docTerm - number of documents containing the term +// avgDocLength - average document length in the index +// options - search options such as explain scoring, include the location of the term etc. 
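+//
+// (For reference, computeIDF above evaluates, with N = docTotal and n = docTerm:
+// bm25, when avgDocLength > 0: idf = ln(1 + (N - n + 0.5)/(n + 0.5));
+// classic tf-idf otherwise: idf = 1 + ln(N/(n + 1)).)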
+func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, + docTerm uint64, avgDocLength float64, options search.SearcherOptions) *TermQueryScorer { + rv := TermQueryScorer{ queryTerm: string(queryTerm), queryField: queryField, queryBoost: queryBoost, docTerm: docTerm, docTotal: docTotal, - idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)), + avgDocLength: avgDocLength, options: options, queryWeight: 1.0, includeScore: options.Score != "none", } + rv.idf = rv.computeIDF(avgDocLength, docTotal, docTerm) if options.Explain { rv.idfExplanation = &search.Explanation{ Value: rv.idf, @@ -114,6 +139,63 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { } } +func (s *TermQueryScorer) docScore(tf, norm float64) (score float64, model string) { + if s.avgDocLength > 0 { + // bm25 scoring + // using the posting's norm value to recompute the field length for the doc num + fieldLength := 1 / (norm * norm) + + score = s.idf * (tf * search.BM25_k1) / + (tf + search.BM25_k1*(1-search.BM25_b+(search.BM25_b*fieldLength/s.avgDocLength))) + model = index.BM25Scoring + } else { + // tf-idf scoring by default + score = tf * norm * s.idf + model = index.DefaultScoringModel + } + return score, model +} + +func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFieldDoc) []*search.Explanation { + var rv []*search.Explanation + if s.avgDocLength > 0 { + fieldLength := 1 / (termMatch.Norm * termMatch.Norm) + fieldNormVal := 1 - search.BM25_b + (search.BM25_b * fieldLength / s.avgDocLength) + fieldNormalizeExplanation := &search.Explanation{ + Value: fieldNormVal, + Message: fmt.Sprintf("fieldNorm(field=%s), b=%f, fieldLength=%f, avgFieldLength=%f)", + s.queryField, search.BM25_b, fieldLength, s.avgDocLength), + } + + saturationExplanation := &search.Explanation{ + Value: search.BM25_k1 / (tf + search.BM25_k1*fieldNormVal), + Message: fmt.Sprintf("saturation(term:%s), k1=%f/(tf=%f + k1*fieldNorm=%f))", + termMatch.Term, search.BM25_k1, tf, fieldNormVal), + Children: []*search.Explanation{fieldNormalizeExplanation}, + } + + rv = make([]*search.Explanation, 3) + rv[0] = &search.Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), + } + rv[1] = saturationExplanation + rv[2] = s.idfExplanation + } else { + rv = make([]*search.Explanation, 3) + rv[0] = &search.Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), + } + rv[1] = &search.Explanation{ + Value: termMatch.Norm, + Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID), + } + rv[2] = s.idfExplanation + } + return rv +} + func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch { rv := ctx.DocumentMatchPool.Get() // perform any score computations only when needed @@ -125,22 +207,14 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term } else { tf = math.Sqrt(float64(termMatch.Freq)) } - score := tf * termMatch.Norm * s.idf + score, scoringModel := s.docScore(tf, termMatch.Norm) if s.options.Explain { - childrenExplanations := make([]*search.Explanation, 3) - childrenExplanations[0] = &search.Explanation{ - Value: tf, - Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), - } - childrenExplanations[1] = &search.Explanation{ - Value: termMatch.Norm, - Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, 
termMatch.ID), - } - childrenExplanations[2] = s.idfExplanation + childrenExplanations := s.scoreExplanation(tf, termMatch) scoreExplanation = &search.Explanation{ - Value: score, - Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), + Value: score, + Message: fmt.Sprintf("fieldWeight(%s:%s in %s), as per %s model, "+ + "product of:", s.queryField, s.queryTerm, termMatch.ID, scoringModel), Children: childrenExplanations, } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/search.go b/vendor/github.com/blevesearch/bleve/v2/search/search.go index 8cc5115dc..5c930bce2 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/search.go @@ -23,9 +23,11 @@ import ( index "github.com/blevesearch/bleve_index_api" ) -var reflectStaticSizeDocumentMatch int -var reflectStaticSizeSearchContext int -var reflectStaticSizeLocation int +var ( + reflectStaticSizeDocumentMatch int + reflectStaticSizeSearchContext int + reflectStaticSizeLocation int +) func init() { var dm DocumentMatch @@ -167,13 +169,6 @@ type DocumentMatch struct { // results are completed FieldTermLocations []FieldTermLocation `json:"-"` - // used to indicate if this match is a partial match - // in the case of a disjunction search - // this means that the match is partial because - // not all sub-queries matched - // if false, all the sub-queries matched - PartialMatch bool `json:"partial_match,omitempty"` - // used to indicate the sub-scores that combined to form the // final score for this document match. This is only populated // when the search request's query is a DisjunctionQuery @@ -268,7 +263,7 @@ func (dm *DocumentMatch) Size() int { sizeInBytes += size.SizeOfString + len(entry) } - for k, _ := range dm.Fields { + for k := range dm.Fields { sizeInBytes += size.SizeOfString + len(k) + size.SizeOfPtr } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go index 25e661075..57d8855ee 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction.go @@ -47,12 +47,11 @@ type ConjunctionSearcher struct { func NewConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) ( - search.Searcher, error) { + search.Searcher, error, +) { // build the sorted downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) - for i, searcher := range qsearchers { - searchers[i] = searcher - } + copy(searchers, qsearchers) sort.Sort(searchers) // attempt the "unadorned" conjunction optimization only when we diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go index d165ec027..434c705e7 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction.go @@ -114,7 +114,7 @@ func optimizeCompositeSearcher(ctx context.Context, optimizationKind string, return nil, nil } - return newTermSearcherFromReader(indexReader, tfr, + return newTermSearcherFromReader(ctx, indexReader, tfr, []byte(optimizationKind), "*", 1.0, options) } diff --git 
a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go index 89bcd498f..3da876bd3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go @@ -218,9 +218,7 @@ func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, nil, s.numSearchers) } else { // score this match - partialMatch := len(s.matching) != len(s.searchers) rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - rv.PartialMatch = partialMatch } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go index 81b00cc22..6a92ffa09 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_slice.go @@ -52,7 +52,8 @@ type DisjunctionSliceSearcher struct { func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) ( - *DisjunctionSliceSearcher, error) { + *DisjunctionSliceSearcher, error, +) { if limit && tooManyClauses(len(qsearchers)) { return nil, tooManyClausesErr("", len(qsearchers)) } @@ -79,9 +80,7 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexRea originalPos = sortedSearchers.index } else { searchers = make(OrderedSearcherList, len(qsearchers)) - for i, searcher := range qsearchers { - searchers[i] = searcher - } + copy(searchers, qsearchers) sort.Sort(searchers) } @@ -210,7 +209,8 @@ func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { } func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( - *search.DocumentMatch, error) { + *search.DocumentMatch, error, +) { if !s.initialized { err := s.initSearchers(ctx) if err != nil { @@ -230,9 +230,7 @@ func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, s.originalPos, s.numSearchers) } else { // score this match - partialMatch := len(s.matching) != len(s.searchers) rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - rv.PartialMatch = partialMatch } } @@ -257,7 +255,8 @@ func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( } func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext, - ID index.IndexInternalID) (*search.DocumentMatch, error) { + ID index.IndexInternalID, +) (*search.DocumentMatch, error) { if !s.initialized { err := s.initSearchers(ctx) if err != nil { @@ -322,7 +321,8 @@ func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { // but only activates on an edge case where the disjunction is a // wrapper around a single Optimizable child searcher func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( - index.OptimizableContext, error) { + index.OptimizableContext, error, +) { if len(s.searchers) == 1 { o, ok := s.searchers[0].(index.Optimizable) if ok { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_fuzzy.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_fuzzy.go index 1957168bb..187486efc 100644 --- 
a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_fuzzy.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_fuzzy.go @@ -17,6 +17,7 @@ package searcher import ( "context" "fmt" + "strings" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" @@ -24,6 +25,19 @@ import ( var MaxFuzziness = 2 +// AutoFuzzinessHighThreshold is the threshold for the term length +// above which the fuzziness is set to MaxFuzziness when the fuzziness +// mode is set to AutoFuzziness. +var AutoFuzzinessHighThreshold = 5 + +// AutoFuzzinessLowThreshold is the threshold for the term length +// below which the fuzziness is set to zero when the fuzziness mode +// is set to AutoFuzziness. +// For terms with length between AutoFuzzinessLowThreshold and +// AutoFuzzinessHighThreshold, the fuzziness is set to +// MaxFuzziness - 1. +var AutoFuzzinessLowThreshold = 2 + func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string, prefix, fuzziness int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { @@ -35,6 +49,21 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s if fuzziness < 0 { return nil, fmt.Errorf("invalid fuzziness, negative") } + if fuzziness == 0 { + // no fuzziness, just do a term search + // check if the call is made from a phrase searcher + // and if so, add the term to the fuzzy term matches + // since the fuzzy candidate terms are not collected + // for a term search, and the only candidate term is + // the term itself + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } + } + return NewTermSearcher(ctx, indexReader, term, field, boost, options) + } // Note: we don't byte slice the term for a prefix because of runes. 
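// (In the prefix loop below, i is a byte offset in range-over-string, so the
// i < prefix comparison caps the prefix length in bytes while appending whole
// runes, never splitting a multi-byte UTF-8 sequence.)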
prefixTerm := "" @@ -45,16 +74,18 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s break } } - fuzzyCandidates, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, + fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness, field, prefixTerm) if err != nil { return nil, err } var candidates []string + var editDistances []uint8 var dictBytesRead uint64 if fuzzyCandidates != nil { candidates = fuzzyCandidates.candidates + editDistances = fuzzyCandidates.editDistances dictBytesRead = fuzzyCandidates.bytesRead } @@ -66,14 +97,40 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s fuzzyTermMatches.(map[string][]string)[term] = candidates } } + // check if the candidates are empty or have one term which is the term itself + if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) { + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } + } + return NewTermSearcher(ctx, indexReader, term, field, boost, options) + } - return NewMultiTermSearcher(ctx, indexReader, candidates, field, - boost, options, true) + return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field, + boost, editDistances, options, true) +} + +func GetAutoFuzziness(term string) int { + termLength := len(term) + if termLength > AutoFuzzinessHighThreshold { + return MaxFuzziness + } else if termLength > AutoFuzzinessLowThreshold { + return MaxFuzziness - 1 + } + return 0 +} + +func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string, + prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options) } type fuzzyCandidates struct { - candidates []string - bytesRead uint64 + candidates []string + editDistances []uint8 + bytesRead uint64 } func reportIOStats(ctx context.Context, bytesRead uint64) { @@ -88,17 +145,30 @@ func reportIOStats(ctx context.Context, bytesRead uint64) { } } -func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, +func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) { rv = &fuzzyCandidates{ - candidates: make([]string, 0), + candidates: make([]string, 0), + editDistances: make([]uint8, 0), } // in case of advanced reader implementations directly call // the levenshtein automaton based iterator to collect the // candidate terms if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { - fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + termSet := make(map[string]struct{}) + addCandidateTerm := func(term string, editDistance uint8) error { + if _, exists := termSet[term]; !exists { + termSet[term] = struct{}{} + rv.candidates = append(rv.candidates, term) + rv.editDistances = append(rv.editDistances, editDistance) + if tooManyClauses(len(rv.candidates)) { + return tooManyClausesErr(field, len(rv.candidates)) + } + } + return nil + } + fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm) if err != nil { return nil, err } @@ -109,15 +179,38 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, }() tfd, err := fieldDict.Next() for err == nil && tfd != nil { - rv.candidates = 
append(rv.candidates, tfd.Term) - if tooManyClauses(len(rv.candidates)) { - return nil, tooManyClausesErr(field, len(rv.candidates)) + err = addCandidateTerm(tfd.Term, tfd.EditDistance) + if err != nil { + return nil, err } tfd, err = fieldDict.Next() } - + if err != nil { + return nil, err + } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if !strings.HasPrefix(term, prefixTerm) { + continue + } + match, editDistance := a.MatchAndDistance(term) + if match { + err = addCandidateTerm(term, editDistance) + if err != nil { + return nil, err + } + } + } + } + } + } rv.bytesRead = fieldDict.BytesRead() - return rv, err + return rv, nil } var fieldDict index.FieldDict @@ -144,6 +237,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse) if !exceeded && ld <= fuzziness { rv.candidates = append(rv.candidates, tfd.Term) + rv.editDistances = append(rv.editDistances, uint8(ld)) if tooManyClauses(len(rv.candidates)) { return nil, tooManyClausesErr(field, len(rv.candidates)) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go index c889ddce0..f9dcf16ad 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go @@ -26,13 +26,16 @@ import ( type filterFunc func(key []byte) bool -var GeoBitsShift1 = geo.GeoBits << 1 -var GeoBitsShift1Minus1 = GeoBitsShift1 - 1 +var ( + GeoBitsShift1 = geo.GeoBits << 1 + GeoBitsShift1Minus1 = GeoBitsShift1 - 1 +) func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReader, minLon, minLat, maxLon, maxLat float64, field string, boost float64, options search.SearcherOptions, checkBoundaries bool) ( - search.Searcher, error) { + search.Searcher, error, +) { if tp, ok := indexReader.(index.SpatialIndexPlugin); ok { sp, err := tp.GetSpatialAnalyzerPlugin("s2") if err == nil { @@ -65,7 +68,7 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade } // do math to produce list of terms needed for this search - onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(nil, 0, GeoBitsShift1Minus1, + onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(context.TODO(), 0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries, indexReader, field) if err != nil { return nil, err @@ -122,16 +125,18 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade return NewMatchNoneSearcher(indexReader) } -var geoMaxShift = document.GeoPrecisionStep * 4 -var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 +var ( + geoMaxShift = document.GeoPrecisionStep * 4 + geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 +) type closeFunc func() error func ComputeGeoRange(ctx context.Context, term uint64, shift uint, sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, indexReader index.IndexReader, field string) ( - onBoundary [][]byte, notOnBoundary [][]byte, err error) { - + onBoundary [][]byte, notOnBoundary [][]byte, err error, +) { isIndexed, closeF, err := buildIsIndexedFunc(ctx, indexReader, field) if closeF != nil { defer func() { @@ 
-192,7 +197,6 @@ func buildIsIndexedFunc(ctx context.Context, indexReader index.IndexReader, fiel _ = reader.Close() return true } - } else { isIndexed = func([]byte) bool { return true @@ -202,7 +206,8 @@ func buildIsIndexedFunc(ctx context.Context, indexReader index.IndexReader, fiel } func buildRectFilter(ctx context.Context, dvReader index.DocValueReader, field string, - minLon, minLat, maxLon, maxLat float64) FilterFunc { + minLon, minLat, maxLon, maxLat float64, +) FilterFunc { return func(d *search.DocumentMatch) bool { // check geo matches against all numeric type terms indexed var lons, lats []float64 @@ -253,8 +258,7 @@ func (grc *geoRangeCompute) makePrefixCoded(in int64, shift uint) (rv numeric.Pr grc.preallocBytes = make([]byte, grc.preallocBytesLen) } - rv, grc.preallocBytes, _ = - numeric.NewPrefixCodedInt64Prealloc(in, shift, grc.preallocBytes) + rv, grc.preallocBytes, _ = numeric.NewPrefixCodedInt64Prealloc(in, shift, grc.preallocBytes) return rv } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go index ae113107d..6cd097714 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go @@ -27,7 +27,8 @@ import ( func NewGeoShapeSearcher(ctx context.Context, indexReader index.IndexReader, shape index.GeoJSON, relation string, field string, boost float64, - options search.SearcherOptions) (search.Searcher, error) { + options search.SearcherOptions, +) (search.Searcher, error) { var err error var spatialPlugin index.SpatialAnalyzerPlugin @@ -54,9 +55,7 @@ func NewGeoShapeSearcher(ctx context.Context, indexReader index.IndexReader, sha return nil, err } - return NewFilteringSearcher(ctx, mSearcher, - buildRelationFilterOnShapes(ctx, dvReader, field, relation, shape)), nil - + return NewFilteringSearcher(ctx, mSearcher, buildRelationFilterOnShapes(ctx, dvReader, field, relation, shape)), nil } // Using the same term splitter slice used in the doc values in zap. @@ -65,7 +64,8 @@ func NewGeoShapeSearcher(ctx context.Context, indexReader index.IndexReader, sha var termSeparatorSplitSlice = []byte{0xff} func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueReader, field string, - relation string, shape index.GeoJSON) FilterFunc { + relation string, shape index.GeoJSON, +) FilterFunc { // this is for accumulating the shape's actual complete value // spread across multiple docvalue visitor callbacks. var dvShapeValue []byte @@ -73,8 +73,8 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea var reader *bytes.Reader var bufPool *s2.GeoBufferPool - if ctx != nil { - bufPool = ctx.Value(search.GeoBufferPoolCallbackKey).(search.GeoBufferPoolCallbackFunc)() + if bufPoolCallback, ok := ctx.Value(search.GeoBufferPoolCallbackKey).(search.GeoBufferPoolCallbackFunc); ok { + bufPool = bufPoolCallback() } return func(d *search.DocumentMatch) bool { @@ -82,7 +82,6 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { - // only consider the values which are GlueBytes prefixed or // if it had already started reading the shape bytes from previous callbacks. 
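// (dvShapeValue, declared above, accumulates the glue-framed shape bytes
// across these visitor callbacks; startReading/finishReading track whether a
// value is mid-read or complete, and the buffer is reset once the relation
// filter has been applied.)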
if startReading || len(term) > geo.GlueBytesOffset { @@ -110,11 +109,11 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea // apply the filter once the entire docvalue is finished reading. if finishReading { - v, err := geojson.FilterGeoShapesOnRelation(shape, - dvShapeValue, relation, &reader, bufPool) + v, err := geojson.FilterGeoShapesOnRelation(shape, dvShapeValue, relation, &reader, bufPool) if err == nil && v { found = true } + dvShapeValue = dvShapeValue[:0] startReading = false finishReading = false diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go index 866900d4e..a95a714b3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_knn.go @@ -50,23 +50,14 @@ type KNNSearcher struct { func NewKNNSearcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions, field string, vector []float32, k int64, boost float64, similarityMetric string, searchParams json.RawMessage, - filterIDs []index.IndexInternalID) ( + eligibleSelector index.EligibleDocumentSelector) ( search.Searcher, error) { if vr, ok := i.(index.VectorIndexReader); ok { - var vectorReader index.VectorReader - var err error - - if len(filterIDs) > 0 { - vectorReader, err = vr.VectorReaderWithFilter(ctx, vector, field, k, - searchParams, filterIDs) - } else { - vectorReader, err = vr.VectorReader(ctx, vector, field, k, searchParams) - } + vectorReader, err := vr.VectorReader(ctx, vector, field, k, searchParams, eligibleSelector) if err != nil { return nil, err } - knnScorer := scorer.NewKNNQueryScorer(vector, field, boost, options, similarityMetric) return &KNNSearcher{ diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_multi_term.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_multi_term.go index 913f99f55..98f8f92b8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_multi_term.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_multi_term.go @@ -45,6 +45,31 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te options, limit) } +// Works similarly to the multi term searcher but additionally boosts individual terms based on +// their edit distance from the query terms +func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, + field string, boost float64, editDistances []uint8, options search.SearcherOptions, limit bool) ( + search.Searcher, error) { + + if tooManyClauses(len(terms)) { + if optionsDisjunctionOptimizable(options) { + return optimizeMultiTermSearcher(ctx, indexReader, terms, field, boost, options) + } + if limit { + return nil, tooManyClausesErr(field, len(terms)) + } + } + + qsearchers, err := makeBatchSearchersBoosted(ctx, indexReader, terms, field, boost, editDistances, options) + if err != nil { + return nil, err + } + + // build disjunction searcher of these ranges + return newMultiTermSearcherInternal(ctx, indexReader, qsearchers, field, boost, + options, limit) +} + func NewMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { @@ -151,6 +176,32 @@ func makeBatchSearchers(ctx context.Context, indexReader index.IndexReader, term return qsearchers, 
nil } +func makeBatchSearchersBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, field string, + boost float64, editDistances []uint8, options search.SearcherOptions) ([]search.Searcher, error) { + + qsearchers := make([]search.Searcher, len(terms)) + qsearchersClose := func() { + for _, searcher := range qsearchers { + if searcher != nil { + _ = searcher.Close() + } + } + } + for i, term := range terms { + var err error + editMultiplier := 1.0 // neutral multiplier when no edit distances are supplied + if editDistances != nil { + editMultiplier = 1 / float64(editDistances[i]+1) + } + qsearchers[i], err = NewTermSearcher(ctx, indexReader, term, field, boost*editMultiplier, options) + if err != nil { + qsearchersClose() + return nil, err + } + } + return qsearchers, nil +} + func optimizeMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_phrase.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_phrase.go index a7bdb2c81..07675cfad 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_phrase.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_phrase.go @@ -67,25 +67,32 @@ func (s *PhraseSearcher) Size() int { } func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string, - fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) { + fuzziness int, autoFuzzy bool, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) { // turn flat terms []string into [][]string mterms := make([][]string, len(terms)) for i, term := range terms { mterms[i] = []string{term} } - return NewMultiPhraseSearcher(ctx, indexReader, mterms, fuzziness, field, boost, options) + return NewMultiPhraseSearcher(ctx, indexReader, mterms, fuzziness, autoFuzzy, field, boost, options) } func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string, - fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) { + fuzziness int, autoFuzzy bool, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) { options.IncludeTermVectors = true var termPositionSearchers []search.Searcher var err error var ts search.Searcher + // The following logic checks if fuzziness is enabled. + // Fuzziness is considered enabled if either: + // a. `fuzziness` is greater than 0, or + // b. `autoFuzzy` is set to true. + // If both conditions are true, `autoFuzzy` takes precedence. + // If enabled, a map will be created to store the matches for fuzzy terms.
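+	// For reference, with the defaults in search_fuzzy.go
+	// (AutoFuzzinessLowThreshold=2, AutoFuzzinessHighThreshold=5, MaxFuzziness=2),
+	// auto fuzziness resolves as:
+	//   GetAutoFuzziness("go")     == 0  (len <= 2)
+	//   GetAutoFuzziness("bleve")  == 1  (2 < len <= 5)
+	//   GetAutoFuzziness("lucene") == 2  (len > 5)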
+ fuzzinessEnabled := autoFuzzy || fuzziness > 0 var fuzzyTermMatches map[string][]string - if fuzziness > 0 { + if fuzzinessEnabled { fuzzyTermMatches = make(map[string][]string) ctx = context.WithValue(ctx, search.FuzzyMatchPhraseKey, fuzzyTermMatches) } @@ -95,9 +102,15 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, for _, termPos := range terms { if len(termPos) == 1 && termPos[0] != "" { // single term - if fuzziness > 0 { + if fuzzinessEnabled { // fuzzy - ts, err = NewFuzzySearcher(ctx, indexReader, termPos[0], 0, fuzziness, field, boost, options) + if autoFuzzy { + // auto fuzzy + ts, err = NewAutoFuzzySearcher(ctx, indexReader, termPos[0], 0, field, boost, options) + } else { + // non-auto fuzzy + ts, err = NewFuzzySearcher(ctx, indexReader, termPos[0], 0, fuzziness, field, boost, options) + } } else { // non-fuzzy ts, err = NewTermSearcher(ctx, indexReader, termPos[0], field, boost, options) @@ -117,9 +130,15 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, if term == "" { continue } - if fuzziness > 0 { + if fuzzinessEnabled { // fuzzy - ts, err = NewFuzzySearcher(ctx, indexReader, term, 0, fuzziness, field, boost, options) + if autoFuzzy { + // auto fuzzy + ts, err = NewAutoFuzzySearcher(ctx, indexReader, term, 0, field, boost, options) + } else { + // non-auto fuzzy + ts, err = NewFuzzySearcher(ctx, indexReader, term, 0, fuzziness, field, boost, options) + } } else { // non-fuzzy ts, err = NewTermSearcher(ctx, indexReader, term, field, boost, options) @@ -145,6 +164,42 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, } } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if fuzzinessEnabled { + for term, fuzzyTerms := range fuzzyTermMatches { + fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms)) + if s, found := ts[term]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + for _, fuzzyTerm := range fuzzyTerms { + if fuzzyTerm == term { + continue + } + if s, found := ts[fuzzyTerm]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + } + if len(fuzzySynonymTerms) > 0 { + fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...) + } + } + } else { + for _, termPos := range terms { + for _, term := range termPos { + if s, found := ts[term]; found { + if fuzzyTermMatches == nil { + fuzzyTermMatches = make(map[string][]string) + } + fuzzyTermMatches[term] = s + } + } + } + } + } + } + } mustSearcher, err := NewConjunctionSearcher(ctx, indexReader, termPositionSearchers, options) if err != nil { // close any searchers already opened @@ -318,6 +373,9 @@ func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expanded for term, fuzzyMatches := range s.fuzzyTermMatches { locations := tlm[term] for _, fuzzyMatch := range fuzzyMatches { + if fuzzyMatch == term { + continue + } locations = append(locations, tlm[fuzzyMatch]...) 
} expandedTlm[term] = locations diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_regexp.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_regexp.go index b88133e31..1afdaee02 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_regexp.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_regexp.go @@ -48,7 +48,7 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, return NewRegexpSearcher(ctx, indexReader, r, field, boost, options) } - fieldDict, err := ir.FieldDictRegexp(field, pattern) + fieldDict, a, err := ir.FieldDictRegexpAutomaton(field, pattern) if err != nil { return nil, err } @@ -58,17 +58,37 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, } }() + var termSet = make(map[string]struct{}) var candidateTerms []string tfd, err := fieldDict.Next() for err == nil && tfd != nil { - candidateTerms = append(candidateTerms, tfd.Term) - tfd, err = fieldDict.Next() + if _, exists := termSet[tfd.Term]; !exists { + termSet[tfd.Term] = struct{}{} + candidateTerms = append(candidateTerms, tfd.Term) + } + // always advance the iterator, even on a duplicate term, so the loop cannot spin forever + tfd, err = fieldDict.Next() } if err != nil { return nil, err } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if a.MatchesRegex(term) { + termSet[term] = struct{}{} + candidateTerms = append(candidateTerms, term) + } + } + } + } + } + return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost, options, true) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term.go index cd794ea32..1c33c6a41 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term.go @@ -16,6 +16,8 @@ package searcher import ( "context" + "fmt" + "math" "reflect" "github.com/blevesearch/bleve/v2/search" @@ -38,30 +40,109 @@ type TermSearcher struct { tfd index.TermFieldDoc } -func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, + term string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { if isTermQuery(ctx) { ctx = context.WithValue(ctx, search.QueryTypeKey, search.Term) } return NewTermSearcherBytes(ctx, indexReader, []byte(term), field, boost, options) } -func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, + term []byte, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if s, found := ts[string(term)]; found { + return NewSynonymSearcher(ctx, indexReader, term, s, field, boost, options) + } + } + } + } needFreqNorm := options.Score != "none" reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err !=
nil { return nil, err } - return newTermSearcherFromReader(indexReader, reader, term, field, boost, options) + return newTermSearcherFromReader(ctx, indexReader, reader, term, field, boost, options) } -func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermFieldReader, - term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func tfIDFScoreMetrics(indexReader index.IndexReader) (uint64, error) { + // default tf-idf stats count, err := indexReader.DocCount() if err != nil { - _ = reader.Close() - return nil, err + return 0, err } - scorer := scorer.NewTermQueryScorer(term, field, boost, count, reader.Count(), options) + + if count == 0 { + return 0, nil + } + return count, nil +} + +func bm25ScoreMetrics(ctx context.Context, field string, + indexReader index.IndexReader) (uint64, float64, error) { + var count uint64 + var fieldCardinality int + var err error + + bm25Stats, ok := ctx.Value(search.BM25PreSearchDataKey).(*search.BM25Stats) + if !ok { + count, err = indexReader.DocCount() + if err != nil { + return 0, 0, err + } + dict, err := indexReader.FieldDict(field) + if err != nil { + return 0, 0, err + } + fieldCardinality = dict.Cardinality() + } else { + count = uint64(bm25Stats.DocCount) + fieldCardinality, ok = bm25Stats.FieldCardinality[field] + if !ok { + return 0, 0, fmt.Errorf("field stat for bm25 not present %s", field) + } + } + + if count == 0 && fieldCardinality == 0 { + return 0, 0, nil + } + return count, math.Ceil(float64(fieldCardinality) / float64(count)), nil +} + +func newTermSearcherFromReader(ctx context.Context, indexReader index.IndexReader, + reader index.TermFieldReader, term []byte, field string, boost float64, + options search.SearcherOptions) (*TermSearcher, error) { + var count uint64 + var avgDocLength float64 + var err error + var similarityModel string + + // as a fallback case we track certain stats for tf-idf scoring + if ctx != nil { + if similarityModelCallback, ok := ctx.Value(search.
+ GetScoringModelCallbackKey).(search.GetScoringModelCallbackFn); ok { + similarityModel = similarityModelCallback() + } + } + switch similarityModel { + case index.BM25Scoring: + count, avgDocLength, err = bm25ScoreMetrics(ctx, field, indexReader) + if err != nil { + _ = reader.Close() + return nil, err + } + case index.TFIDFScoring: + fallthrough + default: + count, err = tfIDFScoreMetrics(indexReader) + if err != nil { + _ = reader.Close() + return nil, err + } + } + scorer := scorer.NewTermQueryScorer(term, field, boost, count, reader.Count(), avgDocLength, options) return &TermSearcher{ indexReader: indexReader, reader: reader, @@ -69,6 +150,50 @@ func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermF }, nil } +func NewSynonymSearcher(ctx context.Context, indexReader index.IndexReader, term []byte, synonyms []string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + createTermSearcher := func(term []byte, boostVal float64) (search.Searcher, error) { + needFreqNorm := options.Score != "none" + reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) + if err != nil { + return nil, err + } + return newTermSearcherFromReader(ctx, indexReader, reader, term, field, boostVal, options) + } + // create a searcher for the term itself + termSearcher, err := createTermSearcher(term, boost) + if err != nil { + return nil, err + } + // constituent searchers of the disjunction + qsearchers := make([]search.Searcher, 0, len(synonyms)+1) + // helper method to close all the searchers we've created + // in case of an error + qsearchersClose := func() { + for _, searcher := range qsearchers { + if searcher != nil { + _ = searcher.Close() + } + } + } + qsearchers = append(qsearchers, termSearcher) + // create a searcher for each synonym + for _, synonym := range synonyms { + synonymSearcher, err := createTermSearcher([]byte(synonym), boost/2.0) + if err != nil { + qsearchersClose() + return nil, err + } + qsearchers = append(qsearchers, synonymSearcher) + } + // create a disjunction searcher + rv, err := NewDisjunctionSearcher(ctx, indexReader, qsearchers, 0, options) + if err != nil { + qsearchersClose() + return nil, err + } + return rv, nil +} + func (s *TermSearcher) Size() int { return reflectStaticSizeTermSearcher + size.SizeOfPtr + s.reader.Size() + diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term_prefix.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term_prefix.go index dc16e4864..3d98cd28e 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term_prefix.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_term_prefix.go @@ -16,6 +16,7 @@ package searcher import ( "context" + "strings" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" @@ -36,13 +37,17 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p }() var terms []string + var termSet = make(map[string]struct{}) tfd, err := fieldDict.Next() for err == nil && tfd != nil { - terms = append(terms, tfd.Term) - if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(field, len(terms)) + if _, exists := termSet[tfd.Term]; !exists { + termSet[tfd.Term] = struct{}{} + terms = append(terms, tfd.Term) + if tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(field, len(terms)) + } } - tfd, err = fieldDict.Next() + // always advance the iterator, even on a duplicate term, so the loop cannot spin forever + tfd, err = fieldDict.Next() } if err !=
nil { return nil, err @@ -53,5 +58,29 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p search.RecordSearchCost(ctx, search.AddM, fieldDict.BytesRead()) } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if strings.HasPrefix(term, prefix) { + termSet[term] = struct{}{} + terms = append(terms, term) + if tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(field, len(terms)) + } + } + } + } + } + } + + // check if the terms are empty or have one term which is the prefix itself + if len(terms) == 0 || (len(terms) == 1 && terms[0] == prefix) { + return NewTermSearcher(ctx, indexReader, prefix, field, boost, options) + } + return NewMultiTermSearcher(ctx, indexReader, terms, field, boost, options, true) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/sort.go b/vendor/github.com/blevesearch/bleve/v2/search/sort.go index b13fa16c1..2b757c48d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/sort.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/sort.go @@ -28,8 +28,10 @@ import ( "github.com/blevesearch/bleve/v2/util" ) -var HighTerm = strings.Repeat(string(utf8.MaxRune), 3) -var LowTerm = string([]byte{0x00}) +var ( + HighTerm = strings.Repeat(string(utf8.MaxRune), 3) + LowTerm = string([]byte{0x00}) +) type SearchSort interface { UpdateVisitor(field string, term []byte) @@ -47,10 +49,15 @@ type SearchSort interface { func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) { descending, ok := input["desc"].(bool) + if !ok { + descending = false + } + by, ok := input["by"].(string) if !ok { return nil, fmt.Errorf("search sort must specify by") } + switch by { case "id": return &SortDocID{ @@ -612,7 +619,8 @@ var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0)) // NewSortGeoDistance creates SearchSort instance for sorting documents by // their distance from the specified point. func NewSortGeoDistance(field, unit string, lon, lat float64, desc bool) ( - *SortGeoDistance, error) { + *SortGeoDistance, error, +) { rv := &SortGeoDistance{ Field: field, Desc: desc, diff --git a/vendor/github.com/blevesearch/bleve/v2/search/util.go b/vendor/github.com/blevesearch/bleve/v2/search/util.go index 6472803d1..0530c6732 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/util.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/util.go @@ -135,14 +135,74 @@ const MinGeoBufPoolSize = 24 type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool -const KnnPreSearchDataKey = "_knn_pre_search_data_key" - +// PreSearchKey indicates whether to perform a preliminary search to gather necessary +// information which would be used in the actual search down the line. const PreSearchKey = "_presearch_key" -type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) +// *PreSearchDataKey are used to store the data gathered during the presearch phase +// which would be used in the actual search phase. +const KnnPreSearchDataKey = "_knn_pre_search_data_key" +const SynonymPreSearchDataKey = "_synonym_pre_search_data_key" +const BM25PreSearchDataKey = "_bm25_pre_search_data_key" + +// SearchTypeKey is used to identify the type of search being performed.
+// +// For consistent scoring in cases where an index is partitioned/sharded (using an +// index alias), GlobalScoring helps in aggregating the necessary stats across +// all the child bleve indexes (shards/partitions) first before the actual search +// is performed, such that the scoring that uses these stats is applied at a +// global level. +const SearchTypeKey = "_search_type_key" + +// The following keys are used to invoke the callbacks at the start and end stages +// of optimizing the disjunction/conjunction searcher creation. +const SearcherStartCallbackKey = "_searcher_start_callback_key" +const SearcherEndCallbackKey = "_searcher_end_callback_key" + +// FieldTermSynonymMapKey is used to store and transport the synonym definitions data +// to the actual search phase which would use the synonyms to perform the search. +const FieldTermSynonymMapKey = "_field_term_synonym_map_key" + +const GlobalScoring = "_global_scoring" + +// GetScoringModelCallbackKey is used to help the underlying searcher identify +// which scoring mechanism to use based on index mapping. +const GetScoringModelCallbackKey = "_get_scoring_model" type SearcherStartCallbackFn func(size uint64) error type SearcherEndCallbackFn func(size uint64) error -const SearcherStartCallbackKey = "_searcher_start_callback_key" -const SearcherEndCallbackKey = "_searcher_end_callback_key" +type GetScoringModelCallbackFn func() string + +type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) + +// field -> term -> synonyms +type FieldTermSynonymMap map[string]map[string][]string + +func (f FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) { + for field, termSynonymMap := range fts { + // Ensure the field exists in the receiver + if _, exists := f[field]; !exists { + f[field] = make(map[string][]string) + } + for term, synonyms := range termSynonymMap { + // Append synonyms + f[field][term] = append(f[field][term], synonyms...) + } + } +} + +// BM25 specific multipliers which control the scoring of a document.
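+//
+// As implemented in TermQueryScorer.docScore, the per-term bm25 score is
+//   idf * (tf * k1) / (tf + k1*(1 - b + b*fieldLength/avgDocLength))
+// so the two knobs below plug directly into that expression.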
+// +// BM25_b - controls the extent to which a doc's field length normalizes the term frequency part of the score +// BM25_k1 - controls the saturation of the score due to term frequency +// the default values are as per Elasticsearch's implementation +// - https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html#bm25 +// - https://www.elastic.co/blog/practical-bm25-part-3-considerations-for-picking-b-and-k1-in-elasticsearch +var BM25_k1 float64 = 1.2 +var BM25_b float64 = 0.75 + +type BM25Stats struct { + DocCount float64 `json:"doc_count"` + FieldCardinality map[string]int `json:"field_cardinality"` +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_knn.go index 309b36593..b3a44036b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search_knn.go @@ -250,8 +250,7 @@ var ( knnOperatorOr = knnOperator("or") ) -func createKNNQuery(req *SearchRequest, eligibleDocsMap map[int][]index.IndexInternalID, - requiresFiltering map[int]bool) ( +func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleDocumentSelector) ( query.Query, []int64, int64, error) { if requestHasKNN(req) { // first perform validation @@ -265,21 +264,16 @@ func createKNNQuery(req *SearchRequest, eligibleDocsMap map[int][]index.IndexInt for i, knn := range req.KNN { // If it's a filtered kNN but has no eligible filter hits, then // do not run the kNN query. - if requiresFiltering[i] && len(eligibleDocsMap[i]) <= 0 { + if selector, exists := knnFilterResults[i]; exists && selector == nil { continue } - knnQuery := query.NewKNNQuery(knn.Vector) knnQuery.SetFieldVal(knn.Field) knnQuery.SetK(knn.K) knnQuery.SetBoost(knn.Boost.Value()) knnQuery.SetParams(knn.Params) - if len(eligibleDocsMap[i]) > 0 { - knnQuery.SetFilterQuery(knn.FilterQuery) - filterResults, exists := eligibleDocsMap[i] - if exists { - knnQuery.SetFilterResults(filterResults) - } + if selector, exists := knnFilterResults[i]; exists { + knnQuery.SetEligibleSelector(selector) } subQueries = append(subQueries, knnQuery) kArray = append(kArray, knn.K) @@ -293,12 +287,6 @@ func createKNNQuery(req *SearchRequest, eligibleDocsMap map[int][]index.IndexInt } func validateKNN(req *SearchRequest) error { - if req.KNN != nil && - req.KNNOperator != "" && - req.KNNOperator != knnOperatorOr && - req.KNNOperator != knnOperatorAnd { - return fmt.Errorf("unknown knn operator: %s", req.KNNOperator) - } for _, q := range req.KNN { if q == nil { return fmt.Errorf("knn query cannot be nil") @@ -359,36 +347,24 @@ func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMat } func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) { - // maps the index of the KNN query in the req to the pre-filter hits aka - // eligible docs' internal IDs . - filterHitsMap := make(map[int][]index.IndexInternalID) - // Indicates if this query requires filtering downstream - // No filtering required if it's a match all query/no filters applied. - requiresFiltering := make(map[int]bool) - + // Maps the index of a KNN query in the request to its pre-filter result: + // - If the KNN query is **not filtered**, the value will be `nil`. + // - If the KNN query **is filtered**, the value will be an eligible document selector + // that can be used to retrieve eligible documents.
+ // - If there is an **empty entry** for a KNN query, it means no documents match + // the filter query, and the KNN query can be skipped. + knnFilterResults := make(map[int]index.EligibleDocumentSelector) for idx, knnReq := range req.KNN { - // TODO Can use goroutines for this filter query stuff - do it if perf results - // show this to be significantly slow otherwise. filterQ := knnReq.FilterQuery - if filterQ == nil { - requiresFiltering[idx] = false + if filterQ == nil || isMatchAllQuery(filterQ) { + // When there is no filter query or the filter query is match_all, + // all documents are eligible, and can be treated as unfiltered query. + continue + } else if isMatchNoneQuery(filterQ) { + // If the filter query is match_none, then no documents match the filter query. + knnFilterResults[idx] = nil continue } - - if _, ok := filterQ.(*query.MatchAllQuery); ok { - // Equivalent to not having a filter query. - requiresFiltering[idx] = false - continue - } - - if _, ok := filterQ.(*query.MatchNoneQuery); ok { - // Filtering required since no hits are eligible. - requiresFiltering[idx] = true - // a match none query just means none the documents are eligible - // hence, we can save on running the query. - continue - } - // Applies to all supported types of queries. filterSearcher, _ := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ Score: "none", // just want eligible hits --> don't compute scores if not needed @@ -404,17 +380,11 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if err != nil { return nil, err } - filterHits := filterColl.IDs() - if len(filterHits) > 0 { - filterHitsMap[idx] = filterHits - } - // set requiresFiltering regardless of whether there're filtered hits or - // not to later decide whether to consider the knnQuery or not - requiresFiltering[idx] = true + knnFilterResults[idx] = filterColl.EligibleSelector() } // Add the filter hits when creating the kNN query - KNNQuery, kArray, sumOfK, err := createKNNQuery(req, filterHitsMap, requiresFiltering) + KNNQuery, kArray, sumOfK, err := createKNNQuery(req, knnFilterResults) if err != nil { return nil, err } @@ -559,7 +529,7 @@ func requestHasKNN(req *SearchRequest) bool { func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { // if req.Query is not match_none => then we need to go to phase 2 // to perform the actual query. - if _, ok := req.Query.(*query.MatchNoneQuery); !ok { + if !isMatchNoneQuery(req.Query) { return false } // req.Query is a match_none query @@ -598,41 +568,6 @@ func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { dummyReq.Sort = realReq.Sort } -// the preSearchData for KNN is a list of DocumentMatch objects -// that need to be redistributed to the right index. -// This is used only in the case of an alias tree, where the indexes -// are at the leaves of the tree, and the master alias is at the root. -// At each level of the tree, the preSearchData needs to be redistributed -// to the indexes/aliases at that level. Because the preSearchData is -// specific to each final index at the leaf. 
-func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { - knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch) - if !ok { - return nil, fmt.Errorf("request does not have knn preSearchData for redistribution") - } - segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes) - if err != nil { - return nil, err - } - - rv := make(map[string]map[string]interface{}) - for _, index := range indexes { - rv[index.Name()] = make(map[string]interface{}) - } - - for _, index := range indexes { - for k, v := range req.PreSearchData { - switch k { - case search.KnnPreSearchDataKey: - rv[index.Name()][k] = segregatedKnnHits[index.Name()] - default: - rv[index.Name()][k] = v - } - } - } - return rv, nil -} - func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor { kArray := make([]int64, len(req.KNN)) for i, knnReq := range req.KNN { diff --git a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go index bb72e15a9..c91980589 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go @@ -187,7 +187,7 @@ func requestHasKNN(req *SearchRequest) bool { func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { } -func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { +func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) { return nil, nil } diff --git a/vendor/github.com/blevesearch/bleve_index_api/document.go b/vendor/github.com/blevesearch/bleve_index_api/document.go index 0f9012fd1..bc91c6c4c 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/document.go +++ b/vendor/github.com/blevesearch/bleve_index_api/document.go @@ -14,7 +14,10 @@ package index -import "time" +import ( + "net" + "time" +) type Document interface { ID() string @@ -29,6 +32,8 @@ type Document interface { AddIDField() StoredFieldsBytes() uint64 + + Indexed() bool } type FieldVisitor func(Field) @@ -81,6 +86,11 @@ type GeoPointField interface { type GeoShapeField interface { GeoShape() (GeoJSON, error) + EncodedShape() []byte +} + +type IPField interface { + IP() (net.IP, error) } // TokenizableSpatialField is an optional interface for fields that @@ -91,3 +101,26 @@ type TokenizableSpatialField interface { // to override the spatial token generations during the analysis phase. SetSpatialAnalyzerPlugin(SpatialAnalyzerPlugin) } + +// SynonymField represents a field that contains a list of synonyms for a set of terms. +// Each SynonymField is generated from a single synonym definition, and its name corresponds +// to the synonym source to which the synonym definition belongs. +type SynonymField interface { + Field + // IterateSynonyms iterates over the synonyms for the term in the field. + // The provided visitor function is called with each term and its corresponding synonyms. + IterateSynonyms(visitor func(term string, synonyms []string)) +} + +// SynonymFieldVisitor is a function type used to visit a SynonymField within a document. +type SynonymFieldVisitor func(SynonymField) + +// SynonymDocument represents a special type of document that contains synonym fields. +// Each SynonymField is a field with a list of synonyms for a set of terms. 
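+// For example, a (hypothetical) definition from source "en-thesaurus" mapping
+// "quick" -> ["fast", "speedy"] would surface as a SynonymField named
+// "en-thesaurus" whose IterateSynonyms visitor sees ("quick", ["fast", "speedy"]).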
+// These fields are derived from synonym definitions, and their names correspond to the synonym sources. +type SynonymDocument interface { + Document + // VisitSynonymFields allows iteration over all synonym fields in the document. + // The provided visitor function is called for each synonym field. + VisitSynonymFields(visitor SynonymFieldVisitor) +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/index.go b/vendor/github.com/blevesearch/bleve_index_api/index.go index c2125d660..34222d733 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/index.go +++ b/vendor/github.com/blevesearch/bleve_index_api/index.go @@ -105,12 +105,41 @@ type CopyReader interface { CloseCopyReader() error } -type IndexReaderRegexp interface { - FieldDictRegexp(field string, regex string) (FieldDict, error) +// RegexAutomaton abstracts an automaton built using a regex pattern. +type RegexAutomaton interface { + // MatchesRegex returns true if the given string matches the regex pattern + // used to build the automaton. + MatchesRegex(string) bool } +// IndexReaderRegexp provides functionality to work with regex-based field dictionaries. +type IndexReaderRegexp interface { + // FieldDictRegexp returns a FieldDict for terms matching the specified regex pattern + // in the dictionary of the given field. + FieldDictRegexp(field string, regex string) (FieldDict, error) + + // FieldDictRegexpAutomaton returns a FieldDict and a RegexAutomaton that can be used + // to match strings against the regex pattern. + FieldDictRegexpAutomaton(field string, regex string) (FieldDict, RegexAutomaton, error) +} + +// FuzzyAutomaton abstracts a Levenshtein automaton built using a term and a fuzziness value. +type FuzzyAutomaton interface { + // MatchAndDistance checks if the given string is within the fuzziness distance + // of the term used to build the automaton. It also returns the edit (Levenshtein) + // distance between the string and the term. + MatchAndDistance(term string) (bool, uint8) +} + +// IndexReaderFuzzy provides functionality to work with fuzzy matching in field dictionaries. type IndexReaderFuzzy interface { + // FieldDictFuzzy returns a FieldDict for terms that are within the specified fuzziness + // distance of the given term and match the specified prefix in the given field. FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error) + + // FieldDictFuzzyAutomaton returns a FieldDict and a FuzzyAutomaton that can be used + // to calculate the edit distance between the term and other strings. + FieldDictFuzzyAutomaton(field string, term string, fuzziness int, prefix string) (FieldDict, FuzzyAutomaton, error) } type IndexReaderContains interface { @@ -202,14 +231,16 @@ type TermFieldReader interface { } type DictEntry struct { - Term string - Count uint64 + Term string + Count uint64 + EditDistance uint8 } type FieldDict interface { Next() (*DictEntry, error) Close() error + Cardinality() int BytesRead() uint64 } @@ -251,3 +282,68 @@ type IndexBuilder interface { Index(doc Document) error Close() error } + +// ThesaurusTermReader is an interface for enumerating synonyms of a term in a thesaurus. +type ThesaurusTermReader interface { + // Next returns the next synonym of the term, or an error if something goes wrong. + // Returns an empty string when the enumeration is complete. + Next() (string, error) + + // Close releases any resources associated with the reader.
+ Close() error + + Size() int +} + +// ThesaurusEntry represents a term in the thesaurus for which synonyms are stored. +type ThesaurusEntry struct { + Term string +} + +// ThesaurusKeys is an interface for enumerating terms (keys) in a thesaurus. +type ThesaurusKeys interface { + // Next returns the next key in the thesaurus, or an error if something goes wrong. + // Returns nil when the enumeration is complete. + Next() (*ThesaurusEntry, error) + + // Close releases any resources associated with the reader. + Close() error +} + +// ThesaurusReader is an interface for accessing a thesaurus in the index. +type ThesaurusReader interface { + IndexReader + + // ThesaurusTermReader returns a reader for the synonyms of a given term in the + // specified thesaurus. + ThesaurusTermReader(ctx context.Context, name string, term []byte) (ThesaurusTermReader, error) + + // ThesaurusKeys returns a reader for all terms in the specified thesaurus. + ThesaurusKeys(name string) (ThesaurusKeys, error) + + // ThesaurusKeysFuzzy returns a reader for terms in the specified thesaurus that + // match the given prefix and are within the specified fuzziness distance from + // the provided term. + ThesaurusKeysFuzzy(name string, term string, fuzziness int, prefix string) (ThesaurusKeys, error) + + // ThesaurusKeysRegexp returns a reader for terms in the specified thesaurus that + // match the given regular expression pattern. + ThesaurusKeysRegexp(name string, regex string) (ThesaurusKeys, error) + + // ThesaurusKeysPrefix returns a reader for terms in the specified thesaurus that + // start with the given prefix. + ThesaurusKeysPrefix(name string, termPrefix []byte) (ThesaurusKeys, error) +} + +// EligibleDocumentSelector filters documents based on specific eligibility criteria. +// It can be extended with additional methods for filtering and retrieval. +type EligibleDocumentSelector interface { + // AddEligibleDocumentMatch marks a document as eligible for selection. + // id is the internal identifier of the document to be added. + AddEligibleDocumentMatch(id IndexInternalID) error + + // SegmentEligibleDocs returns a list of eligible document IDs within a given segment. + // segmentID identifies the segment for which eligible documents are retrieved. + // This must be called after all eligible documents have been added. + SegmentEligibleDocs(segmentID int) []uint64 +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go b/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go index 9724ccae0..a587dedf8 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go +++ b/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go @@ -24,6 +24,21 @@ const ( SkipFreqNorm ) +const ( + BM25Scoring = "bm25" + TFIDFScoring = "tfidf" +) + +// Scoring model indicates the algorithm used to rank documents fetched +// for a query performed on a text field. 
+const DefaultScoringModel = TFIDFScoring + +// Supported similarity models +var SupportedScoringModels = map[string]struct{}{ + BM25Scoring: {}, + TFIDFScoring: {}, +} + func (o FieldIndexingOptions) IsIndexed() bool { return o&IndexField != 0 } diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector.go b/vendor/github.com/blevesearch/bleve_index_api/vector.go index c1b5837a5..1057cf980 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/vector.go +++ b/vendor/github.com/blevesearch/bleve_index_api/vector.go @@ -37,10 +37,10 @@ const ( CosineSimilarity = "cosine" ) -const DefaultSimilarityMetric = EuclideanDistance +const DefaultVectorSimilarityMetric = EuclideanDistance // Supported similarity metrics for vector fields -var SupportedSimilarityMetrics = map[string]struct{}{ +var SupportedVectorSimilarityMetrics = map[string]struct{}{ EuclideanDistance: {}, InnerProduct: {}, CosineSimilarity: {}, diff --git a/vendor/github.com/blevesearch/bleve_index_api/vector_index.go b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go index d1a4ca3fe..23fc916db 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/vector_index.go +++ b/vendor/github.com/blevesearch/bleve_index_api/vector_index.go @@ -47,12 +47,21 @@ type VectorReader interface { Size() int } +// VectorIndexReader is an index reader that can retrieve similar vectors from a vector-based index. type VectorIndexReader interface { - VectorReader(ctx context.Context, vector []float32, field string, k int64, - searchParams json.RawMessage) (VectorReader, error) + // NewEligibleDocumentSelector returns an instance of an eligible document selector. + // This selector filters documents for KNN search based on a pre-filter query. + NewEligibleDocumentSelector() EligibleDocumentSelector - VectorReaderWithFilter(ctx context.Context, vector []float32, field string, k int64, - searchParams json.RawMessage, filterIDs []IndexInternalID) (VectorReader, error) + // VectorReader creates a new vector reader for performing KNN search. + // - vector: the query vector + // - field: the field to search in + // - k: the number of similar vectors to return + // - searchParams: additional search parameters + // - selector: an eligible document selector to filter documents before KNN search + VectorReader(ctx context.Context, vector []float32, field string, k int64, + searchParams json.RawMessage, selector EligibleDocumentSelector) ( + VectorReader, error) } type VectorDoc struct { diff --git a/vendor/github.com/blevesearch/go-faiss/index.go b/vendor/github.com/blevesearch/go-faiss/index.go index 58543f291..18177fc7e 100644 --- a/vendor/github.com/blevesearch/go-faiss/index.go +++ b/vendor/github.com/blevesearch/go-faiss/index.go @@ -44,9 +44,20 @@ type Index interface { // AddWithIDs is like Add, but stores xids instead of sequential IDs. AddWithIDs(x []float32, xids []int64) error - // Applicable only to IVF indexes: Return a map of centroid ID --> []vector IDs - // for the cluster. - ObtainClusterToVecIDsFromIVFIndex() (ids map[int64][]int64, err error) + // Returns true if the index is an IVF index. + IsIVFIndex() bool + + // Applicable only to IVF indexes: Returns a map where the keys + // are cluster IDs and the values represent the count of input vectors that belong + // to each cluster. + // This method only considers the given vecIDs and does not account for all + // vectors in the index. 
+ // Example: + // If vecIDs = [1, 2, 3, 4, 5], and: + // - Vectors 1 and 2 belong to cluster 1 + // - Vectors 3, 4, and 5 belong to cluster 2 + // The output will be: map[1:2, 2:3] + ObtainClusterVectorCountsFromIVFIndex(vecIDs []int64) (map[int64]int64, error) // Applicable only to IVF indexes: Returns the centroid IDs in decreasing order // of proximity to query 'x' and their distance from 'x' @@ -65,7 +76,7 @@ type Index interface { labels []int64, err error) // Applicable only to IVF indexes: Search clusters whose IDs are in eligibleCentroidIDs - SearchClustersFromIVFIndex(selector Selector, nvecs int, eligibleCentroidIDs []int64, + SearchClustersFromIVFIndex(selector Selector, eligibleCentroidIDs []int64, minEligibleCentroids int, k int64, x, centroidDis []float32, params json.RawMessage) ([]float32, []int64, error) @@ -140,24 +151,31 @@ func (idx *faissIndex) Add(x []float32) error { return nil } -func (idx *faissIndex) ObtainClusterToVecIDsFromIVFIndex() (map[int64][]int64, error) { - // This type assertion is required to determine whether to invoke - // ObtainClustersWithDistancesFromIVFIndex, SearchClustersFromIVFIndex or not. +func (idx *faissIndex) ObtainClusterVectorCountsFromIVFIndex(vecIDs []int64) (map[int64]int64, error) { + if !idx.IsIVFIndex() { + return nil, fmt.Errorf("index is not an IVF index") + } + clusterIDs := make([]int64, len(vecIDs)) + if c := C.faiss_get_lists_for_keys( + idx.idx, + (*C.idx_t)(unsafe.Pointer(&vecIDs[0])), + (C.size_t)(len(vecIDs)), + (*C.idx_t)(unsafe.Pointer(&clusterIDs[0])), + ); c != 0 { + return nil, getLastError() + } + rv := make(map[int64]int64, len(vecIDs)) + for _, v := range clusterIDs { + rv[v]++ + } + return rv, nil +} + +func (idx *faissIndex) IsIVFIndex() bool { if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx == nil { - return nil, nil + return false } - - clusterVectorIDMap := make(map[int64][]int64) - - nlist := C.faiss_IndexIVF_nlist(idx.idx) - for i := 0; i < int(nlist); i++ { - list_size := C.faiss_IndexIVF_get_list_size(idx.idx, C.size_t(i)) - invlist := make([]int64, list_size) - C.faiss_IndexIVF_invlists_get_ids(idx.idx, C.size_t(i), (*C.idx_t)(&invlist[0])) - clusterVectorIDMap[int64(i)] = invlist - } - - return clusterVectorIDMap, nil + return true } func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, centroidIDs []int64) ( @@ -169,10 +187,11 @@ func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, cent } defer includeSelector.Delete() - params, err := NewSearchParams(idx, json.RawMessage{}, includeSelector.Get()) + params, err := NewSearchParams(idx, json.RawMessage{}, includeSelector.Get(), nil) if err != nil { return nil, nil, err } + defer params.Delete() // Populate these with the centroids and their distances. 
centroids := make([]int64, len(centroidIDs)) @@ -180,9 +199,14 @@ func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, cent n := len(x) / idx.D() - c := C.faiss_Search_closest_eligible_centroids(idx.idx, (C.int)(n), - (*C.float)(&x[0]), (C.int)(len(centroidIDs)), - (*C.float)(¢roidDistances[0]), (*C.idx_t)(¢roids[0]), params.sp) + c := C.faiss_Search_closest_eligible_centroids( + idx.idx, + (C.idx_t)(n), + (*C.float)(&x[0]), + (C.idx_t)(len(centroidIDs)), + (*C.float)(¢roidDistances[0]), + (*C.idx_t)(¢roids[0]), + params.sp) if c != 0 { return nil, nil, getLastError() } @@ -190,24 +214,22 @@ func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, cent return centroids, centroidDistances, nil } -func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, nvecs int, +func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, eligibleCentroidIDs []int64, minEligibleCentroids int, k int64, x, centroidDis []float32, params json.RawMessage) ([]float32, []int64, error) { - defer selector.Delete() - tempParams := defaultSearchParamsIVF{ + tempParams := &defaultSearchParamsIVF{ Nlist: len(eligibleCentroidIDs), // Have to override nprobe so that more clusters will be searched for this // query, if required. Nprobe: minEligibleCentroids, - Nvecs: nvecs, } - searchParams, err := NewSearchParamsIVF(idx, params, selector.Get(), - tempParams) + searchParams, err := NewSearchParams(idx, params, selector.Get(), tempParams) if err != nil { return nil, nil, err } + defer searchParams.Delete() n := len(x) / idx.D() @@ -285,11 +307,11 @@ func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64, p defer excludeSelector.Delete() } - searchParams, err := NewSearchParams(idx, params, selector) - defer searchParams.Delete() + searchParams, err := NewSearchParams(idx, params, selector, nil) if err != nil { return nil, nil, err } + defer searchParams.Delete() distances, labels, err = idx.searchWithParams(x, k, searchParams.sp) @@ -305,7 +327,7 @@ func (idx *faissIndex) SearchWithIDs(x []float32, k int64, include []int64, } defer includeSelector.Delete() - searchParams, err := NewSearchParams(idx, params, includeSelector.Get()) + searchParams, err := NewSearchParams(idx, params, includeSelector.Get(), nil) if err != nil { return nil, nil, err } diff --git a/vendor/github.com/blevesearch/go-faiss/search_params.go b/vendor/github.com/blevesearch/go-faiss/search_params.go index 7099abf2e..608607382 100644 --- a/vendor/github.com/blevesearch/go-faiss/search_params.go +++ b/vendor/github.com/blevesearch/go-faiss/search_params.go @@ -34,7 +34,6 @@ type searchParamsIVF struct { type defaultSearchParamsIVF struct { Nprobe int `json:"ivf_nprobe,omitempty"` Nlist int `json:"ivf_nlist,omitempty"` - Nvecs int `json:"ivf_nvecs,omitempty"` } func (s *searchParamsIVF) Validate() error { @@ -55,119 +54,60 @@ func getNProbeFromSearchParams(params *SearchParams) int32 { return int32(C.faiss_SearchParametersIVF_nprobe(params.sp)) } -func NewSearchParamsIVF(idx Index, params json.RawMessage, sel *C.FaissIDSelector, - defaultParams defaultSearchParamsIVF) (*SearchParams, error) { - rv := &SearchParams{} - if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx != nil { - rv.sp = C.faiss_SearchParametersIVF_cast(rv.sp) - if len(params) == 0 && sel == nil { - return rv, nil - } - - var nprobe, maxCodes, nlist int - nlist = int(C.faiss_IndexIVF_nlist(ivfIdx)) - // It's important to set nprobe to the value decided at the time of - // index creation. 
Otherwise, nprobe will be set to the default - // value of 1. - nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx)) - - nvecs := idx.Ntotal() - if defaultParams.Nlist > 0 { - nlist = defaultParams.Nlist - } - if defaultParams.Nprobe > 0 { - nprobe = defaultParams.Nprobe - } - - var ivfParams searchParamsIVF - if len(params) > 0 { - if err := json.Unmarshal(params, &ivfParams); err != nil { - return rv, fmt.Errorf("failed to unmarshal IVF search params, "+ - "err:%v", err) - } - if err := ivfParams.Validate(); err != nil { - return rv, err - } - } - - if ivfParams.NprobePct > 0 { - // in the situation when the calculated nprobe happens to be - // between 0 and 1, we'll round it up. - nprobe = max(int(float32(nlist)*(ivfParams.NprobePct/100)), 1) - } - - if ivfParams.MaxCodesPct > 0 { - maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100)) - } // else, maxCodes will be set to the default value of 0, which means no limit - - if c := C.faiss_SearchParametersIVF_new_with( - &rv.sp, - sel, - C.size_t(nprobe), - C.size_t(maxCodes), - ); c != 0 { - return rv, fmt.Errorf("failed to create faiss IVF search params") - } - } - return rv, nil -} - -// Always return a valid SearchParams object, +// Returns a valid SearchParams object, // thus caller must clean up the object -// by invoking Delete() method, even if an error is returned. +// by invoking Delete() method. func NewSearchParams(idx Index, params json.RawMessage, sel *C.FaissIDSelector, -) (*SearchParams, error) { + defaultParams *defaultSearchParamsIVF) (*SearchParams, error) { rv := &SearchParams{} if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 { - return rv, fmt.Errorf("failed to create faiss search params") + return nil, fmt.Errorf("failed to create faiss search params") } - // check if the index is IVF and set the search params if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx != nil { rv.sp = C.faiss_SearchParametersIVF_cast(rv.sp) if len(params) == 0 && sel == nil { return rv, nil } - + var nlist, nprobe, nvecs, maxCodes int + nlist = int(C.faiss_IndexIVF_nlist(ivfIdx)) + nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx)) + nvecs = int(C.faiss_Index_ntotal(idx.cPtr())) + if defaultParams != nil { + if defaultParams.Nlist > 0 { + nlist = defaultParams.Nlist + } + if defaultParams.Nprobe > 0 { + nprobe = defaultParams.Nprobe + } + } var ivfParams searchParamsIVF if len(params) > 0 { if err := json.Unmarshal(params, &ivfParams); err != nil { - return rv, fmt.Errorf("failed to unmarshal IVF search params, "+ + rv.Delete() + return nil, fmt.Errorf("failed to unmarshal IVF search params, "+ "err:%v", err) } if err := ivfParams.Validate(); err != nil { - return rv, err + rv.Delete() + return nil, err } } - - var nprobe, maxCodes int - if ivfParams.NprobePct > 0 { - nlist := float32(C.faiss_IndexIVF_nlist(ivfIdx)) - // in the situation when the calculated nprobe happens to be - // between 0 and 1, we'll round it up. - nprobe = max(int(nlist*(ivfParams.NprobePct/100)), 1) - } else { - // it's important to set nprobe to the value decided at the time of - // index creation. Otherwise, nprobe will be set to the default - // value of 1. 
- nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx)) + nprobe = max(int(float32(nlist)*(ivfParams.NprobePct/100)), 1) } - if ivfParams.MaxCodesPct > 0 { - nvecs := C.faiss_Index_ntotal(idx.cPtr()) maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100)) } // else, maxCodes will be set to the default value of 0, which means no limit - if c := C.faiss_SearchParametersIVF_new_with( &rv.sp, sel, C.size_t(nprobe), C.size_t(maxCodes), ); c != 0 { - return rv, fmt.Errorf("failed to create faiss IVF search params") + rv.Delete() + return nil, fmt.Errorf("failed to create faiss IVF search params") } } - return rv, nil } diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go index 8e4a3d99c..526bb8872 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go @@ -17,7 +17,7 @@ package segment import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" ) @@ -68,6 +68,9 @@ type TermDictionary interface { startKeyInclusive, endKeyExclusive []byte) DictionaryIterator Contains(key []byte) (bool, error) + + // returns total number of terms in the term dictionary + Cardinality() int } type DictionaryIterator interface { @@ -178,3 +181,59 @@ type FieldStats interface { Aggregate(stats FieldStats) Fetch() map[string]map[string]uint64 } + +// ThesaurusSegment provides access to a thesaurus within a specific segment of the index. +type ThesaurusSegment interface { + Segment + // Thesaurus returns the Thesaurus with the specified name. + Thesaurus(name string) (Thesaurus, error) +} + +// Thesaurus encapsulates a structured collection of terms and their associated synonyms. +type Thesaurus interface { + // SynonymsList retrieves a list of synonyms for the specified term. The `except` parameter + // excludes specific synonyms, such as those originating from deleted documents. The `prealloc` + // parameter allows the use of preallocated memory to optimize performance. + SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) + + // AutomatonIterator creates an iterator over the thesaurus keys/terms using the provided automaton. + // The iteration is constrained by the specified key range [startKeyInclusive, endKeyExclusive). + // These terms or keys are the ones that have a SynonymsList associated with them, in the thesaurus. + AutomatonIterator(a Automaton, startKeyInclusive, endKeyExclusive []byte) ThesaurusIterator + + // Contains checks if the given key exists in the thesaurus. + Contains(key []byte) (bool, error) +} + +// ThesaurusIterator iterates over terms in a thesaurus. +type ThesaurusIterator interface { + // Next returns the next entry in the thesaurus or an error if iteration fails. + Next() (*index.ThesaurusEntry, error) +} + +// SynonymsList represents a list of synonyms for a term. +type SynonymsList interface { + // Iterator returns an iterator to traverse the list of synonyms. + // The `prealloc` parameter can be used to reuse existing memory for the iterator. + Iterator(prealloc SynonymsIterator) SynonymsIterator + + Size() int +} + +// SynonymsIterator provides a mechanism to iterate over a list of synonyms. +type SynonymsIterator interface { + // Next returns the next synonym in the list or an error if iteration fails. 
+ Next() (Synonym, error) + + Size() int +} + +// Synonym represents a single synonym for a term in the thesaurus. +type Synonym interface { + // Number returns the document number from which the synonym originates. + Number() uint32 + // Term returns the textual representation of the synonym. + Term() string + + Size() int +} diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go index 77b036212..a57e0b494 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go @@ -20,7 +20,7 @@ package segment import ( "encoding/json" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) type VecPostingsList interface { diff --git a/vendor/github.com/blevesearch/vellum/automaton.go b/vendor/github.com/blevesearch/vellum/automaton.go index 70398f2d4..41ac73fd5 100644 --- a/vendor/github.com/blevesearch/vellum/automaton.go +++ b/vendor/github.com/blevesearch/vellum/automaton.go @@ -83,3 +83,9 @@ func (m *AlwaysMatch) Accept(int, byte) int { // creating an alwaysMatchAutomaton to avoid unnecessary repeated allocations. var alwaysMatchAutomaton = &AlwaysMatch{} + +type FuzzyAutomaton interface { + Automaton + EditDistance(int) uint8 + MatchAndDistance(input string) (bool, uint8) +} diff --git a/vendor/github.com/blevesearch/vellum/fst_iterator.go b/vendor/github.com/blevesearch/vellum/fst_iterator.go index 2c6b0d68e..f5c374e24 100644 --- a/vendor/github.com/blevesearch/vellum/fst_iterator.go +++ b/vendor/github.com/blevesearch/vellum/fst_iterator.go @@ -44,6 +44,11 @@ type Iterator interface { Close() error } +type FuzzyIterator interface { + Iterator + EditDistance() uint8 +} + // FSTIterator is a structure for iterating key/value pairs in this FST in // lexicographic order. Iterators should be constructed with the FSTIterator // method on the parent FST structure. @@ -61,6 +66,8 @@ type FSTIterator struct { autStatesStack []int nextStart []byte + + editDistance uint8 } func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, @@ -74,6 +81,10 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, return rv, nil } +func (i *FSTIterator) EditDistance() uint8 { + return i.editDistance +} + // Reset resets the Iterator' internal state to allow for iterator // reuse (e.g. pooling). func (i *FSTIterator) Reset(f *FST, @@ -206,6 +217,9 @@ OUTER: cmp := bytes.Compare(i.keysStack, i.nextStart) if cmp > 0 { + if fa, ok := i.aut.(FuzzyAutomaton); ok { + i.editDistance = fa.EditDistance(autCurr) + } // in final state greater than start key return nil } diff --git a/vendor/github.com/blevesearch/vellum/levenshtein/dfa.go b/vendor/github.com/blevesearch/vellum/levenshtein/dfa.go index d0e43cac2..dc8ed0843 100644 --- a/vendor/github.com/blevesearch/vellum/levenshtein/dfa.go +++ b/vendor/github.com/blevesearch/vellum/levenshtein/dfa.go @@ -28,23 +28,45 @@ type DFA struct { ed uint8 } -/// Returns the initial state +// Returns the initial state func (d *DFA) initialState() int { return d.initState } -/// Returns the Levenshtein distance associated to the -/// current state. +// Returns the Levenshtein distance associated to the +// current state. func (d *DFA) distance(stateId int) Distance { return d.distances[stateId] } -/// Returns the number of states in the `DFA`. 
+func (d *DFA) EditDistance(stateId int) uint8 { + return d.distances[stateId].distance() +} + +func (d *DFA) MatchAndDistance(input string) (bool, uint8) { + currentState := d.Start() + index := 0 + // Traverse the DFA while characters can still match + for d.CanMatch(currentState) && index < len(input) { + currentState = d.Accept(currentState, input[index]) + if currentState == int(SinkState) { + break + } + index++ + } + // Ensure we've processed the entire input and check if the current state is a match + if index == len(input) && d.IsMatch(currentState) { + return true, d.EditDistance(currentState) + } + return false, 0 +} + +// Returns the number of states in the `DFA`. func (d *DFA) numStates() int { return len(d.transitions) } -/// Returns the destination state reached after consuming a given byte. +// Returns the destination state reached after consuming a given byte. func (d *DFA) transition(fromState int, b uint8) int { return int(d.transitions[fromState][b]) } diff --git a/vendor/github.com/blevesearch/vellum/regexp/regexp.go b/vendor/github.com/blevesearch/vellum/regexp/regexp.go index 920ddc370..8d28b23c0 100644 --- a/vendor/github.com/blevesearch/vellum/regexp/regexp.go +++ b/vendor/github.com/blevesearch/vellum/regexp/regexp.go @@ -117,3 +117,14 @@ func (r *Regexp) Accept(s int, b byte) int { } return 0 } + +func (r *Regexp) MatchesRegex(input string) bool { + currentState := r.Start() + index := 0 + // Traverse the DFA while characters can still match + for r.CanMatch(currentState) && index < len(input) { + currentState = r.Accept(currentState, input[index]) + index++ + } + return index == len(input) && r.IsMatch(currentState) +} diff --git a/vendor/github.com/blevesearch/zapx/v11/dict.go b/vendor/github.com/blevesearch/zapx/v11/dict.go index e30bf2420..bd54ab039 100644 --- a/vendor/github.com/blevesearch/zapx/v11/dict.go +++ b/vendor/github.com/blevesearch/zapx/v11/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -46,6 +46,13 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, return d.postingsList(term, except, preallocPL) } +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { diff --git a/vendor/github.com/blevesearch/zapx/v11/merge.go b/vendor/github.com/blevesearch/zapx/v11/merge.go index f0770e990..50bb2ba54 100644 --- a/vendor/github.com/blevesearch/zapx/v11/merge.go +++ b/vendor/github.com/blevesearch/zapx/v11/merge.go @@ -23,7 +23,7 @@ import ( "os" "sort" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" "github.com/golang/snappy" diff --git a/vendor/github.com/blevesearch/zapx/v11/new.go b/vendor/github.com/blevesearch/zapx/v11/new.go index 4491422aa..2d29a3b6c 100644 --- a/vendor/github.com/blevesearch/zapx/v11/new.go +++ b/vendor/github.com/blevesearch/zapx/v11/new.go @@ -21,7 +21,7 @@ import ( "sort" "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" 
"github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v11/posting.go b/vendor/github.com/blevesearch/zapx/v11/posting.go index 71b8e52be..857aeae5f 100644 --- a/vendor/github.com/blevesearch/zapx/v11/posting.go +++ b/vendor/github.com/blevesearch/zapx/v11/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/zapx/v11/segment.go b/vendor/github.com/blevesearch/zapx/v11/segment.go index 0b0b192f5..7465dac15 100644 --- a/vendor/github.com/blevesearch/zapx/v11/segment.go +++ b/vendor/github.com/blevesearch/zapx/v11/segment.go @@ -23,7 +23,7 @@ import ( "sync" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v11/write.go b/vendor/github.com/blevesearch/zapx/v11/write.go index cddaedd00..4cbf95bf3 100644 --- a/vendor/github.com/blevesearch/zapx/v11/write.go +++ b/vendor/github.com/blevesearch/zapx/v11/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v12/dict.go b/vendor/github.com/blevesearch/zapx/v12/dict.go index e30bf2420..bd54ab039 100644 --- a/vendor/github.com/blevesearch/zapx/v12/dict.go +++ b/vendor/github.com/blevesearch/zapx/v12/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -46,6 +46,13 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, return d.postingsList(term, except, preallocPL) } +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { diff --git a/vendor/github.com/blevesearch/zapx/v12/merge.go b/vendor/github.com/blevesearch/zapx/v12/merge.go index 6a853a16a..e962c6ec1 100644 --- a/vendor/github.com/blevesearch/zapx/v12/merge.go +++ b/vendor/github.com/blevesearch/zapx/v12/merge.go @@ -23,7 +23,7 @@ import ( "os" "sort" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" "github.com/golang/snappy" diff --git a/vendor/github.com/blevesearch/zapx/v12/new.go b/vendor/github.com/blevesearch/zapx/v12/new.go index b4e0d0341..47a509a17 100644 --- a/vendor/github.com/blevesearch/zapx/v12/new.go +++ b/vendor/github.com/blevesearch/zapx/v12/new.go @@ -21,7 +21,7 @@ import ( "sort" "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v12/posting.go b/vendor/github.com/blevesearch/zapx/v12/posting.go index 627a19b1d..a54dac5c2 100644 --- a/vendor/github.com/blevesearch/zapx/v12/posting.go +++ 
b/vendor/github.com/blevesearch/zapx/v12/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/zapx/v12/segment.go b/vendor/github.com/blevesearch/zapx/v12/segment.go index 1fbf78480..936b63836 100644 --- a/vendor/github.com/blevesearch/zapx/v12/segment.go +++ b/vendor/github.com/blevesearch/zapx/v12/segment.go @@ -23,7 +23,7 @@ import ( "sync" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v12/write.go b/vendor/github.com/blevesearch/zapx/v12/write.go index 77aefdbfc..d3786261b 100644 --- a/vendor/github.com/blevesearch/zapx/v12/write.go +++ b/vendor/github.com/blevesearch/zapx/v12/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v13/dict.go b/vendor/github.com/blevesearch/zapx/v13/dict.go index e30bf2420..bd54ab039 100644 --- a/vendor/github.com/blevesearch/zapx/v13/dict.go +++ b/vendor/github.com/blevesearch/zapx/v13/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -46,6 +46,13 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, return d.postingsList(term, except, preallocPL) } +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { diff --git a/vendor/github.com/blevesearch/zapx/v13/merge.go b/vendor/github.com/blevesearch/zapx/v13/merge.go index 6a853a16a..e962c6ec1 100644 --- a/vendor/github.com/blevesearch/zapx/v13/merge.go +++ b/vendor/github.com/blevesearch/zapx/v13/merge.go @@ -23,7 +23,7 @@ import ( "os" "sort" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" "github.com/golang/snappy" diff --git a/vendor/github.com/blevesearch/zapx/v13/new.go b/vendor/github.com/blevesearch/zapx/v13/new.go index b4e0d0341..47a509a17 100644 --- a/vendor/github.com/blevesearch/zapx/v13/new.go +++ b/vendor/github.com/blevesearch/zapx/v13/new.go @@ -21,7 +21,7 @@ import ( "sort" "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v13/posting.go b/vendor/github.com/blevesearch/zapx/v13/posting.go index 1ba133d6e..a07ea75f2 100644 --- a/vendor/github.com/blevesearch/zapx/v13/posting.go +++ b/vendor/github.com/blevesearch/zapx/v13/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git 
a/vendor/github.com/blevesearch/zapx/v13/segment.go b/vendor/github.com/blevesearch/zapx/v13/segment.go index 1fbf78480..936b63836 100644 --- a/vendor/github.com/blevesearch/zapx/v13/segment.go +++ b/vendor/github.com/blevesearch/zapx/v13/segment.go @@ -23,7 +23,7 @@ import ( "sync" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v13/write.go b/vendor/github.com/blevesearch/zapx/v13/write.go index 77aefdbfc..d3786261b 100644 --- a/vendor/github.com/blevesearch/zapx/v13/write.go +++ b/vendor/github.com/blevesearch/zapx/v13/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v14/dict.go b/vendor/github.com/blevesearch/zapx/v14/dict.go index e30bf2420..bd54ab039 100644 --- a/vendor/github.com/blevesearch/zapx/v14/dict.go +++ b/vendor/github.com/blevesearch/zapx/v14/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -46,6 +46,13 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, return d.postingsList(term, except, preallocPL) } +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { diff --git a/vendor/github.com/blevesearch/zapx/v14/merge.go b/vendor/github.com/blevesearch/zapx/v14/merge.go index 6a853a16a..e962c6ec1 100644 --- a/vendor/github.com/blevesearch/zapx/v14/merge.go +++ b/vendor/github.com/blevesearch/zapx/v14/merge.go @@ -23,7 +23,7 @@ import ( "os" "sort" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" "github.com/golang/snappy" diff --git a/vendor/github.com/blevesearch/zapx/v14/new.go b/vendor/github.com/blevesearch/zapx/v14/new.go index b4e0d0341..47a509a17 100644 --- a/vendor/github.com/blevesearch/zapx/v14/new.go +++ b/vendor/github.com/blevesearch/zapx/v14/new.go @@ -21,7 +21,7 @@ import ( "sort" "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v14/posting.go b/vendor/github.com/blevesearch/zapx/v14/posting.go index 8d138509d..a167abe0b 100644 --- a/vendor/github.com/blevesearch/zapx/v14/posting.go +++ b/vendor/github.com/blevesearch/zapx/v14/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/zapx/v14/segment.go b/vendor/github.com/blevesearch/zapx/v14/segment.go index 1fbf78480..936b63836 100644 --- a/vendor/github.com/blevesearch/zapx/v14/segment.go +++ b/vendor/github.com/blevesearch/zapx/v14/segment.go @@ 
-23,7 +23,7 @@ import ( "sync" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v14/write.go b/vendor/github.com/blevesearch/zapx/v14/write.go index 77aefdbfc..d3786261b 100644 --- a/vendor/github.com/blevesearch/zapx/v14/write.go +++ b/vendor/github.com/blevesearch/zapx/v14/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v15/dict.go b/vendor/github.com/blevesearch/zapx/v15/dict.go index 6b8acf52d..94f0ba6aa 100644 --- a/vendor/github.com/blevesearch/zapx/v15/dict.go +++ b/vendor/github.com/blevesearch/zapx/v15/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -48,6 +48,13 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, return d.postingsList(term, except, preallocPL) } +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { diff --git a/vendor/github.com/blevesearch/zapx/v15/merge.go b/vendor/github.com/blevesearch/zapx/v15/merge.go index 63ff2089b..738c24d6b 100644 --- a/vendor/github.com/blevesearch/zapx/v15/merge.go +++ b/vendor/github.com/blevesearch/zapx/v15/merge.go @@ -23,7 +23,7 @@ import ( "os" "sort" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" "github.com/golang/snappy" diff --git a/vendor/github.com/blevesearch/zapx/v15/new.go b/vendor/github.com/blevesearch/zapx/v15/new.go index 869d1b53f..2fd560af1 100644 --- a/vendor/github.com/blevesearch/zapx/v15/new.go +++ b/vendor/github.com/blevesearch/zapx/v15/new.go @@ -22,7 +22,7 @@ import ( "sync" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" diff --git a/vendor/github.com/blevesearch/zapx/v15/posting.go b/vendor/github.com/blevesearch/zapx/v15/posting.go index 07ae202f6..893f717aa 100644 --- a/vendor/github.com/blevesearch/zapx/v15/posting.go +++ b/vendor/github.com/blevesearch/zapx/v15/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/zapx/v15/segment.go b/vendor/github.com/blevesearch/zapx/v15/segment.go index 15bc911aa..a4938b4ba 100644 --- a/vendor/github.com/blevesearch/zapx/v15/segment.go +++ b/vendor/github.com/blevesearch/zapx/v15/segment.go @@ -24,7 +24,7 @@ import ( "sync/atomic" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" 
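The hunks in this stretch are the same mechanical migration repeated across every vendored zapx version: github.com/RoaringBitmap/roaring becomes github.com/RoaringBitmap/roaring/v2. Under Go's semantic import versioning only the module path changes; the package is still named roaring and the calls used throughout these files keep their signatures. A minimal before/after sketch for a consumer (the bitmap contents are illustrative):

```go
package main

import (
	"fmt"

	// was: "github.com/RoaringBitmap/roaring"
	"github.com/RoaringBitmap/roaring/v2"
)

func main() {
	// The package identifier is unchanged, so only the import path moves.
	bm := roaring.New()
	bm.AddMany([]uint32{3, 1, 2})
	fmt.Println(bm.GetCardinality()) // 3
	fmt.Println(bm.Contains(2))      // true
}
```

Because /v2 is a distinct module path, both major versions can coexist in one build during a staged migration, which is why each vendored package here can be flipped file by file without breaking compilation.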
diff --git a/vendor/github.com/blevesearch/zapx/v15/write.go b/vendor/github.com/blevesearch/zapx/v15/write.go index 77aefdbfc..d3786261b 100644 --- a/vendor/github.com/blevesearch/zapx/v15/write.go +++ b/vendor/github.com/blevesearch/zapx/v15/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v16/build.go b/vendor/github.com/blevesearch/zapx/v16/build.go index 53fd34d12..cbbd2abc3 100644 --- a/vendor/github.com/blevesearch/zapx/v16/build.go +++ b/vendor/github.com/blevesearch/zapx/v16/build.go @@ -171,6 +171,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64 docValueOffset: 0, // docValueOffsets identified automatically by the section fieldFSTs: make(map[uint16]*vellum.FST), vecIndexCache: newVectorIndexCache(), + synIndexCache: newSynonymIndexCache(), // following fields gets populated by loadFieldsNew fieldsMap: make(map[string]uint16), dictLocs: make([]uint64, 0), diff --git a/vendor/github.com/blevesearch/zapx/v16/dict.go b/vendor/github.com/blevesearch/zapx/v16/dict.go index d06278fd5..5ec7e27fd 100644 --- a/vendor/github.com/blevesearch/zapx/v16/dict.go +++ b/vendor/github.com/blevesearch/zapx/v16/dict.go @@ -17,7 +17,7 @@ package zap import ( "fmt" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -38,6 +38,13 @@ type Dictionary struct { // represents an immutable, empty dictionary var emptyDictionary = &Dictionary{} +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, prealloc segment.PostingsList) (segment.PostingsList, error) { @@ -165,6 +172,9 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } term, postingsOffset := i.itr.Current() + if fitr, ok := i.itr.(vellum.FuzzyIterator); ok { + i.entry.EditDistance = fitr.EditDistance() + } i.entry.Term = string(term) if !i.omitCount { i.err = i.tmp.read(postingsOffset, i.d) diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go index f4aa6087a..f315db2b2 100644 --- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_cache.go @@ -23,7 +23,7 @@ import ( "sync/atomic" "time" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" faiss "github.com/blevesearch/go-faiss" ) @@ -105,7 +105,7 @@ func (vc *vectorIndexCache) addDocVecIDMapToCacheLOCKED(ce *cacheEntry) map[uint return ce.docVecIDMap } - docVecIDMap := make(map[uint32][]int64) + docVecIDMap := make(map[uint32][]int64, len(ce.vecDocIDMap)) for vecID, docID := range ce.vecDocIDMap { docVecIDMap[docID] = append(docVecIDMap[docID], vecID) } diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go index 9c53c0745..0c823c13e 100644 --- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go @@ -23,8 +23,9 @@ import ( "math" "reflect" - 
"github.com/RoaringBitmap/roaring" - "github.com/RoaringBitmap/roaring/roaring64" + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/roaring64" + "github.com/bits-and-blooms/bitset" faiss "github.com/blevesearch/go-faiss" segment "github.com/blevesearch/scorch_segment_api/v2" ) @@ -87,7 +88,9 @@ var emptyVecPostingsIterator = &VecPostingsIterator{} var emptyVecPostingsList = &VecPostingsList{} func (vpl *VecPostingsList) Iterator(prealloc segment.VecPostingsIterator) segment.VecPostingsIterator { - + if vpl.postings == nil { + return emptyVecPostingsIterator + } // tbd: do we check the cardinality of postings and scores? var preallocPI *VecPostingsIterator pi, ok := prealloc.(*VecPostingsIterator) @@ -374,185 +377,139 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool except: nil, // todo: handle the except bitmap within postings iterator. postings: roaring64.New(), } - if vecIndex == nil || vecIndex.D() != len(qVector) { // vector index not found or dimensionality mismatched return rv, nil } - - if len(eligibleDocIDs) > 0 { - // Non-zero documents eligible per the filter query. - - // If every element in the index is eligible(eg. high selectivity - // cases), then this can basically be considered unfiltered kNN. - if len(eligibleDocIDs) == int(sb.numDocs) { - scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, - vectorIDsToExclude, params) - if err != nil { - return nil, err - } - - addIDsToPostingsList(rv, ids, scores) - return rv, nil - } - - // vector IDs corresponding to the local doc numbers to be - // considered for the search - vectorIDsToInclude := make([]int64, 0, len(eligibleDocIDs)) - for _, id := range eligibleDocIDs { - vectorIDsToInclude = append(vectorIDsToInclude, docVecIDMap[uint32(id)]...) - } - - if len(vectorIDsToInclude) == 0 { - return rv, nil - } - - // Retrieve the mapping of centroid IDs to vectors within - // the cluster. - clusterAssignment, _ := vecIndex.ObtainClusterToVecIDsFromIVFIndex() - // Accounting for a flat index - if len(clusterAssignment) == 0 { - scores, ids, err := vecIndex.SearchWithIDs(qVector, k, - vectorIDsToInclude, params) - if err != nil { - return nil, err - } - - addIDsToPostingsList(rv, ids, scores) - return rv, nil - } - - // Converting to roaring bitmap for ease of intersect ops with - // the set of eligible doc IDs. - centroidVecIDMap := make(map[int64]*roaring.Bitmap) - for centroidID, vecIDs := range clusterAssignment { - if _, exists := centroidVecIDMap[centroidID]; !exists { - centroidVecIDMap[centroidID] = roaring.NewBitmap() - } - vecIDsUint32 := make([]uint32, 0, len(vecIDs)) - for _, vecID := range vecIDs { - vecIDsUint32 = append(vecIDsUint32, uint32(vecID)) - } - centroidVecIDMap[centroidID].AddMany(vecIDsUint32) - } - - // Determining which clusters, identified by centroid ID, - // have at least one eligible vector and hence, ought to be - // probed. - eligibleCentroidIDs := make([]int64, 0) - - var selector faiss.Selector - var err error - // If there are more elements to be included than excluded, it - // might be quicker to use an exclusion selector as a filter - // instead of an inclusion selector. 
- if float32(len(eligibleDocIDs))/float32(len(docVecIDMap)) > 0.5 { - ineligibleVecIDsBitmap := roaring.NewBitmap() - eligibleDocIDsMap := make(map[uint64]struct{}) - for _, eligibleDocID := range eligibleDocIDs { - eligibleDocIDsMap[(eligibleDocID)] = struct{}{} - } - - ineligibleVectorIDs := make([]int64, 0, len(vecDocIDMap)- - len(vectorIDsToInclude)) - - for docID, vecIDs := range docVecIDMap { - if _, exists := eligibleDocIDsMap[uint64(docID)]; !exists { - for _, vecID := range vecIDs { - ineligibleVecIDsBitmap.Add(uint32(vecID)) - ineligibleVectorIDs = append(ineligibleVectorIDs, vecID) - } - } - } - - for centroidID, vecIDs := range centroidVecIDMap { - vecIDs.AndNot(ineligibleVecIDsBitmap) - // At least one eligible vec in cluster. - if !vecIDs.IsEmpty() { - // The mapping is now reduced to those vectors which - // are also eligible docs for the filter query. - centroidVecIDMap[centroidID] = vecIDs - eligibleCentroidIDs = append(eligibleCentroidIDs, centroidID) - } else { - // don't consider clusters with no eligible IDs. - delete(centroidVecIDMap, centroidID) - } - } - - selector, err = faiss.NewIDSelectorNot(ineligibleVectorIDs) - } else { - // Getting the vector IDs corresponding to the eligible - // doc IDs. - // The docVecIDMap maps each docID to vectorIDs corresponding - // to it. - // Usually, each docID has one vecID mapped to it unless - // the vector is nested, in which case there can be multiple - // vectorIDs mapped to the same docID. - // Eg. docID d1 -> vecID v1, for the first case - // d1 -> {v1,v2}, for the second case. - eligibleVecIDsBitmap := roaring.NewBitmap() - vecIDsUint32 := make([]uint32, 0) - for _, eligibleDocID := range eligibleDocIDs { - vecIDs := docVecIDMap[uint32(eligibleDocID)] - for _, vecID := range vecIDs { - vecIDsUint32 = append(vecIDsUint32, uint32(vecID)) - } - } - eligibleVecIDsBitmap.AddMany(vecIDsUint32) - for centroidID, vecIDs := range centroidVecIDMap { - vecIDs.And(eligibleVecIDsBitmap) - if !vecIDs.IsEmpty() { - // The mapping is now reduced to those vectors which - // are also eligible docs for the filter query. - centroidVecIDMap[centroidID] = vecIDs - eligibleCentroidIDs = append(eligibleCentroidIDs, centroidID) - } else { - // don't consider clusters with no eligible IDs. - delete(centroidVecIDMap, centroidID) - } - } - - selector, err = faiss.NewIDSelectorBatch(vectorIDsToInclude) - } + // Check and proceed only if non-zero documents eligible per the filter query. + if len(eligibleDocIDs) == 0 { + return rv, nil + } + // If every element in the index is eligible (full selectivity), + // then this can basically be considered unfiltered kNN. + if len(eligibleDocIDs) == int(sb.numDocs) { + scores, ids, err := vecIndex.SearchWithoutIDs(qVector, k, + vectorIDsToExclude, params) if err != nil { return nil, err } - - // Ordering the retrieved centroid IDs by increasing order - // of distance i.e. decreasing order of proximity to query vector. - closestCentroidIDs, centroidDistances, _ := - vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector, - eligibleCentroidIDs) - - // Getting the nprobe value set at index time. - nprobe := vecIndex.GetNProbe() - - eligibleDocsTillNow := int64(0) - minEligibleCentroids := 0 - for i, centroidID := range closestCentroidIDs { - eligibleDocsTillNow += int64(centroidVecIDMap[centroidID].GetCardinality()) - if eligibleDocsTillNow >= k && i >= int(nprobe-1) { - // Continue till at least 'K' cumulative vectors are - // collected or 'nprobe' clusters are examined, whichever - // comes later. 
- minEligibleCentroids = i + 1 - break - } - minEligibleCentroids = i + 1 - } - - // Search the clusters specified by 'closestCentroidIDs' for - // vectors whose IDs are present in 'vectorIDsToInclude' - scores, ids, err := vecIndex.SearchClustersFromIVFIndex( - selector, len(vectorIDsToInclude), closestCentroidIDs, - minEligibleCentroids, k, qVector, centroidDistances, params) - if err != nil { - return nil, err - } - addIDsToPostingsList(rv, ids, scores) return rv, nil } + // vector IDs corresponding to the local doc numbers to be + // considered for the search + vectorIDsToInclude := make([]int64, 0, len(eligibleDocIDs)) + for _, id := range eligibleDocIDs { + vecIDs := docVecIDMap[uint32(id)] + // In the common case where vecIDs has only one element, which occurs + // when a document has only one vector field, we can + // avoid the unnecessary overhead of slice unpacking (append(vecIDs...)). + // Directly append the single element for efficiency. + if len(vecIDs) == 1 { + vectorIDsToInclude = append(vectorIDsToInclude, vecIDs[0]) + } else { + vectorIDsToInclude = append(vectorIDsToInclude, vecIDs...) + } + } + // In case a doc has invalid vector fields but valid non-vector fields, + // filter hit IDs may be ineligible for the kNN since the document does + // not have any/valid vectors. + if len(vectorIDsToInclude) == 0 { + return rv, nil + } + // If the index is not an IVF index, then the search can be + // performed directly, using the Flat index. + if !vecIndex.IsIVFIndex() { + // vector IDs corresponding to the local doc numbers to be + // considered for the search + scores, ids, err := vecIndex.SearchWithIDs(qVector, k, + vectorIDsToInclude, params) + if err != nil { + return nil, err + } + addIDsToPostingsList(rv, ids, scores) + return rv, nil + } + // Determining which clusters, identified by centroid ID, + // have at least one eligible vector and hence, ought to be + // probed. + clusterVectorCounts, err := vecIndex.ObtainClusterVectorCountsFromIVFIndex(vectorIDsToInclude) + if err != nil { + return nil, err + } + var selector faiss.Selector + // If there are more elements to be included than excluded, it + // might be quicker to use an exclusion selector as a filter + // instead of an inclusion selector. + if float32(len(eligibleDocIDs))/float32(len(docVecIDMap)) > 0.5 { + // Use a bitset to efficiently track eligible document IDs. + // This reduces the lookup cost when checking if a document ID is eligible, + // compared to using a map or slice. + bs := bitset.New(uint(len(eligibleDocIDs))) + for _, docID := range eligibleDocIDs { + bs.Set(uint(docID)) + } + ineligibleVectorIDs := make([]int64, 0, len(vecDocIDMap)-len(vectorIDsToInclude)) + for docID, vecIDs := range docVecIDMap { + // Check if the document ID is NOT in the eligible set, marking it as ineligible. + if !bs.Test(uint(docID)) { + // In the common case where vecIDs has only one element, which occurs + // when a document has only one vector field, we can + // avoid the unnecessary overhead of slice unpacking (append(vecIDs...)). + // Directly append the single element for efficiency. + if len(vecIDs) == 1 { + ineligibleVectorIDs = append(ineligibleVectorIDs, vecIDs[0]) + } else { + ineligibleVectorIDs = append(ineligibleVectorIDs, vecIDs...) 
+ }
+ }
+ }
+ selector, err = faiss.NewIDSelectorNot(ineligibleVectorIDs)
+ } else {
+ selector, err = faiss.NewIDSelectorBatch(vectorIDsToInclude)
+ }
+ if err != nil {
+ return nil, err
+ }
+ // If no error occurred during the creation of the selector, then
+ // it should be deleted once the search is complete.
+ defer selector.Delete()
+ // Ordering the retrieved centroid IDs by increasing order
+ // of distance i.e. decreasing order of proximity to query vector.
+ centroidIDs := make([]int64, 0, len(clusterVectorCounts))
+ for centroidID := range clusterVectorCounts {
+ centroidIDs = append(centroidIDs, centroidID)
+ }
+ closestCentroidIDs, centroidDistances, err :=
+ vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector, centroidIDs)
+ if err != nil {
+ return nil, err
+ }
+ // Getting the nprobe value set at index time.
+ nprobe := int(vecIndex.GetNProbe())
+ // Determining the minimum number of centroids to be probed
+ // to ensure that at least 'k' vectors are collected while
+ // examining at least 'nprobe' centroids.
+ var eligibleDocsTillNow int64
+ minEligibleCentroids := len(closestCentroidIDs)
+ for i, centroidID := range closestCentroidIDs {
+ eligibleDocsTillNow += clusterVectorCounts[centroidID]
+ // Stop once we've examined at least 'nprobe' centroids and
+ // collected at least 'k' vectors.
+ if eligibleDocsTillNow >= k && i+1 >= nprobe {
+ minEligibleCentroids = i + 1
+ break
+ }
+ }
+ // Search the clusters specified by 'closestCentroidIDs' for
+ // vectors whose IDs are present in 'vectorIDsToInclude'
+ scores, ids, err := vecIndex.SearchClustersFromIVFIndex(
+ selector, closestCentroidIDs, minEligibleCentroids,
+ k, qVector, centroidDistances, params)
+ if err != nil {
+ return nil, err
+ }
+ addIDsToPostingsList(rv, ids, scores)
 return rv, nil
 },
 close: func() {
diff --git a/vendor/github.com/blevesearch/zapx/v16/merge.go b/vendor/github.com/blevesearch/zapx/v16/merge.go
index 683d92098..479f10be8 100644
--- a/vendor/github.com/blevesearch/zapx/v16/merge.go
+++ b/vendor/github.com/blevesearch/zapx/v16/merge.go
@@ -23,7 +23,7 @@ import (
 "os"
 "sort"

- "github.com/RoaringBitmap/roaring"
+ "github.com/RoaringBitmap/roaring/v2"
 seg "github.com/blevesearch/scorch_segment_api/v2"
 "github.com/golang/snappy"
 )
diff --git a/vendor/github.com/blevesearch/zapx/v16/new.go b/vendor/github.com/blevesearch/zapx/v16/new.go
index f0d37c434..c99b933d7 100644
--- a/vendor/github.com/blevesearch/zapx/v16/new.go
+++ b/vendor/github.com/blevesearch/zapx/v16/new.go
@@ -174,23 +174,6 @@ func (s *interim) convert() (uint64, uint64, error) {
 s.FieldsMap = map[string]uint16{}
 }

- args := map[string]interface{}{
- "results": s.results,
- "chunkMode": s.chunkMode,
- }
- if s.opaque == nil {
- s.opaque = map[int]resetable{}
- for i, x := range segmentSections {
- s.opaque[int(i)] = x.InitOpaque(args)
- }
- } else {
- for k, v := range args {
- for _, op := range s.opaque {
- op.Set(k, v)
- }
- }
- }
-
 s.getOrDefineField("_id") // _id field is fieldID 0

 for _, result := range s.results {
@@ -208,6 +191,25 @@ func (s *interim) convert() (uint64, uint64, error) {
 s.FieldsMap[fieldName] = uint16(fieldID + 1)
 }

+ args := map[string]interface{}{
+ "results": s.results,
+ "chunkMode": s.chunkMode,
+ "fieldsMap": s.FieldsMap,
+ "fieldsInv": s.FieldsInv,
+ }
+ if s.opaque == nil {
+ s.opaque = map[int]resetable{}
+ for i, x := range segmentSections {
+ s.opaque[int(i)] = x.InitOpaque(args)
+ }
+ } else {
+ for k, v := range args {
+ for _, op := range s.opaque {
+ op.Set(k, v)
+ }
+ }
+ }
+
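+ // Note: the section opaques above are initialized (or updated via Set)
+ // only after all fields in the batch have been defined, so that each
+ // section receives the fully populated "fieldsMap" and "fieldsInv".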
s.processDocuments() storedIndexOffset, err := s.writeStoredFields() diff --git a/vendor/github.com/blevesearch/zapx/v16/posting.go b/vendor/github.com/blevesearch/zapx/v16/posting.go index 07ae202f6..893f717aa 100644 --- a/vendor/github.com/blevesearch/zapx/v16/posting.go +++ b/vendor/github.com/blevesearch/zapx/v16/posting.go @@ -20,7 +20,7 @@ import ( "math" "reflect" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" segment "github.com/blevesearch/scorch_segment_api/v2" ) diff --git a/vendor/github.com/blevesearch/zapx/v16/section.go b/vendor/github.com/blevesearch/zapx/v16/section.go index 1ace25e3b..c6e8e1139 100644 --- a/vendor/github.com/blevesearch/zapx/v16/section.go +++ b/vendor/github.com/blevesearch/zapx/v16/section.go @@ -17,7 +17,7 @@ package zap import ( "sync" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" ) @@ -58,6 +58,7 @@ type resetable interface { const ( SectionInvertedTextIndex = iota SectionFaissVectorIndex + SectionSynonymIndex ) // ----------------------------------------------------------------------------- diff --git a/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go index 1c9f91a06..f163a7539 100644 --- a/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go +++ b/vendor/github.com/blevesearch/zapx/v16/section_faiss_vector_index.go @@ -25,7 +25,7 @@ import ( "sync/atomic" "time" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" faiss "github.com/blevesearch/go-faiss" seg "github.com/blevesearch/scorch_segment_api/v2" @@ -36,10 +36,10 @@ const defaultFaissOMPThreads = 1 func init() { rand.Seed(time.Now().UTC().UnixNano()) registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{}) - isFieldNotApplicableToInvertedTextSection = func(field index.Field) bool { + invertedTextIndexSectionExclusionChecks = append(invertedTextIndexSectionExclusionChecks, func(field index.Field) bool { _, ok := field.(index.VectorField) return ok - } + }) faiss.SetOMPThreads(defaultFaissOMPThreads) } @@ -146,17 +146,13 @@ func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*Se // remap the docID from the old segment to the new document nos. // provided. furthermore, also drop the now-invalid doc nums // of that segment - var vecIDNotDeleted bool // indicates if the vector ID was not deleted. - var newDocID uint64 // new docID in the new segment if newDocNumsIn[segI][uint32(docID)] != docDropped { - newDocID = newDocNumsIn[segI][uint32(docID)] - vecIDNotDeleted = true - } - // if the remapped doc ID is valid, track it - // as part of vecs to be reconstructed (for larger indexes). - // this would account only the valid vector IDs, so the deleted - // ones won't be reconstructed in the final index. - if vecIDNotDeleted { + newDocID := newDocNumsIn[segI][uint32(docID)] + + // if the remapped doc ID is valid, track it + // as part of vecs to be reconstructed (for larger indexes). + // this would account only the valid vector IDs, so the deleted + // ones won't be reconstructed in the final index. 
vecToDocID[vecID] = newDocID indexes[curIdx].vecIds = append(indexes[curIdx].vecIds, vecID) } @@ -250,11 +246,7 @@ func (v *vectorIndexOpaque) flushVectorIndex(indexBytes []byte, w *CountHashWrit // write the vector index data _, err = w.Write(indexBytes) - if err != nil { - return err - } - - return nil + return err } // Divide the estimated nprobe with this value to optimize @@ -379,7 +371,6 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase, // the reconstructed ones anymore and doing so will hold up memory which can // be detrimental while creating indexes during introduction. freeReconstructedIndexes(vecIndexes) - vecIndexes = nil faissIndex, err := faiss.IndexFactory(dims, indexDescription, metric) if err != nil { @@ -414,20 +405,12 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(sbs []*SegmentBase, return err } - indexData = nil - finalVecIDs = nil - var mergedIndexBytes []byte - mergedIndexBytes, err = faiss.WriteIndexIntoBuffer(faissIndex) + mergedIndexBytes, err := faiss.WriteIndexIntoBuffer(faissIndex) if err != nil { return err } - err = v.flushVectorIndex(mergedIndexBytes, w) - if err != nil { - return err - } - - return nil + return v.flushVectorIndex(mergedIndexBytes, w) } // todo: can be parallelized. diff --git a/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go b/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go index ea8722e47..400a02968 100644 --- a/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go +++ b/vendor/github.com/blevesearch/zapx/v16/section_inverted_text_index.go @@ -21,7 +21,7 @@ import ( "sort" "sync/atomic" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" index "github.com/blevesearch/bleve_index_api" seg "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -34,16 +34,35 @@ func init() { type invertedTextIndexSection struct { } -// this function is something that tells the inverted index section whether to -// process a particular field or not - since it might be processed by another -// section this function helps in avoiding unnecessary work. -// (only used by faiss vector section currently, will need a separate API for every -// section we introduce in the future or a better way forward - TODO) -var isFieldNotApplicableToInvertedTextSection func(field index.Field) bool +// This function checks whether the inverted text index section should avoid processing +// a particular field, preventing unnecessary work if another section will handle it. +// +// NOTE: The exclusion check is applicable only to the InvertedTextIndexSection +// because it serves as a catch-all section. This section processes every field +// unless explicitly excluded, similar to a "default" case in a switch statement. +// Other sections, such as VectorSection and SynonymSection, rely on inclusion +// checks to process only specific field types (e.g., index.VectorField or +// index.SynonymField). Any new section added in the future must define its +// special field type and inclusion logic explicitly. 
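+//
+// For example, the FAISS vector section registers its exclusion check as:
+//
+//	invertedTextIndexSectionExclusionChecks = append(invertedTextIndexSectionExclusionChecks,
+//		func(field index.Field) bool {
+//			_, ok := field.(index.VectorField)
+//			return ok
+//		})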
+var isFieldExcludedFromInvertedTextIndexSection = func(field index.Field) bool {
+ for _, excludeField := range invertedTextIndexSectionExclusionChecks {
+ if excludeField(field) {
+ // at least one section has agreed to exclude this field
+ // from inverted text index section processing and has
+ // agreed to process it independently
+ return true
+ }
+ }
+ // no section has excluded this field from inverted index processing
+ // so it should be processed by the inverted index section
+ return false
+}
+
+// List of checks to determine if a field is excluded from the inverted text index section
+var invertedTextIndexSectionExclusionChecks = make([]func(field index.Field) bool, 0)
 func (i *invertedTextIndexSection) Process(opaque map[int]resetable, docNum uint32,
 field index.Field, fieldID uint16) {
- if isFieldNotApplicableToInvertedTextSection == nil ||
- !isFieldNotApplicableToInvertedTextSection(field) {
+ if !isFieldExcludedFromInvertedTextIndexSection(field) {
 invIndexOpaque := i.getInvertedIndexOpaque(opaque)
 invIndexOpaque.process(field, fieldID, docNum)
 }
@@ -592,6 +611,11 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin
 fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false)
 if io.IncludeDocValues[fieldID] {
 for docNum, docTerms := range docTermMap {
+ if fieldTermMap, ok := io.extraDocValues[docNum]; ok {
+ if sTerm, ok := fieldTermMap[uint16(fieldID)]; ok {
+ docTerms = append(append(docTerms, sTerm...), termSeparator)
+ }
+ }
 if len(docTerms) > 0 {
 err = fdvEncoder.Add(uint64(docNum), docTerms)
 if err != nil {
@@ -736,7 +760,7 @@ func (i *invertedIndexOpaque) realloc() {
 i.FieldsMap[fieldName] = uint16(fieldID + 1)
 }

- visitField := func(field index.Field) {
+ visitField := func(field index.Field, docNum int) {
 fieldID := uint16(i.getOrDefineField(field.Name()))

 dict := i.Dicts[fieldID]
@@ -770,6 +794,13 @@ func (i *invertedIndexOpaque) realloc() {
 if field.Options().IncludeDocValues() {
 i.IncludeDocValues[fieldID] = true
 }
+
+ if f, ok := field.(index.GeoShapeField); ok {
+ if _, exists := i.extraDocValues[docNum]; !exists {
+ i.extraDocValues[docNum] = make(map[uint16][]byte)
+ }
+ i.extraDocValues[docNum][fieldID] = f.EncodedShape()
+ }
 }

 if cap(i.IncludeDocValues) >= len(i.FieldsInv) {
@@ -778,14 +809,20 @@ func (i *invertedIndexOpaque) realloc() {
 i.IncludeDocValues = make([]bool, len(i.FieldsInv))
 }

- for _, result := range i.results {
+ if i.extraDocValues == nil {
+ i.extraDocValues = map[int]map[uint16][]byte{}
+ }
+
+ for docNum, result := range i.results {
 // walk each composite field
 result.VisitComposite(func(field index.CompositeField) {
- visitField(field)
+ visitField(field, docNum)
 })

 // walk each field
- result.VisitFields(visitField)
+ result.VisitFields(func(field index.Field) {
+ visitField(field, docNum)
+ })
 }

 numPostingsLists := pidNext
@@ -896,6 +933,8 @@ func (i *invertedTextIndexSection) InitOpaque(args map[string]interface{}) reset
 }

 type invertedIndexOpaque struct {
+ bytesWritten uint64 // atomic access to this variable, moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS.
+ results []index.Document chunkMode uint32 @@ -937,6 +976,11 @@ type invertedIndexOpaque struct { numTermsPerPostingsList []int // key is postings list id numLocsPerPostingsList []int // key is postings list id + // store terms that are unnecessary for the term dictionaries but needed in doc values + // eg - encoded geoshapes + // docNum -> fieldID -> term + extraDocValues map[int]map[uint16][]byte + builder *vellum.Builder builderBuf bytes.Buffer @@ -948,9 +992,8 @@ type invertedIndexOpaque struct { fieldAddrs map[int]int - bytesWritten uint64 - fieldsSame bool - numDocs uint64 + fieldsSame bool + numDocs uint64 } func (io *invertedIndexOpaque) Reset() (err error) { @@ -997,6 +1040,7 @@ func (io *invertedIndexOpaque) Reset() (err error) { io.reusableFieldTFs = io.reusableFieldTFs[:0] io.tmp0 = io.tmp0[:0] + io.extraDocValues = nil atomic.StoreUint64(&io.bytesWritten, 0) io.fieldsSame = false io.numDocs = 0 diff --git a/vendor/github.com/blevesearch/zapx/v16/section_synonym_index.go b/vendor/github.com/blevesearch/zapx/v16/section_synonym_index.go new file mode 100644 index 000000000..3894d1ae7 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/section_synonym_index.go @@ -0,0 +1,786 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package zap
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "math"
+ "sort"
+
+ "github.com/RoaringBitmap/roaring/v2"
+ "github.com/RoaringBitmap/roaring/v2/roaring64"
+ index "github.com/blevesearch/bleve_index_api"
+ seg "github.com/blevesearch/scorch_segment_api/v2"
+ "github.com/blevesearch/vellum"
+)
+
+func init() {
+ registerSegmentSection(SectionSynonymIndex, &synonymIndexSection{})
+ invertedTextIndexSectionExclusionChecks = append(invertedTextIndexSectionExclusionChecks, func(field index.Field) bool {
+ _, ok := field.(index.SynonymField)
+ return ok
+ })
+}
+
+// -----------------------------------------------------------------------------
+
+type synonymIndexOpaque struct {
+ results []index.Document
+
+ // indicates whether the following structs are initialized
+ init bool
+
+ // FieldsMap maps field name to field id and must be set in
+ // the index opaque using the key "fieldsMap"
+ // used for ensuring accurate mapping between fieldID and
+ // thesaurusID
+ // name -> field id + 1
+ FieldsMap map[string]uint16
+
+ // ThesaurusMap adds 1 to thesaurus id to avoid zero value issues
+ // name -> thesaurus id + 1
+ ThesaurusMap map[string]uint16
+
+ // ThesaurusInv is the inverse of ThesaurusMap
+ // thesaurus id + 1 -> name
+ ThesaurusInv []string
+
+ // Thesaurus for each thesaurus ID
+ // thesaurus id -> LHS term -> synonym postings list id + 1
+ Thesauri []map[string]uint64
+
+ // LHS Terms for each thesaurus ID, where terms are sorted ascending
+ // thesaurus id -> []term
+ ThesaurusKeys [][]string
+
+ // FieldIDtoThesaurusID maps the field id to the thesaurus id
+ // field id -> thesaurus id
+ FieldIDtoThesaurusID map[uint16]int
+
+ // SynonymTermToID maps term to synonym id for each thesaurus
+ // thesaurus id -> term -> synonym id
+ SynonymTermToID []map[string]uint32
+
+ // SynonymIDtoTerm maps synonym id to term for each thesaurus
+ // thesaurus id -> synonym id -> term
+ // this is the inverse of SynonymTermToID for each thesaurus
+ SynonymIDtoTerm []map[uint32]string
+
+ // synonym postings list -> synonym bitmap
+ Synonyms []*roaring64.Bitmap
+
+ // A reusable vellum FST builder that will be stored in the synonym opaque
+ // and reused across multiple document batches during the persist phase
+ // of the synonym index section. The FST builder is used to build the
+ // FST for each thesaurus, which maps terms to their corresponding synonym bitmaps.
+ builder *vellum.Builder
+
+ // A reusable buffer for the vellum FST builder. It streams data written
+ // into the builder into a byte slice. The final byte slice represents
+ // the serialized vellum FST, which will be written to disk
+ builderBuf bytes.Buffer
+
+ // A reusable buffer for temporary use within the synonym index opaque
+ tmp0 []byte
+
+ // A map linking thesaurus IDs to their corresponding thesaurus' file offsets
+ thesaurusAddrs map[int]int
+}
+
+// Set the fieldsMap and results in the synonym index opaque before the section processes a synonym field.
+func (so *synonymIndexOpaque) Set(key string, value interface{}) {
+ switch key {
+ case "results":
+ so.results = value.([]index.Document)
+ case "fieldsMap":
+ so.FieldsMap = value.(map[string]uint16)
+ }
+}
+
+// Reset the synonym index opaque after a batch of documents has been processed into a segment.
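+// Reusable allocations (the vellum builder, builderBuf and tmp0) are truncated
+// rather than released, so they can be reused while processing the next batch.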
+func (so *synonymIndexOpaque) Reset() (err error) { + // cleanup stuff over here + so.results = nil + so.init = false + so.ThesaurusMap = nil + so.ThesaurusInv = so.ThesaurusInv[:0] + for i := range so.Thesauri { + so.Thesauri[i] = nil + } + so.Thesauri = so.Thesauri[:0] + for i := range so.ThesaurusKeys { + so.ThesaurusKeys[i] = so.ThesaurusKeys[i][:0] + } + so.ThesaurusKeys = so.ThesaurusKeys[:0] + for _, idn := range so.Synonyms { + idn.Clear() + } + so.Synonyms = so.Synonyms[:0] + so.builderBuf.Reset() + if so.builder != nil { + err = so.builder.Reset(&so.builderBuf) + } + so.FieldIDtoThesaurusID = nil + so.SynonymTermToID = so.SynonymTermToID[:0] + so.SynonymIDtoTerm = so.SynonymIDtoTerm[:0] + + so.tmp0 = so.tmp0[:0] + return err +} + +func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16, docNum uint32) { + // if this is the first time we are processing a synonym field in this batch + // we need to allocate memory for the thesauri and related data structures + if !so.init { + so.realloc() + so.init = true + } + + // get the thesaurus id for this field + tid := so.FieldIDtoThesaurusID[fieldID] + + // get the thesaurus for this field + thesaurus := so.Thesauri[tid] + + termSynMap := so.SynonymTermToID[tid] + + field.IterateSynonyms(func(term string, synonyms []string) { + pid := thesaurus[term] - 1 + + bs := so.Synonyms[pid] + + for _, syn := range synonyms { + code := encodeSynonym(termSynMap[syn], docNum) + bs.Add(code) + } + }) +} + +// a one-time call to allocate memory for the thesauri and synonyms which takes +// all the documents in the result batch and the fieldsMap and predetermines the +// size of the data structures in the synonymIndexOpaque +func (so *synonymIndexOpaque) realloc() { + var pidNext int + var sidNext uint32 + so.ThesaurusMap = map[string]uint16{} + so.FieldIDtoThesaurusID = map[uint16]int{} + + // count the number of unique thesauri from the batch of documents + for _, result := range so.results { + if synDoc, ok := result.(index.SynonymDocument); ok { + synDoc.VisitSynonymFields(func(synField index.SynonymField) { + fieldIDPlus1 := so.FieldsMap[synField.Name()] + so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name()) + }) + } + } + + for _, result := range so.results { + if synDoc, ok := result.(index.SynonymDocument); ok { + synDoc.VisitSynonymFields(func(synField index.SynonymField) { + fieldIDPlus1 := so.FieldsMap[synField.Name()] + thesaurusID := so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name()) + + thesaurus := so.Thesauri[thesaurusID] + thesaurusKeys := so.ThesaurusKeys[thesaurusID] + + synTermMap := so.SynonymIDtoTerm[thesaurusID] + + termSynMap := so.SynonymTermToID[thesaurusID] + + // iterate over all the term-synonyms pair from the field + synField.IterateSynonyms(func(term string, synonyms []string) { + _, exists := thesaurus[term] + if !exists { + pidNext++ + pidPlus1 := uint64(pidNext) + + thesaurus[term] = pidPlus1 + thesaurusKeys = append(thesaurusKeys, term) + } + for _, syn := range synonyms { + _, exists := termSynMap[syn] + if !exists { + termSynMap[syn] = sidNext + synTermMap[sidNext] = syn + sidNext++ + } + } + }) + so.ThesaurusKeys[thesaurusID] = thesaurusKeys + }) + } + } + + numSynonymsLists := pidNext + + if cap(so.Synonyms) >= numSynonymsLists { + so.Synonyms = so.Synonyms[:numSynonymsLists] + } else { + synonyms := make([]*roaring64.Bitmap, numSynonymsLists) + copy(synonyms, so.Synonyms[:cap(so.Synonyms)]) + for i := 0; i < numSynonymsLists; i++ { + if synonyms[i] == nil { + synonyms[i] = 
roaring64.New()
+ }
+ }
+ so.Synonyms = synonyms
+ }
+
+ for _, thes := range so.ThesaurusKeys {
+ sort.Strings(thes)
+ }
+}
+
+// getOrDefineThesaurus returns the thesaurus id for the given field id and thesaurus name.
+func (so *synonymIndexOpaque) getOrDefineThesaurus(fieldID uint16, thesaurusName string) int {
+ thesaurusIDPlus1, exists := so.ThesaurusMap[thesaurusName]
+ if !exists {
+ // need to create a new thesaurusID for this thesaurusName
+ thesaurusIDPlus1 = uint16(len(so.ThesaurusInv) + 1)
+ so.ThesaurusMap[thesaurusName] = thesaurusIDPlus1
+ so.ThesaurusInv = append(so.ThesaurusInv, thesaurusName)
+
+ so.Thesauri = append(so.Thesauri, make(map[string]uint64))
+
+ so.SynonymIDtoTerm = append(so.SynonymIDtoTerm, make(map[uint32]string))
+
+ so.SynonymTermToID = append(so.SynonymTermToID, make(map[string]uint32))
+
+ // map the fieldID to the thesaurusID
+ so.FieldIDtoThesaurusID[fieldID] = int(thesaurusIDPlus1 - 1)
+
+ n := len(so.ThesaurusKeys)
+ if n < cap(so.ThesaurusKeys) {
+ so.ThesaurusKeys = so.ThesaurusKeys[:n+1]
+ so.ThesaurusKeys[n] = so.ThesaurusKeys[n][:0]
+ } else {
+ so.ThesaurusKeys = append(so.ThesaurusKeys, []string(nil))
+ }
+ }
+
+ return int(thesaurusIDPlus1 - 1)
+}
+
+// grabBuf returns a reusable buffer of the given size from the synonymIndexOpaque.
+func (so *synonymIndexOpaque) grabBuf(size int) []byte {
+ buf := so.tmp0
+ if cap(buf) < size {
+ buf = make([]byte, size)
+ so.tmp0 = buf
+ }
+ return buf[:size]
+}
+
+func (so *synonymIndexOpaque) writeThesauri(w *CountHashWriter) (thesOffsets []uint64, err error) {
+
+ if so.results == nil || len(so.results) == 0 {
+ return nil, nil
+ }
+
+ thesOffsets = make([]uint64, len(so.ThesaurusInv))
+
+ buf := so.grabBuf(binary.MaxVarintLen64)
+
+ if so.builder == nil {
+ so.builder, err = vellum.New(&so.builderBuf, nil)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ for thesaurusID, terms := range so.ThesaurusKeys {
+ thes := so.Thesauri[thesaurusID]
+ for _, term := range terms { // terms are already sorted
+ pid := thes[term] - 1
+ postingsBS := so.Synonyms[pid]
+ postingsOffset, err := writeSynonyms(postingsBS, w, buf)
+ if err != nil {
+ return nil, err
+ }
+
+ if postingsOffset > uint64(0) {
+ err = so.builder.Insert([]byte(term), postingsOffset)
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ err = so.builder.Close()
+ if err != nil {
+ return nil, err
+ }
+
+ thesOffsets[thesaurusID] = uint64(w.Count())
+
+ vellumData := so.builderBuf.Bytes()
+
+ // write out the length of the vellum data
+ n := binary.PutUvarint(buf, uint64(len(vellumData)))
+ _, err = w.Write(buf[:n])
+ if err != nil {
+ return nil, err
+ }
+
+ // write this vellum to disk
+ _, err = w.Write(vellumData)
+ if err != nil {
+ return nil, err
+ }
+
+ // reset vellum for reuse
+ so.builderBuf.Reset()
+
+ err = so.builder.Reset(&so.builderBuf)
+ if err != nil {
+ return nil, err
+ }
+
+ // write out the synTermMap for this thesaurus
+ err := writeSynTermMap(so.SynonymIDtoTerm[thesaurusID], w, buf)
+ if err != nil {
+ return nil, err
+ }
+
+ thesaurusStart := w.Count()
+
+ n = binary.PutUvarint(buf, fieldNotUninverted)
+ _, err = w.Write(buf[:n])
+ if err != nil {
+ return nil, err
+ }
+
+ n = binary.PutUvarint(buf, fieldNotUninverted)
+ _, err = w.Write(buf[:n])
+ if err != nil {
+ return nil, err
+ }
+
+ n = binary.PutUvarint(buf, thesOffsets[thesaurusID])
+ _, err = w.Write(buf[:n])
+ if err != nil {
+ return nil, err
+ }
+ so.thesaurusAddrs[thesaurusID] = thesaurusStart
+ }
+ return thesOffsets, nil
+}
+
+// 
----------------------------------------------------------------------------- + +type synonymIndexSection struct { +} + +func (s *synonymIndexSection) getSynonymIndexOpaque(opaque map[int]resetable) *synonymIndexOpaque { + if _, ok := opaque[SectionSynonymIndex]; !ok { + opaque[SectionSynonymIndex] = s.InitOpaque(nil) + } + return opaque[SectionSynonymIndex].(*synonymIndexOpaque) +} + +// Implementations of the Section interface for the synonym index section. +// InitOpaque initializes the synonym index opaque, which sets the FieldsMap and +// results in the opaque before the section processes a synonym field. +func (s *synonymIndexSection) InitOpaque(args map[string]interface{}) resetable { + rv := &synonymIndexOpaque{ + thesaurusAddrs: map[int]int{}, + } + for k, v := range args { + rv.Set(k, v) + } + + return rv +} + +// Process processes a synonym field by adding the synonyms to the thesaurus +// pointed to by the fieldID, implements the Process API for the synonym index section. +func (s *synonymIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) { + if fieldID == math.MaxUint16 { + return + } + if sf, ok := field.(index.SynonymField); ok { + so := s.getSynonymIndexOpaque(opaque) + so.process(sf, fieldID, docNum) + } +} + +// Persist serializes and writes the thesauri processed to the writer, along +// with the synonym postings lists, and the synonym term map. Implements the +// Persist API for the synonym index section. +func (s *synonymIndexSection) Persist(opaque map[int]resetable, w *CountHashWriter) (n int64, err error) { + synIndexOpaque := s.getSynonymIndexOpaque(opaque) + _, err = synIndexOpaque.writeThesauri(w) + return 0, err +} + +// AddrForField returns the file offset of the thesaurus for the given fieldID, +// it uses the FieldIDtoThesaurusID map to translate the fieldID to the thesaurusID, +// and returns the corresponding thesaurus offset from the thesaurusAddrs map. +// Implements the AddrForField API for the synonym index section. +func (s *synonymIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int { + synIndexOpaque := s.getSynonymIndexOpaque(opaque) + if synIndexOpaque == nil || synIndexOpaque.FieldIDtoThesaurusID == nil { + return 0 + } + tid, exists := synIndexOpaque.FieldIDtoThesaurusID[uint16(fieldID)] + if !exists { + return 0 + } + return synIndexOpaque.thesaurusAddrs[tid] +} + +// Merge merges the thesauri, synonym postings lists and synonym term maps from +// the segments into a single thesaurus and serializes and writes the merged +// thesaurus and associated data to the writer. Implements the Merge API for the +// synonym index section. +func (s *synonymIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase, + drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64, + w *CountHashWriter, closeCh chan struct{}) error { + so := s.getSynonymIndexOpaque(opaque) + thesaurusAddrs, fieldIDtoThesaurusID, err := mergeAndPersistSynonymSection(segments, drops, fieldsInv, newDocNumsIn, w, closeCh) + if err != nil { + return err + } + + so.thesaurusAddrs = thesaurusAddrs + so.FieldIDtoThesaurusID = fieldIDtoThesaurusID + return nil +} + +// ----------------------------------------------------------------------------- + +// encodeSynonym encodes a synonymID and a docID into a single uint64 value. 
+// The encoding format splits the 64 bits as follows:
+//
+//	63        32 31       0
+//	+-----------+----------+
+//	| synonymID |  docNum  |
+//	+-----------+----------+
+//
+// The upper 32 bits (63-32) store the synonymID, and the lower 32 bits (31-0) store the docID.
+//
+// Parameters:
+//
+//	synonymID - A 32-bit unsigned integer representing the ID of the synonym.
+//	docID - A 32-bit unsigned integer representing the document ID.
+//
+// Returns:
+//
+//	A 64-bit unsigned integer that combines the synonymID and docID.
+func encodeSynonym(synonymID uint32, docID uint32) uint64 {
+ return uint64(synonymID)<<32 | uint64(docID)
+}
+
+// writeSynonyms serializes and writes the synonym postings list to the writer, by first
+// serializing the postings list to a byte slice and then writing the length
+// of the byte slice followed by the byte slice itself.
+func writeSynonyms(postings *roaring64.Bitmap, w *CountHashWriter, bufMaxVarintLen64 []byte) (
+ offset uint64, err error) {
+ termCardinality := postings.GetCardinality()
+ if termCardinality <= 0 {
+ return 0, nil
+ }
+
+ postingsOffset := uint64(w.Count())
+
+ buf, err := postings.ToBytes()
+ if err != nil {
+ return 0, err
+ }
+ // write out the length
+ n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(buf)))
+ _, err = w.Write(bufMaxVarintLen64[:n])
+ if err != nil {
+ return 0, err
+ }
+ // write out the roaring bytes
+ _, err = w.Write(buf)
+ if err != nil {
+ return 0, err
+ }
+
+ return postingsOffset, nil
+}
+
+// writeSynTermMap serializes and writes the synonym term map to the writer, by first
+// writing the length of the map followed by the map entries, where each entry
+// consists of the synonym ID, the length of the term, and the term itself.
+func writeSynTermMap(synTermMap map[uint32]string, w *CountHashWriter, bufMaxVarintLen64 []byte) error {
+ if len(synTermMap) == 0 {
+ return nil
+ }
+ n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(synTermMap)))
+ _, err := w.Write(bufMaxVarintLen64[:n])
+ if err != nil {
+ return err
+ }
+
+ for sid, term := range synTermMap {
+ n = binary.PutUvarint(bufMaxVarintLen64, uint64(sid))
+ _, err = w.Write(bufMaxVarintLen64[:n])
+ if err != nil {
+ return err
+ }
+
+ n = binary.PutUvarint(bufMaxVarintLen64, uint64(len(term)))
+ _, err = w.Write(bufMaxVarintLen64[:n])
+ if err != nil {
+ return err
+ }
+
+ _, err = w.Write([]byte(term))
+ if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+func mergeAndPersistSynonymSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
+ fieldsInv []string, newDocNumsIn [][]uint64, w *CountHashWriter,
+ closeCh chan struct{}) (map[int]int, map[uint16]int, error) {
+
+ var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
+
+ var synonyms *SynonymsList
+ var synItr *SynonymsIterator
+
+ thesaurusAddrs := make(map[int]int)
+
+ var vellumBuf bytes.Buffer
+ newVellum, err := vellum.New(&vellumBuf, nil)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ newRoaring := roaring64.NewBitmap()
+
+ newDocNums := make([][]uint64, 0, len(segments))
+
+ drops := make([]*roaring.Bitmap, 0, len(segments))
+
+ thesauri := make([]*Thesaurus, 0, len(segments))
+
+ itrs := make([]vellum.Iterator, 0, len(segments))
+
+ fieldIDtoThesaurusID := make(map[uint16]int)
+
+ var thesaurusID int
+ var newSynonymID uint32
+
+ // for each field
+ for fieldID, fieldName := range fieldsInv {
+ // collect FST iterators from all active segments for this field
+ newDocNums = newDocNums[:0]
+ drops = drops[:0]
+ thesauri = thesauri[:0]
+ itrs = itrs[:0]
+ newSynonymID
= 0 + synTermMap := make(map[uint32]string) + termSynMap := make(map[string]uint32) + + for segmentI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, nil, seg.ErrClosed + } + + thes, err2 := segment.thesaurus(fieldName) + if err2 != nil { + return nil, nil, err2 + } + if thes != nil && thes.fst != nil { + itr, err2 := thes.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, nil, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } + thesauri = append(thesauri, thes) + itrs = append(itrs, itr) + } + } + } + + // if no iterators, skip this field + if len(itrs) == 0 { + continue + } + + var prevTerm []byte + + newRoaring.Clear() + + finishTerm := func(term []byte) error { + postingsOffset, err := writeSynonyms(newRoaring, w, bufMaxVarintLen64) + if err != nil { + return err + } + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + newRoaring.Clear() + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if prevTerm != nil && !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, nil, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, nil, err + } + } + + synonyms, err = thesauri[itrI].synonymsListFromOffset( + postingsOffset, drops[itrI], synonyms) + if err != nil { + return nil, nil, err + } + synItr = synonyms.iterator(synItr) + + var next seg.Synonym + next, err = synItr.Next() + for next != nil && err == nil { + synNewDocNum := newDocNums[itrI][next.Number()] + if synNewDocNum == docDropped { + return nil, nil, fmt.Errorf("see hit with dropped docNum") + } + nextTerm := next.Term() + var synNewID uint32 + if synID, ok := termSynMap[nextTerm]; ok { + synNewID = synID + } else { + synNewID = newSynonymID + termSynMap[nextTerm] = newSynonymID + synTermMap[newSynonymID] = nextTerm + newSynonymID++ + } + synNewCode := encodeSynonym(synNewID, uint32(synNewDocNum)) + newRoaring.Add(synNewCode) + next, err = synItr.Next() + } + if err != nil { + return nil, nil, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) 
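+ // advance the enumerator to the next term in the merged, sorted order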
+ err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, nil, err + } + // close the enumerator to free the underlying iterators + err = enumerator.Close() + if err != nil { + return nil, nil, err + } + + if prevTerm != nil { + err = finishTerm(prevTerm) + if err != nil { + return nil, nil, err + } + } + + err = newVellum.Close() + if err != nil { + return nil, nil, err + } + vellumData := vellumBuf.Bytes() + + thesOffset := uint64(w.Count()) + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, nil, err + } + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, nil, err + } + + // write out the synTermMap for this thesaurus + err = writeSynTermMap(synTermMap, w, bufMaxVarintLen64) + if err != nil { + return nil, nil, err + } + + thesStart := w.Count() + + // the synonym index section does not have any doc value data + // so we write two special entries to indicate that + // the field is not uninverted and the thesaurus offset + n = binary.PutUvarint(bufMaxVarintLen64, fieldNotUninverted) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, fieldNotUninverted) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // write out the thesaurus offset from which the vellum data starts + n = binary.PutUvarint(bufMaxVarintLen64, thesOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // if we have a new thesaurus, add it to the thesaurus map + fieldIDtoThesaurusID[uint16(fieldID)] = thesaurusID + thesaurusAddrs[thesaurusID] = thesStart + thesaurusID++ + } + + return thesaurusAddrs, fieldIDtoThesaurusID, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/segment.go b/vendor/github.com/blevesearch/zapx/v16/segment.go index 8780ead19..19aebe3e9 100644 --- a/vendor/github.com/blevesearch/zapx/v16/segment.go +++ b/vendor/github.com/blevesearch/zapx/v16/segment.go @@ -24,7 +24,7 @@ import ( "sync/atomic" "unsafe" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" mmap "github.com/blevesearch/mmap-go" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -56,6 +56,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldFSTs: make(map[uint16]*vellum.FST), vecIndexCache: newVectorIndexCache(), + synIndexCache: newSynonymIndexCache(), fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)), }, f: f, @@ -88,6 +89,10 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { // SegmentBase is a memory only, read-only implementation of the // segment.Segment interface, using zap's data representation. type SegmentBase struct { + // atomic access to these variables, moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS. 
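+ // (On these platforms, sync/atomic only guarantees 64-bit alignment for
+ // the first word in an allocated struct.)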
+ bytesRead uint64 + bytesWritten uint64 + mem []byte memCRC uint32 chunkMode uint32 @@ -104,15 +109,12 @@ type SegmentBase struct { fieldDvNames []string // field names cached in fieldDvReaders size uint64 - // atomic access to these variables - bytesRead uint64 - bytesWritten uint64 - m sync.Mutex fieldFSTs map[uint16]*vellum.FST // this cache comes into play when vectors are supported in builds. vecIndexCache *vectorIndexCache + synIndexCache *synonymIndexCache } func (sb *SegmentBase) Size() int { @@ -149,7 +151,11 @@ func (sb *SegmentBase) updateSize() { func (sb *SegmentBase) AddRef() {} func (sb *SegmentBase) DecRef() (err error) { return nil } -func (sb *SegmentBase) Close() (err error) { sb.vecIndexCache.Clear(); return nil } +func (sb *SegmentBase) Close() (err error) { + sb.vecIndexCache.Clear() + sb.synIndexCache.Clear() + return nil +} // Segment implements a persisted segment.Segment interface, by // embedding an mmap()'ed SegmentBase. @@ -472,6 +478,48 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { return rv, nil } +// Thesaurus returns the thesaurus with the specified name, or an empty thesaurus if not found. +func (s *SegmentBase) Thesaurus(name string) (segment.Thesaurus, error) { + thesaurus, err := s.thesaurus(name) + if err == nil && thesaurus == nil { + return emptyThesaurus, nil + } + return thesaurus, err +} + +func (sb *SegmentBase) thesaurus(name string) (rv *Thesaurus, err error) { + fieldIDPlus1 := sb.fieldsMap[name] + if fieldIDPlus1 == 0 { + return nil, nil + } + pos := sb.fieldsSectionsMap[fieldIDPlus1-1][SectionSynonymIndex] + if pos > 0 { + rv = &Thesaurus{ + sb: sb, + name: name, + fieldID: fieldIDPlus1 - 1, + } + // skip the doc value offsets as doc values are not supported in thesaurus + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + } + thesLoc, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + fst, synTermMap, err := sb.synIndexCache.loadOrCreate(rv.fieldID, sb.mem[thesLoc:]) + if err != nil { + return nil, fmt.Errorf("thesaurus name %s err: %v", name, err) + } + rv.fst = fst + rv.synIDTermMap = synTermMap + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("thesaurus name %s vellum reader err: %v", name, err) + } + } + return rv, nil +} + // visitDocumentCtx holds data structures that are reusable across // multiple VisitDocument() calls to avoid memory allocations type visitDocumentCtx struct { @@ -648,8 +696,9 @@ func (s *Segment) Close() (err error) { } func (s *Segment) closeActual() (err error) { - // clear contents from the vector index cache before un-mmapping + // clear contents from the vector and synonym index cache before un-mmapping s.vecIndexCache.Clear() + s.synIndexCache.Clear() if s.mm != nil { err = s.mm.Unmap() @@ -719,6 +768,25 @@ func (s *Segment) DictAddr(field string) (uint64, error) { return s.dictLocs[fieldIDPlus1-1], nil } +// ThesaurusAddr is a helper function to compute the file offset where the +// thesaurus is stored with the specified name. 
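+// It skips the two doc-value offsets persisted ahead of the thesaurus offset,
+// since the synonym index section stores no doc values.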
+func (s *Segment) ThesaurusAddr(name string) (uint64, error) {
+ fieldIDPlus1, ok := s.fieldsMap[name]
+ if !ok {
+ return 0, fmt.Errorf("no such thesaurus '%s'", name)
+ }
+ thesaurusStart := s.fieldsSectionsMap[fieldIDPlus1-1][SectionSynonymIndex]
+ if thesaurusStart == 0 {
+ return 0, fmt.Errorf("no such thesaurus '%s'", name)
+ }
+ for i := 0; i < 2; i++ {
+ _, n := binary.Uvarint(s.mem[thesaurusStart : thesaurusStart+binary.MaxVarintLen64])
+ thesaurusStart += uint64(n)
+ }
+ thesLoc, _ := binary.Uvarint(s.mem[thesaurusStart : thesaurusStart+binary.MaxVarintLen64])
+ return thesLoc, nil
+}
+
 func (s *Segment) getSectionDvOffsets(fieldID int, secID uint16) (uint64, uint64, uint64, error) {
 // Version is gonna be 16
 var fieldLocStart uint64 = fieldNotUninverted
diff --git a/vendor/github.com/blevesearch/zapx/v16/synonym_cache.go b/vendor/github.com/blevesearch/zapx/v16/synonym_cache.go
new file mode 100644
index 000000000..0b8d56c29
--- /dev/null
+++ b/vendor/github.com/blevesearch/zapx/v16/synonym_cache.go
@@ -0,0 +1,126 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zap
+
+import (
+ "encoding/binary"
+ "fmt"
+ "sync"
+
+ "github.com/blevesearch/vellum"
+)
+
+func newSynonymIndexCache() *synonymIndexCache {
+ return &synonymIndexCache{
+ cache: make(map[uint16]*synonymCacheEntry),
+ }
+}
+
+type synonymIndexCache struct {
+ m sync.RWMutex
+
+ cache map[uint16]*synonymCacheEntry
+}
+
+// Clear clears the synonym cache, which means that the termID to term map would no longer be available.
+func (sc *synonymIndexCache) Clear() {
+ sc.m.Lock()
+ sc.cache = nil
+ sc.m.Unlock()
+}
+
+// loadOrCreate loads the synonym index cache for the specified fieldID if it is already present,
+// or creates it if not. The synonym index cache for a fieldID consists of a tuple:
+// - A Vellum FST (Finite State Transducer) representing the thesaurus.
+// - A map associating synonym IDs to their corresponding terms.
+// This function returns the loaded or newly created tuple (FST and map).
+func (sc *synonymIndexCache) loadOrCreate(fieldID uint16, mem []byte) (*vellum.FST, map[uint32][]byte, error) {
+ sc.m.RLock()
+ entry, ok := sc.cache[fieldID]
+ if ok {
+ sc.m.RUnlock()
+ return entry.load()
+ }
+
+ sc.m.RUnlock()
+
+ sc.m.Lock()
+ defer sc.m.Unlock()
+
+ entry, ok = sc.cache[fieldID]
+ if ok {
+ return entry.load()
+ }
+
+ return sc.createAndCacheLOCKED(fieldID, mem)
+}
+
+// createAndCacheLOCKED creates the synonym index cache for the specified fieldID and caches it.
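+// The serialized layout parsed here is: vellum (FST) length, vellum data,
+// number of synonym entries, then (synonymID, term length, term) triples.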
+func (sc *synonymIndexCache) createAndCacheLOCKED(fieldID uint16, mem []byte) (*vellum.FST, map[uint32][]byte, error) { + var pos uint64 + vellumLen, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if vellumLen == 0 || read <= 0 { + return nil, nil, fmt.Errorf("vellum length is 0") + } + pos += uint64(read) + fstBytes := mem[pos : pos+vellumLen] + fst, err := vellum.Load(fstBytes) + if err != nil { + return nil, nil, fmt.Errorf("vellum err: %v", err) + } + pos += vellumLen + numSyns, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + if numSyns == 0 { + return nil, nil, fmt.Errorf("no synonyms found") + } + synTermMap := make(map[uint32][]byte, numSyns) + for i := 0; i < int(numSyns); i++ { + synID, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + termLen, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + if termLen == 0 { + return nil, nil, fmt.Errorf("term length is 0") + } + term := mem[pos : pos+uint64(termLen)] + pos += uint64(termLen) + synTermMap[uint32(synID)] = term + } + sc.insertLOCKED(fieldID, fst, synTermMap) + return fst, synTermMap, nil +} + +// insertLOCKED inserts the vellum FST and the map of synonymID to term into the cache for the specified fieldID. +func (sc *synonymIndexCache) insertLOCKED(fieldID uint16, fst *vellum.FST, synTermMap map[uint32][]byte) { + _, ok := sc.cache[fieldID] + if !ok { + sc.cache[fieldID] = &synonymCacheEntry{ + fst: fst, + synTermMap: synTermMap, + } + } +} + +// synonymCacheEntry is a tuple of the vellum FST and the map of synonymID to term, +// and is the value stored in the synonym cache, for a given fieldID. +type synonymCacheEntry struct { + fst *vellum.FST + synTermMap map[uint32][]byte +} + +func (ce *synonymCacheEntry) load() (*vellum.FST, map[uint32][]byte, error) { + return ce.fst, ce.synTermMap, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/synonym_posting.go b/vendor/github.com/blevesearch/zapx/v16/synonym_posting.go new file mode 100644 index 000000000..c411a69b4 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/synonym_posting.go @@ -0,0 +1,239 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/roaring64" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizeSynonymsList int +var reflectStaticSizeSynonymsIterator int +var reflectStaticSizeSynonym int + +func init() { + var sl SynonymsList + reflectStaticSizeSynonymsList = int(reflect.TypeOf(sl).Size()) + var si SynonymsIterator + reflectStaticSizeSynonymsIterator = int(reflect.TypeOf(si).Size()) + var s Synonym + reflectStaticSizeSynonym = int(reflect.TypeOf(s).Size()) +} + +// SynonymsList represents a list of synonyms for a term, stored in a Roaring64 bitmap. 
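+// Each 64-bit entry packs the synonym's term ID into the upper 32 bits and the
+// document number into the lower 32 bits (see decodeSynonym).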
+type SynonymsList struct { + sb *SegmentBase + synonymsOffset uint64 + synonyms *roaring64.Bitmap + except *roaring.Bitmap + + synIDTermMap map[uint32][]byte + + buffer *bytes.Reader +} + +// immutable, empty synonyms list +var emptySynonymsList = &SynonymsList{} + +func (p *SynonymsList) Size() int { + sizeInBytes := reflectStaticSizeSynonymsList + SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +// Iterator creates and returns a SynonymsIterator for the SynonymsList. +// If the synonyms bitmap is nil, it returns an empty iterator. +func (s *SynonymsList) Iterator(prealloc segment.SynonymsIterator) segment.SynonymsIterator { + if s.synonyms == nil { + return emptySynonymsIterator + } + + var preallocSI *SynonymsIterator + pi, ok := prealloc.(*SynonymsIterator) + if ok && pi != nil { + preallocSI = pi + } + if preallocSI == emptySynonymsIterator { + preallocSI = nil + } + + return s.iterator(preallocSI) +} + +// iterator initializes a SynonymsIterator for the SynonymsList and returns it. +// If a preallocated iterator is provided, it resets and reuses it; otherwise, it creates a new one. +func (s *SynonymsList) iterator(rv *SynonymsIterator) *SynonymsIterator { + if rv == nil { + rv = &SynonymsIterator{} + } else { + *rv = SynonymsIterator{} // clear the struct + } + rv.synonyms = s + rv.except = s.except + rv.Actual = s.synonyms.Iterator() + rv.ActualBM = s.synonyms + rv.synIDTermMap = s.synIDTermMap + return rv +} + +// read initializes a SynonymsList by reading data from the given synonymsOffset in the Thesaurus. +// It reads and parses the Roaring64 bitmap that represents the synonyms. +func (rv *SynonymsList) read(synonymsOffset uint64, t *Thesaurus) error { + rv.synonymsOffset = synonymsOffset + + var n uint64 + var read int + + var synonymsLen uint64 + synonymsLen, read = binary.Uvarint(t.sb.mem[synonymsOffset+n : synonymsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := t.sb.mem[synonymsOffset+n : synonymsOffset+n+synonymsLen] + + if rv.synonyms == nil { + rv.synonyms = roaring64.NewBitmap() + } + + rv.buffer.Reset(roaringBytes) + + _, err := rv.synonyms.ReadFrom(rv.buffer) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + return nil +} + +// ----------------------------------------------------------------------------- + +// SynonymsIterator provides a way to iterate through the synonyms list. +type SynonymsIterator struct { + synonyms *SynonymsList + except *roaring.Bitmap + + Actual roaring64.IntPeekable64 + ActualBM *roaring64.Bitmap + + synIDTermMap map[uint32][]byte + nextSyn Synonym +} + +// immutable, empty synonyms iterator +var emptySynonymsIterator = &SynonymsIterator{} + +func (i *SynonymsIterator) Size() int { + sizeInBytes := reflectStaticSizeSynonymsIterator + SizeOfPtr + + i.nextSyn.Size() + + return sizeInBytes +} + +// Next returns the next Synonym in the iteration or an error if the end is reached. +func (i *SynonymsIterator) Next() (segment.Synonym, error) { + return i.next() +} + +// next retrieves the next synonym from the iterator, populates the nextSyn field, +// and returns it. If no valid synonym is found, it returns an error. 
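+// The returned Synonym aliases the iterator's nextSyn field, so it remains
+// valid only until the following call to Next.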
+func (i *SynonymsIterator) next() (segment.Synonym, error) { + synID, docNum, exists, err := i.nextSynonym() + if err != nil || !exists { + return nil, err + } + + if i.synIDTermMap == nil { + return nil, fmt.Errorf("synIDTermMap is nil") + } + + // If the synonymID is not found in the map, return an error + term, exists := i.synIDTermMap[synID] + if !exists { + return nil, fmt.Errorf("synonymID %d not found in map", synID) + } + + i.nextSyn = Synonym{} // clear the struct + rv := &i.nextSyn + rv.term = string(term) + rv.docNum = docNum + + return rv, nil +} + +// nextSynonym decodes the next synonym from the roaring bitmap iterator, +// ensuring it is not in the "except" set. Returns the synonymID, docNum, +// and a flag indicating success. +func (i *SynonymsIterator) nextSynonym() (uint32, uint32, bool, error) { + // If no synonyms are available, return early + if i.Actual == nil || i.synonyms == nil || i.synonyms == emptySynonymsList { + return 0, 0, false, nil + } + + var code uint64 + var docNum uint32 + var synID uint32 + + // Loop to find the next valid docNum, checking against the except + for i.Actual.HasNext() { + code = i.Actual.Next() + synID, docNum = decodeSynonym(code) + + // If docNum is not in the 'except' set, it's a valid result + if i.except == nil || !i.except.Contains(docNum) { + return synID, docNum, true, nil + } + } + + // If no valid docNum is found, return false + return 0, 0, false, nil +} + +// Synonym represents a single synonym, containing the term, synonymID, and document number. +type Synonym struct { + term string + docNum uint32 +} + +// Size returns the memory size of the Synonym, including the length of the term string. +func (p *Synonym) Size() int { + sizeInBytes := reflectStaticSizeSynonym + SizeOfPtr + + len(p.term) + + return sizeInBytes +} + +// Term returns the term of the Synonym. +func (s *Synonym) Term() string { + return s.term +} + +// Number returns the document number of the Synonym. +func (s *Synonym) Number() uint32 { + return s.docNum +} + +// decodeSynonym decodes a synonymCode into its synonymID and document ID components. +func decodeSynonym(synonymCode uint64) (synonymID uint32, docID uint32) { + return uint32(synonymCode >> 32), uint32(synonymCode) +} diff --git a/vendor/github.com/blevesearch/zapx/v16/thesaurus.go b/vendor/github.com/blevesearch/zapx/v16/thesaurus.go new file mode 100644 index 000000000..34a43629f --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v16/thesaurus.go @@ -0,0 +1,159 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package zap
+
+import (
+ "bytes"
+ "fmt"
+
+ "github.com/RoaringBitmap/roaring/v2"
+ index "github.com/blevesearch/bleve_index_api"
+ segment "github.com/blevesearch/scorch_segment_api/v2"
+ "github.com/blevesearch/vellum"
+)
+
+// Thesaurus is the zap representation of a Thesaurus
+type Thesaurus struct {
+ sb *SegmentBase
+ name string
+ fieldID uint16
+ synIDTermMap map[uint32][]byte
+ fst *vellum.FST
+
+ fstReader *vellum.Reader
+}
+
+// represents an immutable, empty Thesaurus
+var emptyThesaurus = &Thesaurus{}
+
+// SynonymsList returns the synonyms list for the specified term
+func (t *Thesaurus) SynonymsList(term []byte, except *roaring.Bitmap, prealloc segment.SynonymsList) (segment.SynonymsList, error) {
+ var preallocSL *SynonymsList
+ sl, ok := prealloc.(*SynonymsList)
+ if ok && sl != nil {
+ preallocSL = sl
+ }
+ return t.synonymsList(term, except, preallocSL)
+}
+
+func (t *Thesaurus) synonymsList(term []byte, except *roaring.Bitmap, rv *SynonymsList) (*SynonymsList, error) {
+ if t.fstReader == nil {
+ if rv == nil || rv == emptySynonymsList {
+ return emptySynonymsList, nil
+ }
+ return t.synonymsListInit(rv, except), nil
+ }
+
+ synonymsOffset, exists, err := t.fstReader.Get(term)
+
+ if err != nil {
+ return nil, fmt.Errorf("vellum err: %v", err)
+ }
+ if !exists {
+ if rv == nil || rv == emptySynonymsList {
+ return emptySynonymsList, nil
+ }
+ return t.synonymsListInit(rv, except), nil
+ }
+
+ return t.synonymsListFromOffset(synonymsOffset, except, rv)
+}
+
+func (t *Thesaurus) synonymsListFromOffset(synonymsOffset uint64, except *roaring.Bitmap, rv *SynonymsList) (*SynonymsList, error) {
+ rv = t.synonymsListInit(rv, except)
+
+ err := rv.read(synonymsOffset, t)
+ if err != nil {
+ return nil, err
+ }
+
+ return rv, nil
+}
+
+func (t *Thesaurus) synonymsListInit(rv *SynonymsList, except *roaring.Bitmap) *SynonymsList {
+ if rv == nil || rv == emptySynonymsList {
+ rv = &SynonymsList{}
+ rv.buffer = bytes.NewReader(nil)
+ } else {
+ synonyms := rv.synonyms
+ buf := rv.buffer
+ if synonyms != nil {
+ synonyms.Clear()
+ }
+ if buf != nil {
+ buf.Reset(nil)
+ }
+
+ *rv = SynonymsList{} // clear the struct
+
+ rv.synonyms = synonyms
+ rv.buffer = buf
+ }
+ rv.sb = t.sb
+ rv.except = except
+ rv.synIDTermMap = t.synIDTermMap
+ return rv
+}
+
+func (t *Thesaurus) Contains(key []byte) (bool, error) {
+ if t.fst != nil {
+ return t.fst.Contains(key)
+ }
+ return false, nil
+}
+
+// AutomatonIterator returns an iterator which only visits terms
+// matching the vellum automaton within the start/end key range
+func (t *Thesaurus) AutomatonIterator(a segment.Automaton,
+ startKeyInclusive, endKeyExclusive []byte) segment.ThesaurusIterator {
+ if t.fst != nil {
+ rv := &ThesaurusIterator{
+ t: t,
+ }
+
+ itr, err := t.fst.Search(a, startKeyInclusive, endKeyExclusive)
+ if err == nil {
+ rv.itr = itr
+ } else if err != vellum.ErrIteratorDone {
+ rv.err = err
+ }
+
+ return rv
+ }
+ return emptyThesaurusIterator
+}
+
+var emptyThesaurusIterator = &ThesaurusIterator{}
+
+// ThesaurusIterator is an iterator over the entries of a thesaurus
+type ThesaurusIterator struct {
+ t *Thesaurus
+ itr vellum.Iterator
+ err error
+ entry index.ThesaurusEntry
+}
+
+// Next returns the next entry in the thesaurus
+func (i *ThesaurusIterator) Next() (*index.ThesaurusEntry, error) {
+ if i.err != nil && i.err != vellum.ErrIteratorDone {
+ return nil, i.err
+ } else if i.itr == nil || i.err == vellum.ErrIteratorDone {
+ return nil, nil
+ }
+ term, _ := i.itr.Current()
+ i.entry.Term = string(term)
+ i.err =
i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v16/write.go b/vendor/github.com/blevesearch/zapx/v16/write.go index 7b2c99e1b..4e7f55237 100644 --- a/vendor/github.com/blevesearch/zapx/v16/write.go +++ b/vendor/github.com/blevesearch/zapx/v16/write.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "io" - "github.com/RoaringBitmap/roaring" + "github.com/RoaringBitmap/roaring/v2" ) // writes out the length of the roaring bitmap in bytes as varint diff --git a/vendor/github.com/blevesearch/zapx/v16/zap.md b/vendor/github.com/blevesearch/zapx/v16/zap.md index 675ac56c0..8bafac62e 100644 --- a/vendor/github.com/blevesearch/zapx/v16/zap.md +++ b/vendor/github.com/blevesearch/zapx/v16/zap.md @@ -28,9 +28,9 @@ ### Chunked data - [--------] - [ ] - [--------] + [--------] + [ ] + [--------] ## Overview @@ -65,7 +65,6 @@ Footer section describes the configuration of particular ZAP file. The format of V. Version. CC. CRC32. - ## Stored Fields Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. @@ -114,10 +113,9 @@ Sections Index is a set of NF uint64 addresses (0 through F# - 1) each of which NS. Number of index sections Sn. nth index section - ## Inverted Text Index Section -Each fields has its own types of indexes in separate sections as indicated above. This can be a vector index or inverted text index. +Each field has its own types of indexes in separate sections as indicated above. This can be a vector index or inverted text index. In case of inverted text index, the dictionary is encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. @@ -151,6 +149,8 @@ In case of inverted text index, the dictionary is encoded in [Vellum](https://gi | | | | |================================================================+- Vector Index Section | | | + | +================================================================+- Synonym Index Section + | | | | |================================================================+- Sections Info +-----------------------------+ | | | | @@ -162,22 +162,76 @@ In case of inverted text index, the dictionary is encoded in [Vellum](https://gi ITI - Inverted Text Index +## Synonym Index Section + +In a synonyms index, the relationship between a term and its synonyms is represented using a Thesaurus. The Thesaurus is encoded in the [Vellum](https://github.com/couchbase/vellum) format and consists of pairs in the form `(term, offset)`. Here, the offset specifies the position of the postings list containing the synonyms for the given term. The postings list is stored as a Roaring64 bitmap, with each entry representing an encoded synonym for the term. 
+ + |================================================================+- Inverted Text Index Section + | | + |================================================================+- Vector Index Section + | | + +================================================================+- Synonym Index Section + | | + | (Offset) +~~~~~+----------+...+---+ | + | +--------->| RL | ROARING64 BITMAP | | + | | +~~~~~+----------+...+---+ +-------------------+ + | |(Term -> Offset) | + | +--------+ | + | | Term ID to Term map (NST Entries) | + | +~~~~+~~~~+~~~~~[{~~~~~+~~~~+~~~~~~}{~~~~~+~~~~+~~~~~~}...{~~~~~+~~~~+~~~~~~}] | + | +->| VL | VD | NST || TID | TL | Term || TID | TL | Term | | TID | TL | Term | | + | | +~~~~+~~~~+~~~~~[{~~~~~+~~~~+~~~~~~}{~~~~~+~~~~+~~~~~~}...{~~~~~+~~~~+~~~~~~}] | + | | | + | +----------------------------+ | + | | | + | +~~~~~~~~~~+~~~~~~~~+~~~~~~~~~~~~~~~~~+ | + +-----> DV Start | DV End | ThesaurusOffset | | + | | +~~~~~~~~~~+~~~~~~~~+~~~~~~~~~~~~~~~~~+ +-------------------+ + | | | + | | | + | |================================================================+- Sections Info + +-----------------------------+ | + | | | + | +-------+-----+-----+------+~~~~~~~~+~~~~~~~~+--+...+--+ | + | | ... | SI | SI ADDR | NS | Length | Name | | + | +-------+-----+------------+~~~~~~~~+~~~~~~~~+--+...+--+ | + +================================================================+ + + SI - Synonym Index + VL - Vellum Length + VD - Vellum Data (Term -> Offset) + RL - Roaring64 Length + NST - Number of entries in the term ID to term map + TID - Term ID (32-bit) + TL - Term Length + +### Synonym Encoding + + ROARING64 BITMAP + + Each 64-bit entry consists of two parts: the first 32 bits represent the Term ID (TID), + and the next 32 bits represent the Document Number (DN). + + [{~~~~~+~~~~}{~~~~~+~~~~}...{~~~~~+~~~~}] + | TID | DN || TID | DN | | TID | DN | + [{~~~~~+~~~~}{~~~~~+~~~~}...{~~~~~+~~~~}] + + TID - Term ID (32-bit) + DN - Document Number (32-bit) ## Doc Values DocValue start and end offsets are stored within the section content of each field. This allows each field having its own type of index to choose whether to store the doc values or not. For example, it may not make sense to store doc values for vector indexing and so, the offsets can be invalid ones for it whereas the fields having text indexing may have valid doc values offsets. - - +================================================================+ - | +------...--+ | - | +->+ DocValues +<-+ | - | | +------...--+ | | - |==|=================|===========================================+- Inverted Text - ++~+~~~~~~~~~+~~~~~~~+~~+~~~~~~~~+-----------------------...--+ | Index Section - || DV START | DV END | LENGTH | VELLUM DATA: TERM -> OFFSET| | - ++~~~~~~~~~~~+~~~~~~~~~~+~~~~~~~~+-----------------------...--+ | - +================================================================+ - + +================================================================+ + | +------...--+ | + | +->+ DocValues +<-+ | + | | +------...--+ | | + |==|=================|===========================================+- Inverted Text + ++~+~~~~~~~~~+~~~~~~~+~~+~~~~~~~~+-----------------------...--+ | Index Section + || DV START | DV END | LENGTH | VELLUM DATA: TERM -> OFFSET| | + ++~~~~~~~~~~~+~~~~~~~~~~+~~~~~~~~+-----------------------...--+ | + +================================================================+ DocValues is chunked Snappy-compressed values for each document and field. 
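To make the synonym encoding described in zap.md concrete, the following is a minimal, self-contained Go sketch of the 64-bit packing. decodeSynonym mirrors the helper of the same name in the zapx synonym posting code earlier in this patch; encodeSynonym is a hypothetical inverse written only for illustration and does not appear in zapx.

    package main

    import "fmt"

    // encodeSynonym packs a term ID and a document number into the 64-bit
    // entry layout documented above: TID in the high 32 bits, DN in the
    // low 32 bits. NOTE: hypothetical helper for illustration only; zapx
    // ships just the decode direction.
    func encodeSynonym(termID, docNum uint32) uint64 {
    	return uint64(termID)<<32 | uint64(docNum)
    }

    // decodeSynonym reverses the packing, mirroring zapx's decodeSynonym.
    func decodeSynonym(code uint64) (termID, docNum uint32) {
    	return uint32(code >> 32), uint32(code)
    }

    func main() {
    	code := encodeSynonym(7, 42) // term ID 7, document number 42
    	tid, dn := decodeSynonym(code)
    	fmt.Printf("code=%#016x termID=%d docNum=%d\n", code, tid, dn)
    }

One consequence of placing the Term ID in the high 32 bits is that the natural sort order of the Roaring64 bitmap groups entries first by term ID and then by document number.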
diff --git a/vendor/modules.txt b/vendor/modules.txt index 9119683ad..b34a99a77 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -68,11 +68,11 @@ github.com/ProtonMail/go-crypto/openpgp/packet github.com/ProtonMail/go-crypto/openpgp/s2k github.com/ProtonMail/go-crypto/openpgp/x25519 github.com/ProtonMail/go-crypto/openpgp/x448 -# github.com/RoaringBitmap/roaring v1.9.3 -## explicit; go 1.14 -github.com/RoaringBitmap/roaring -github.com/RoaringBitmap/roaring/internal -github.com/RoaringBitmap/roaring/roaring64 +# github.com/RoaringBitmap/roaring/v2 v2.4.5 +## explicit; go 1.15 +github.com/RoaringBitmap/roaring/v2 +github.com/RoaringBitmap/roaring/v2/internal +github.com/RoaringBitmap/roaring/v2/roaring64 # github.com/agnivade/levenshtein v1.2.1 ## explicit; go 1.21 github.com/agnivade/levenshtein @@ -158,11 +158,11 @@ github.com/beorn7/perks/quantile # github.com/bitly/go-simplejson v0.5.0 ## explicit github.com/bitly/go-simplejson -# github.com/bits-and-blooms/bitset v1.12.0 +# github.com/bits-and-blooms/bitset v1.22.0 ## explicit; go 1.16 github.com/bits-and-blooms/bitset -# github.com/blevesearch/bleve/v2 v2.4.4 -## explicit; go 1.21 +# github.com/blevesearch/bleve/v2 v2.5.0 +## explicit; go 1.23 github.com/blevesearch/bleve/v2 github.com/blevesearch/bleve/v2/analysis github.com/blevesearch/bleve/v2/analysis/analyzer/custom @@ -203,14 +203,14 @@ github.com/blevesearch/bleve/v2/search/scorer github.com/blevesearch/bleve/v2/search/searcher github.com/blevesearch/bleve/v2/size github.com/blevesearch/bleve/v2/util -# github.com/blevesearch/bleve_index_api v1.1.12 -## explicit; go 1.20 +# github.com/blevesearch/bleve_index_api v1.2.7 +## explicit; go 1.21 github.com/blevesearch/bleve_index_api # github.com/blevesearch/geo v0.1.20 ## explicit; go 1.18 github.com/blevesearch/geo/geojson github.com/blevesearch/geo/s2 -# github.com/blevesearch/go-faiss v1.0.24 +# github.com/blevesearch/go-faiss v1.0.25 ## explicit; go 1.21 github.com/blevesearch/go-faiss # github.com/blevesearch/go-porterstemmer v1.0.3 @@ -222,7 +222,7 @@ github.com/blevesearch/gtreap # github.com/blevesearch/mmap-go v1.0.4 ## explicit; go 1.13 github.com/blevesearch/mmap-go -# github.com/blevesearch/scorch_segment_api/v2 v2.2.16 +# github.com/blevesearch/scorch_segment_api/v2 v2.3.9 ## explicit; go 1.21 github.com/blevesearch/scorch_segment_api/v2 # github.com/blevesearch/segment v0.9.1 @@ -235,28 +235,28 @@ github.com/blevesearch/snowballstem/english # github.com/blevesearch/upsidedown_store_api v1.0.2 ## explicit; go 1.18 github.com/blevesearch/upsidedown_store_api -# github.com/blevesearch/vellum v1.0.10 -## explicit; go 1.18 +# github.com/blevesearch/vellum v1.1.0 +## explicit; go 1.21 github.com/blevesearch/vellum github.com/blevesearch/vellum/levenshtein github.com/blevesearch/vellum/regexp github.com/blevesearch/vellum/utf8 -# github.com/blevesearch/zapx/v11 v11.3.10 -## explicit; go 1.19 +# github.com/blevesearch/zapx/v11 v11.4.1 +## explicit; go 1.21 github.com/blevesearch/zapx/v11 -# github.com/blevesearch/zapx/v12 v12.3.10 -## explicit; go 1.19 +# github.com/blevesearch/zapx/v12 v12.4.1 +## explicit; go 1.21 github.com/blevesearch/zapx/v12 -# github.com/blevesearch/zapx/v13 v13.3.10 -## explicit; go 1.19 +# github.com/blevesearch/zapx/v13 v13.4.1 +## explicit; go 1.21 github.com/blevesearch/zapx/v13 -# github.com/blevesearch/zapx/v14 v14.3.10 -## explicit; go 1.19 +# github.com/blevesearch/zapx/v14 v14.4.1 +## explicit; go 1.21 github.com/blevesearch/zapx/v14 -# github.com/blevesearch/zapx/v15 v15.3.16 -## 
explicit; go 1.19 +# github.com/blevesearch/zapx/v15 v15.4.1 +## explicit; go 1.21 github.com/blevesearch/zapx/v15 -# github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b +# github.com/blevesearch/zapx/v16 v16.2.2 ## explicit; go 1.21 github.com/blevesearch/zapx/v16 # github.com/bluele/gcache v0.0.2