From 4ae0951f5fb89e0888b417851fad7a312eabcb9c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:18:08 +0000 Subject: [PATCH] build(deps): bump github.com/blevesearch/bleve/v2 from 2.5.4 to 2.5.5 Bumps [github.com/blevesearch/bleve/v2](https://github.com/blevesearch/bleve) from 2.5.4 to 2.5.5. - [Release notes](https://github.com/blevesearch/bleve/releases) - [Commits](https://github.com/blevesearch/bleve/compare/v2.5.4...v2.5.5) --- updated-dependencies: - dependency-name: github.com/blevesearch/bleve/v2 dependency-version: 2.5.5 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- go.mod | 10 +- go.sum | 20 +-- .../blevesearch/bleve/v2/fusion/util.go | 4 +- .../github.com/blevesearch/bleve/v2/index.go | 8 + .../bleve/v2/index/scorch/snapshot_index.go | 59 +++++++ .../v2/index/scorch/snapshot_index_vr.go | 50 ++++++ .../blevesearch/bleve/v2/index_alias_impl.go | 158 +++++++++++++++++ .../blevesearch/bleve/v2/index_impl.go | 165 +++++++++++++----- .../github.com/blevesearch/bleve/v2/search.go | 1 - .../bleve/v2/search/query/boolean.go | 39 +++-- .../bleve/v2/search/query/query.go | 10 +- .../blevesearch/bleve_index_api/index.go | 26 +++ .../github.com/blevesearch/go-faiss/index.go | 71 ++++++++ .../scorch_segment_api/v2/segment_vector.go | 3 + .../zapx/v16/faiss_vector_posting.go | 27 +++ vendor/modules.txt | 10 +- 16 files changed, 582 insertions(+), 79 deletions(-) diff --git a/go.mod b/go.mod index 5ac978244..c43076364 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/Nerzal/gocloak/v13 v13.9.0 github.com/bbalet/stopwords v1.0.0 github.com/beevik/etree v1.6.0 - github.com/blevesearch/bleve/v2 v2.5.4 + github.com/blevesearch/bleve/v2 v2.5.5 github.com/cenkalti/backoff v2.2.1+incompatible github.com/coreos/go-oidc/v3 v3.16.0 github.com/cs3org/go-cs3apis v0.0.0-20250908152307-4ca807afe54e @@ -140,13 +140,13 
@@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/bitly/go-simplejson v0.5.0 // indirect github.com/bits-and-blooms/bitset v1.22.0 // indirect - github.com/blevesearch/bleve_index_api v1.2.10 // indirect + github.com/blevesearch/bleve_index_api v1.2.11 // indirect github.com/blevesearch/geo v0.2.4 // indirect - github.com/blevesearch/go-faiss v1.0.25 // indirect + github.com/blevesearch/go-faiss v1.0.26 // indirect github.com/blevesearch/go-porterstemmer v1.0.3 // indirect github.com/blevesearch/gtreap v0.1.1 // indirect github.com/blevesearch/mmap-go v1.0.4 // indirect - github.com/blevesearch/scorch_segment_api/v2 v2.3.12 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.3.13 // indirect github.com/blevesearch/segment v0.9.1 // indirect github.com/blevesearch/snowballstem v0.9.0 // indirect github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect @@ -156,7 +156,7 @@ require ( github.com/blevesearch/zapx/v13 v13.4.2 // indirect github.com/blevesearch/zapx/v14 v14.4.2 // indirect github.com/blevesearch/zapx/v15 v15.4.2 // indirect - github.com/blevesearch/zapx/v16 v16.2.6 // indirect + github.com/blevesearch/zapx/v16 v16.2.7 // indirect github.com/bluele/gcache v0.0.2 // indirect github.com/bombsimon/logrusr/v3 v3.1.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect diff --git a/go.sum b/go.sum index 2d1b5d8cb..874192145 100644 --- a/go.sum +++ b/go.sum @@ -151,22 +151,22 @@ github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6 github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blevesearch/bleve/v2 v2.5.4 h1:1iur8e+PHsxtncV2xIVuqlQme/V8guEDO2uV6Wll3lQ= -github.com/blevesearch/bleve/v2 v2.5.4/go.mod 
h1:yB4PnV4N2q5rTEpB2ndG8N2ISexBQEFIYgwx4ztfvoo= -github.com/blevesearch/bleve_index_api v1.2.10 h1:FMFmZCmTX6PdoLLvwUnKF2RsmILFFwO3h0WPevXY9fE= -github.com/blevesearch/bleve_index_api v1.2.10/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= +github.com/blevesearch/bleve/v2 v2.5.5 h1:lzC89QUCco+y1qBnJxGqm4AbtsdsnlUvq0kXok8n3C8= +github.com/blevesearch/bleve/v2 v2.5.5/go.mod h1:t5WoESS5TDteTdnjhhvpA1BpLYErOBX2IQViTMLK7wo= +github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= +github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= -github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U= -github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= +github.com/blevesearch/go-faiss v1.0.26/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.12 h1:GGZc2qwbyRBwtckPPkHkLyXw64mmsLJxdturBI1cM+c= -github.com/blevesearch/scorch_segment_api/v2 v2.3.12/go.mod h1:JBRGAneqgLSI2+jCNjtwMqp2B7EBF3/VUzgDPIU33MM= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13 
h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= @@ -185,8 +185,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.2.6 h1:OHuUl2GhM+FpBq9RwNsJ4k/QodqbMMHoQEgn/IHYpu8= -github.com/blevesearch/zapx/v16 v16.2.6/go.mod h1:cuAPB+YoIyRngNhno1S1GPr9SfMk+x/SgAHBLXSIq3k= +github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0= +github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw= github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0= github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY= diff --git a/vendor/github.com/blevesearch/bleve/v2/fusion/util.go b/vendor/github.com/blevesearch/bleve/v2/fusion/util.go index 8563d9642..6dfa80a45 100644 --- a/vendor/github.com/blevesearch/bleve/v2/fusion/util.go +++ b/vendor/github.com/blevesearch/bleve/v2/fusion/util.go @@ -82,8 +82,8 @@ func scoreSortFunc() func(i, j *search.DocumentMatch) int { func getFusionExplAt(hit *search.DocumentMatch, i int, value float64, message string) *search.Explanation { return &search.Explanation{ - Value: value, - Message: message, + Value: value, + 
Message: message, Children: []*search.Explanation{hit.Expl.Children[i]}, } } diff --git a/vendor/github.com/blevesearch/bleve/v2/index.go b/vendor/github.com/blevesearch/bleve/v2/index.go index a9c8ada34..2f1ba5fbf 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index.go @@ -388,3 +388,11 @@ type SynonymIndex interface { // IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection. IndexSynonym(id string, collection string, definition *SynonymDefinition) error } + +type InsightsIndex interface { + Index + // TermFrequencies returns the tokens ordered by frequencies for the field index. + TermFrequencies(field string, limit int, descending bool) ([]index.TermFreq, error) + // CentroidCardinalities returns the centroids (clusters) from IVF indexes ordered by data density. + CentroidCardinalities(field string, limit int, desceding bool) ([]index.CentroidCardinality, error) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go index c09a7db40..981640710 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go @@ -23,6 +23,7 @@ import ( "path/filepath" "reflect" "sort" + "strings" "sync" "sync/atomic" @@ -1234,3 +1235,61 @@ func (is *IndexSnapshot) MergeUpdateFieldsInfo(updatedFields map[string]*index.U } } } + +// TermFrequencies returns the top N terms ordered by the frequencies +// for a given field across all segments in the index snapshot. 
+func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending bool) ( + termFreqs []index.TermFreq, err error) { + if len(is.segment) == 0 { + return nil, nil + } + + if limit <= 0 { + return nil, fmt.Errorf("limit must be positive") + } + + // Use FieldDict which aggregates term frequencies across all segments + fieldDict, err := is.FieldDict(field) + if err != nil { + return nil, fmt.Errorf("failed to get field dictionary for field %s: %v", field, err) + } + defer fieldDict.Close() + + // Preallocate slice with capacity equal to the number of unique terms + // in the field dictionary + termFreqs = make([]index.TermFreq, 0, fieldDict.Cardinality()) + + // Iterate through all terms using FieldDict + for { + dictEntry, err := fieldDict.Next() + if err != nil { + return nil, fmt.Errorf("error iterating field dictionary: %v", err) + } + if dictEntry == nil { + break // End of terms + } + + termFreqs = append(termFreqs, index.TermFreq{ + Term: dictEntry.Term, + Frequency: dictEntry.Count, + }) + } + + // Sort by frequency (descending or ascending) + sort.Slice(termFreqs, func(i, j int) bool { + if termFreqs[i].Frequency == termFreqs[j].Frequency { + // If frequencies are equal, sort by term lexicographically + return strings.Compare(termFreqs[i].Term, termFreqs[j].Term) < 0 + } + if descending { + return termFreqs[i].Frequency > termFreqs[j].Frequency + } + return termFreqs[i].Frequency < termFreqs[j].Frequency + }) + + if limit >= len(termFreqs) { + return termFreqs, nil + } + + return termFreqs[:limit], nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go index 3f2a43a12..bd57ad3e0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go @@ -23,6 +23,7 @@ import ( "encoding/json" "fmt" "reflect" + "sort" 
"github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" @@ -167,3 +168,52 @@ func (i *IndexSnapshotVectorReader) Close() error { // TODO Consider if any scope of recycling here. return nil } + +func (i *IndexSnapshot) CentroidCardinalities(field string, limit int, descending bool) ( + []index.CentroidCardinality, error) { + if len(i.segment) == 0 { + return nil, nil + } + + if limit <= 0 { + return nil, fmt.Errorf("limit must be positive") + } + + centroids := make([]index.CentroidCardinality, 0, limit*len(i.segment)) + + for _, segment := range i.segment { + if sv, ok := segment.segment.(segment_api.VectorSegment); ok { + vecIndex, err := sv.InterpretVectorIndex(field, + false /* does not require filtering */, segment.deleted) + if err != nil { + return nil, fmt.Errorf("failed to interpret vector index for field %s in segment: %v", field, err) + } + + centroidCardinalities, err := vecIndex.ObtainKCentroidCardinalitiesFromIVFIndex(limit, descending) + if err != nil { + return nil, fmt.Errorf("failed to obtain top k centroid cardinalities for field %s in segment: %v", field, err) + } + + if len(centroidCardinalities) > 0 { + centroids = append(centroids, centroidCardinalities...) 
+ } + } + } + + if len(centroids) == 0 { + return nil, nil + } + + sort.Slice(centroids, func(i, j int) bool { + if descending { + return centroids[i].Cardinality > centroids[j].Cardinality + } + return centroids[i].Cardinality < centroids[j].Cardinality + }) + + if limit >= len(centroids) { + return centroids, nil + } + + return centroids[:limit], nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go index 896b6e5ae..eeb63ca9d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go @@ -17,6 +17,8 @@ package bleve import ( "context" "fmt" + "sort" + "strings" "sync" "time" @@ -1136,3 +1138,159 @@ func (f *indexAliasImplFieldDict) Close() error { func (f *indexAliasImplFieldDict) Cardinality() int { return f.fieldDict.Cardinality() } + +// ----------------------------------------------------------------------------- + +func (i *indexAliasImpl) TermFrequencies(field string, limit int, descending bool) ( + []index.TermFreq, error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return nil, ErrorIndexClosed + } + + if len(i.indexes) < 1 { + return nil, ErrorAliasEmpty + } + + // short circuit the simple case + if len(i.indexes) == 1 { + if idx, ok := i.indexes[0].(InsightsIndex); ok { + return idx.TermFrequencies(field, limit, descending) + } + return nil, nil + } + + // run search on each index in separate go routine + var waitGroup sync.WaitGroup + asyncResults := make(chan []index.TermFreq, len(i.indexes)) + + searchChildIndex := func(in Index, field string, limit int, descending bool) { + var rv []index.TermFreq + if idx, ok := in.(InsightsIndex); ok { + // over sample for higher accuracy + rv, _ = idx.TermFrequencies(field, limit*5, descending) + } + asyncResults <- rv + waitGroup.Done() + } + + waitGroup.Add(len(i.indexes)) + for _, in := range i.indexes { + go searchChildIndex(in, 
field, limit, descending) + } + + // on another go routine, close after finished + go func() { + waitGroup.Wait() + close(asyncResults) + }() + + rvTermFreqsMap := make(map[string]uint64) + for asr := range asyncResults { + for _, entry := range asr { + rvTermFreqsMap[entry.Term] += entry.Frequency + } + } + + rvTermFreqs := make([]index.TermFreq, 0, len(rvTermFreqsMap)) + for term, freq := range rvTermFreqsMap { + rvTermFreqs = append(rvTermFreqs, index.TermFreq{ + Term: term, + Frequency: freq, + }) + } + + if descending { + sort.Slice(rvTermFreqs, func(i, j int) bool { + if rvTermFreqs[i].Frequency == rvTermFreqs[j].Frequency { + // If frequencies are equal, sort by term lexicographically + return strings.Compare(rvTermFreqs[i].Term, rvTermFreqs[j].Term) < 0 + } + return rvTermFreqs[i].Frequency > rvTermFreqs[j].Frequency + }) + } else { + sort.Slice(rvTermFreqs, func(i, j int) bool { + if rvTermFreqs[i].Frequency == rvTermFreqs[j].Frequency { + // If frequencies are equal, sort by term lexicographically + return strings.Compare(rvTermFreqs[i].Term, rvTermFreqs[j].Term) < 0 + } + return rvTermFreqs[i].Frequency < rvTermFreqs[j].Frequency + }) + } + + if limit > len(rvTermFreqs) { + limit = len(rvTermFreqs) + } + + return rvTermFreqs[:limit], nil +} + +func (i *indexAliasImpl) CentroidCardinalities(field string, limit int, descending bool) ( + []index.CentroidCardinality, error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return nil, ErrorIndexClosed + } + + if len(i.indexes) < 1 { + return nil, ErrorAliasEmpty + } + + // short circuit the simple case + if len(i.indexes) == 1 { + if idx, ok := i.indexes[0].(InsightsIndex); ok { + return idx.CentroidCardinalities(field, limit, descending) + } + return nil, nil + } + + // run search on each index in separate go routine + var waitGroup sync.WaitGroup + asyncResults := make(chan []index.CentroidCardinality, len(i.indexes)) + + searchChildIndex := func(in Index, field string, limit int, descending 
bool) { + var rv []index.CentroidCardinality + if idx, ok := in.(InsightsIndex); ok { + rv, _ = idx.CentroidCardinalities(field, limit, descending) + } + asyncResults <- rv + waitGroup.Done() + } + + waitGroup.Add(len(i.indexes)) + for _, in := range i.indexes { + go searchChildIndex(in, field, limit, descending) + } + + // on another go routine, close after finished + go func() { + waitGroup.Wait() + close(asyncResults) + }() + + rvCentroidCardinalitiesResult := make([]index.CentroidCardinality, 0, limit) + for asr := range asyncResults { + asr = append(asr, rvCentroidCardinalitiesResult...) + if descending { + sort.Slice(asr, func(i, j int) bool { + return asr[i].Cardinality > asr[j].Cardinality + }) + } else { + sort.Slice(asr, func(i, j int) bool { + return asr[i].Cardinality < asr[j].Cardinality + }) + } + + if limit > len(asr) { + limit = len(asr) + } + + rvCentroidCardinalitiesResult = asr[:limit] + } + + return rvCentroidCardinalitiesResult, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_impl.go index a43b3cf75..fd3d3c8e0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_impl.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_impl.go @@ -57,8 +57,6 @@ type indexImpl struct { const storePath = "store" -var mappingInternalKey = []byte("_mapping") - const ( SearchQueryStartCallbackKey search.ContextKey = "_search_query_start_callback_key" SearchQueryEndCallbackKey search.ContextKey = "_search_query_end_callback_key" @@ -641,8 +639,57 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } } + // ------------------------------------------------------------------------------------------ + // set up additional contexts for any search operation that will proceed from + // here, such as presearch, collectors etc. 
+ + // Scoring model callback to be used to get scoring model + scoringModelCallback := func() string { + if isBM25Enabled(i.m) { + return index.BM25Scoring + } + return index.DefaultScoringModel + } + ctx = context.WithValue(ctx, search.GetScoringModelCallbackKey, + search.GetScoringModelCallbackFn(scoringModelCallback)) + + // This callback and variable handles the tracking of bytes read + // 1. as part of creation of tfr and its Next() calls which is + // accounted by invoking this callback when the TFR is closed. + // 2. the docvalues portion (accounted in collector) and the retrieval + // of stored fields bytes (by LoadAndHighlightFields) + var totalSearchCost uint64 + sendBytesRead := func(bytesRead uint64) { + totalSearchCost += bytesRead + } + // Ensure IO cost accounting and result cost assignment happen on all return paths + defer func() { + if sr != nil { + sr.Cost = totalSearchCost + } + if is, ok := indexReader.(*scorch.IndexSnapshot); ok { + is.UpdateIOStats(totalSearchCost) + } + search.RecordSearchCost(ctx, search.DoneM, 0) + }() + + ctx = context.WithValue(ctx, search.SearchIOStatsCallbackKey, search.SearchIOStatsCallbackFunc(sendBytesRead)) + + // Geo buffer pool callback to be used for getting geo buffer pool + var bufPool *s2.GeoBufferPool + getBufferPool := func() *s2.GeoBufferPool { + if bufPool == nil { + bufPool = s2.NewGeoBufferPool(search.MaxGeoBufPoolSize, search.MinGeoBufPoolSize) + } + + return bufPool + } + + ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) + // ------------------------------------------------------------------------------------------ + if _, ok := ctx.Value(search.PreSearchKey).(bool); ok { - preSearchResult, err := i.preSearch(ctx, req, indexReader) + sr, err = i.preSearch(ctx, req, indexReader) if err != nil { return nil, err } @@ -656,7 +703,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // time stat searchDuration 
:= time.Since(searchStart) atomic.AddUint64(&i.stats.searchTime, uint64(searchDuration)) - return preSearchResult, nil + + return sr, nil } var reverseQueryExecution bool @@ -726,6 +774,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // if score fusion, run collect if rescorer is defined if rescorer != nil && requestHasKNN(req) { knnHits, err = i.runKnnCollector(ctx, req, indexReader, false) + if err != nil { + return nil, err + } } } @@ -745,7 +796,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if !contextScoreFusionKeyExists { setKnnHitsInCollector(knnHits, req, coll) } - if fts != nil { if is, ok := indexReader.(*scorch.IndexSnapshot); ok { @@ -754,44 +804,12 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr ctx = context.WithValue(ctx, search.FieldTermSynonymMapKey, fts) } - scoringModelCallback := func() string { - if isBM25Enabled(i.m) { - return index.BM25Scoring - } - return index.DefaultScoringModel - } - ctx = context.WithValue(ctx, search.GetScoringModelCallbackKey, - search.GetScoringModelCallbackFn(scoringModelCallback)) - // set the bm25Stats (stats important for consistent scoring) in // the context object if bm25Stats != nil { ctx = context.WithValue(ctx, search.BM25StatsKey, bm25Stats) } - // This callback and variable handles the tracking of bytes read - // 1. as part of creation of tfr and its Next() calls which is - // accounted by invoking this callback when the TFR is closed. - // 2. 
the docvalues portion (accounted in collector) and the retrieval - // of stored fields bytes (by LoadAndHighlightFields) - var totalSearchCost uint64 - sendBytesRead := func(bytesRead uint64) { - totalSearchCost += bytesRead - } - - ctx = context.WithValue(ctx, search.SearchIOStatsCallbackKey, search.SearchIOStatsCallbackFunc(sendBytesRead)) - - var bufPool *s2.GeoBufferPool - getBufferPool := func() *s2.GeoBufferPool { - if bufPool == nil { - bufPool = s2.NewGeoBufferPool(search.MaxGeoBufPoolSize, search.MinGeoBufPoolSize) - } - - return bufPool - } - - ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) - searcher, err := req.Query.Searcher(ctx, indexReader, i.m, search.SearcherOptions{ Explain: req.Explain, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, @@ -804,14 +822,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if serr := searcher.Close(); err == nil && serr != nil { err = serr } - if sr != nil { - sr.Cost = totalSearchCost - } - if sr, ok := indexReader.(*scorch.IndexSnapshot); ok { - sr.UpdateIOStats(totalSearchCost) - } - - search.RecordSearchCost(ctx, search.DoneM, 0) }() if req.Facets != nil { @@ -1388,3 +1398,68 @@ func (i *indexImpl) FireIndexEvent() { internalEventIndex.FireIndexEvent() } } + +// ----------------------------------------------------------------------------- + +func (i *indexImpl) TermFrequencies(field string, limit int, descending bool) ( + []index.TermFreq, error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return nil, ErrorIndexClosed + } + + reader, err := i.i.Reader() + if err != nil { + return nil, err + } + defer func() { + if cerr := reader.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + insightsReader, ok := reader.(index.IndexInsightsReader) + if !ok { + return nil, fmt.Errorf("index reader does not support TermFrequencies") + } + + return 
insightsReader.TermFrequencies(field, limit, descending) +} + +func (i *indexImpl) CentroidCardinalities(field string, limit int, descending bool) ( + []index.CentroidCardinality, error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return nil, ErrorIndexClosed + } + + reader, err := i.i.Reader() + if err != nil { + return nil, err + } + defer func() { + if cerr := reader.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + insightsReader, ok := reader.(index.IndexInsightsReader) + if !ok { + return nil, fmt.Errorf("index reader does not support CentroidCardinalities") + } + + centroidCardinalities, err := insightsReader.CentroidCardinalities(field, limit, descending) + if err != nil { + return nil, err + } + + for j := 0; j < len(centroidCardinalities); j++ { + centroidCardinalities[j].Index = i.name + } + + return centroidCardinalities, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search.go b/vendor/github.com/blevesearch/bleve/v2/search.go index e3736558a..ef27b414d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search.go @@ -755,4 +755,3 @@ func ParseParams(r *SearchRequest, input []byte) (*RequestParams, error) { return params, nil } - diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go b/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go index 41dbcb40a..c5e928959 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go @@ -185,17 +185,36 @@ func (q *BooleanQuery) Searcher(ctx context.Context, i index.IndexReader, m mapp if err != nil { return nil, err } + var init bool + var refDoc *search.DocumentMatch filterFunc = func(sctx *search.SearchContext, d *search.DocumentMatch) bool { - // Attempt to advance the filter searcher to the document identified by - // the base searcher's (unfiltered boolean) current result (d.IndexInternalID). 
- // - // If the filter searcher successfully finds a document with the same - // internal ID, it means the document satisfies the filter and should be kept. - // - // If the filter searcher returns an error, does not find a matching document, - // or finds a document with a different internal ID, the document should be discarded. - dm, err := filterSearcher.Advance(sctx, d.IndexInternalID) - return err == nil && dm != nil && bytes.Equal(dm.IndexInternalID, d.IndexInternalID) + // Initialize the reference document to point + // to the first document in the filterSearcher + var err error + if !init { + refDoc, err = filterSearcher.Next(sctx) + if err != nil { + return false + } + init = true + } + if refDoc == nil { + // filterSearcher is exhausted, d is not in filter + return false + } + // Compare document IDs + cmp := bytes.Compare(refDoc.IndexInternalID, d.IndexInternalID) + if cmp < 0 { + // filterSearcher is behind the current document, Advance() it + refDoc, err = filterSearcher.Advance(sctx, d.IndexInternalID) + if err != nil || refDoc == nil { + return false + } + // After advance, check if they're now equal + return bytes.Equal(refDoc.IndexInternalID, d.IndexInternalID) + } + // cmp >= 0: either equal (match) or filterSearcher is ahead (no match) + return cmp == 0 } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go index 0c8885fb3..27c3978b1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go @@ -431,6 +431,10 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { if err != nil { return nil, err } + q.Filter, err = expand(q.Filter) + if err != nil { + return nil, err + } return q, nil default: return query, nil @@ -481,7 +485,7 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, erro fs, err = ExtractFields(expandedQuery, m, fs) } case 
*BooleanQuery: - for _, subq := range []Query{q.Must, q.Should, q.MustNot} { + for _, subq := range []Query{q.Must, q.Should, q.MustNot, q.Filter} { fs, err = ExtractFields(subq, m, fs) if err != nil { break @@ -553,6 +557,10 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Thes if err != nil { return nil, err } + rv, err = ExtractSynonyms(ctx, m, r, q.Filter, rv) + if err != nil { + return nil, err + } case *ConjunctionQuery: for _, child := range q.Conjuncts { rv, err = ExtractSynonyms(ctx, m, r, child, rv) diff --git a/vendor/github.com/blevesearch/bleve_index_api/index.go b/vendor/github.com/blevesearch/bleve_index_api/index.go index e76511954..12d907e59 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/index.go +++ b/vendor/github.com/blevesearch/bleve_index_api/index.go @@ -365,3 +365,29 @@ type EligibleDocumentSelector interface { // This must be called after all eligible documents have been added. SegmentEligibleDocs(segmentID int) []uint64 } + +// ----------------------------------------------------------------------------- + +type TermFreq struct { + Term string `json:"term"` + Frequency uint64 `json:"frequency"` +} + +type CentroidCardinality struct { + Index string `json:"index"` + Centroid []float32 `json:"centroid"` + Cardinality uint64 `json:"cardinality"` +} + +// IndexInsightsReader is an extended index reader that supports APIs which can advertise +// details about content held within the index. +type IndexInsightsReader interface { + IndexReader + + // Obtains a maximum limit number of indexed tokens for the field sorted based on frequencies. 
+ TermFrequencies(field string, limit int, descending bool) (termFreqs []TermFreq, err error) + + // Obtains a maximum limit number of centroid vectors from IVF indexes sorted based on + // cluster densities (or cardinalities) + CentroidCardinalities(field string, limit int, descending bool) (cenCards []CentroidCardinality, err error) +} diff --git a/vendor/github.com/blevesearch/go-faiss/index.go b/vendor/github.com/blevesearch/go-faiss/index.go index 18177fc7e..3a399e5b6 100644 --- a/vendor/github.com/blevesearch/go-faiss/index.go +++ b/vendor/github.com/blevesearch/go-faiss/index.go @@ -14,6 +14,7 @@ import "C" import ( "encoding/json" "fmt" + "sort" "unsafe" ) @@ -64,6 +65,10 @@ type Index interface { ObtainClustersWithDistancesFromIVFIndex(x []float32, centroidIDs []int64) ( []int64, []float32, error) + // Applicable only to IVF indexes: Returns the top k centroid cardinalities and + // their vectors in chosen order (descending or ascending) + ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ([]uint64, [][]float32, error) + // Search queries the index with the vectors in x. // Returns the IDs of the k nearest neighbors for each query vector and the // corresponding distances. 
@@ -214,6 +219,72 @@ func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, cent return centroids, centroidDistances, nil } +func (idx *faissIndex) ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ( + []uint64, [][]float32, error) { + if limit <= 0 { + return nil, nil, nil + } + + nlist := int(C.faiss_IndexIVF_nlist(idx.idx)) + if nlist == 0 { + return nil, nil, nil + } + + centroidCardinalities := make([]C.size_t, nlist) + + // Allocate a flat buffer for all centroids, then slice it per centroid + d := idx.D() + flatCentroids := make([]float32, nlist*d) + + // Call the C function to fill centroid vectors and cardinalities + c := C.faiss_IndexIVF_get_centroids_and_cardinality( + idx.idx, + (*C.float)(&flatCentroids[0]), + (*C.size_t)(&centroidCardinalities[0]), + nil, + ) + if c != 0 { + return nil, nil, getLastError() + } + + topIndices := getIndicesOfKCentroidCardinalities( + centroidCardinalities, + min(limit, nlist), + descending) + + rvCardinalities := make([]uint64, len(topIndices)) + rvCentroids := make([][]float32, len(topIndices)) + + for i, idx := range topIndices { + rvCardinalities[i] = uint64(centroidCardinalities[idx]) + rvCentroids[i] = flatCentroids[idx*d : (idx+1)*d] + } + + return rvCardinalities, rvCentroids, nil + +} + +func getIndicesOfKCentroidCardinalities(cardinalities []C.size_t, k int, descending bool) []int { + n := len(cardinalities) + indices := make([]int, n) + for i := range indices { + indices[i] = i + } + + // Sort only the indices based on cardinality values + sort.Slice(indices, func(i, j int) bool { + if descending { + return cardinalities[indices[i]] > cardinalities[indices[j]] + } + return cardinalities[indices[i]] < cardinalities[indices[j]] + }) + if k >= n { + return indices + } + + return indices[:k] +} + func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, eligibleCentroidIDs []int64, minEligibleCentroids int, k int64, x, centroidDis []float32, params json.RawMessage)
([]float32, []int64, error) { diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go index a57e0b494..7e50ce46f 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go @@ -20,6 +20,7 @@ package segment import ( "encoding/json" + index "github.com/blevesearch/bleve_index_api" "github.com/RoaringBitmap/roaring/v2" ) @@ -64,6 +65,8 @@ type VectorIndex interface { params json.RawMessage) (VecPostingsList, error) Close() Size() uint64 + + ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ([]index.CentroidCardinality, error) } type VectorSegment interface { diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go index 2a77199c6..1d0dfbae7 100644 --- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go @@ -26,6 +26,7 @@ import ( "github.com/RoaringBitmap/roaring/v2" "github.com/RoaringBitmap/roaring/v2/roaring64" "github.com/bits-and-blooms/bitset" + index "github.com/blevesearch/bleve_index_api" faiss "github.com/blevesearch/go-faiss" segment "github.com/blevesearch/scorch_segment_api/v2" ) @@ -279,6 +280,9 @@ type vectorIndexWrapper struct { params json.RawMessage) (segment.VecPostingsList, error) close func() size func() uint64 + + obtainKCentroidCardinalitiesFromIVFIndex func(limit int, descending bool) ( + []index.CentroidCardinality, error) } func (i *vectorIndexWrapper) Search(qVector []float32, k int64, @@ -301,6 +305,11 @@ func (i *vectorIndexWrapper) Size() uint64 { return i.size() } +func (i *vectorIndexWrapper) ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ( + []index.CentroidCardinality, error) { + return i.obtainKCentroidCardinalitiesFromIVFIndex(limit, 
descending) +} + // InterpretVectorIndex returns a construct of closures (vectorIndexWrapper) // that will allow the caller to - // (1) search within an attached vector index @@ -520,6 +529,24 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool size: func() uint64 { return vecIndexSize }, + obtainKCentroidCardinalitiesFromIVFIndex: func(limit int, descending bool) ([]index.CentroidCardinality, error) { + if vecIndex == nil || !vecIndex.IsIVFIndex() { + return nil, nil + } + + cardinalities, centroids, err := vecIndex.ObtainKCentroidCardinalitiesFromIVFIndex(limit, descending) + if err != nil { + return nil, err + } + centroidCardinalities := make([]index.CentroidCardinality, len(cardinalities)) + for i, cardinality := range cardinalities { + centroidCardinalities[i] = index.CentroidCardinality{ + Centroid: centroids[i], + Cardinality: cardinality, + } + } + return centroidCardinalities, nil + }, } err error diff --git a/vendor/modules.txt b/vendor/modules.txt index b9db4e79e..b1aaeb30d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -117,7 +117,7 @@ github.com/bitly/go-simplejson # github.com/bits-and-blooms/bitset v1.22.0 ## explicit; go 1.16 github.com/bits-and-blooms/bitset -# github.com/blevesearch/bleve/v2 v2.5.4 +# github.com/blevesearch/bleve/v2 v2.5.5 ## explicit; go 1.23 github.com/blevesearch/bleve/v2 github.com/blevesearch/bleve/v2/analysis @@ -160,7 +160,7 @@ github.com/blevesearch/bleve/v2/search/scorer github.com/blevesearch/bleve/v2/search/searcher github.com/blevesearch/bleve/v2/size github.com/blevesearch/bleve/v2/util -# github.com/blevesearch/bleve_index_api v1.2.10 +# github.com/blevesearch/bleve_index_api v1.2.11 ## explicit; go 1.21 github.com/blevesearch/bleve_index_api # github.com/blevesearch/geo v0.2.4 @@ -171,7 +171,7 @@ github.com/blevesearch/geo/r2 github.com/blevesearch/geo/r3 github.com/blevesearch/geo/s1 github.com/blevesearch/geo/s2 -# github.com/blevesearch/go-faiss v1.0.25 +# 
github.com/blevesearch/go-faiss v1.0.26 ## explicit; go 1.21 github.com/blevesearch/go-faiss # github.com/blevesearch/go-porterstemmer v1.0.3 @@ -183,7 +183,7 @@ github.com/blevesearch/gtreap # github.com/blevesearch/mmap-go v1.0.4 ## explicit; go 1.13 github.com/blevesearch/mmap-go -# github.com/blevesearch/scorch_segment_api/v2 v2.3.12 +# github.com/blevesearch/scorch_segment_api/v2 v2.3.13 ## explicit; go 1.21 github.com/blevesearch/scorch_segment_api/v2 # github.com/blevesearch/segment v0.9.1 @@ -217,7 +217,7 @@ github.com/blevesearch/zapx/v14 # github.com/blevesearch/zapx/v15 v15.4.2 ## explicit; go 1.21 github.com/blevesearch/zapx/v15 -# github.com/blevesearch/zapx/v16 v16.2.6 +# github.com/blevesearch/zapx/v16 v16.2.7 ## explicit; go 1.23 github.com/blevesearch/zapx/v16 # github.com/bluele/gcache v0.0.2