build(deps): bump github.com/blevesearch/bleve/v2 from 2.3.10 to 2.4.0

Bumps [github.com/blevesearch/bleve/v2](https://github.com/blevesearch/bleve) from 2.3.10 to 2.4.0.
- [Release notes](https://github.com/blevesearch/bleve/releases)
- [Commits](https://github.com/blevesearch/bleve/compare/v2.3.10...v2.4.0)

---
updated-dependencies:
- dependency-name: github.com/blevesearch/bleve/v2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
This commit is contained in:
dependabot[bot]
2024-04-03 12:57:16 +00:00
committed by Ralf Haferkamp
parent 8f432c4cdd
commit 68e4e81870
105 changed files with 15633 additions and 485 deletions
+3 -2
View File
@@ -22,7 +22,8 @@ A modern text indexing library in go
* Conjunction, Disjunction, Boolean (must/should/must_not)
* Term Range, Numeric Range, Date Range
* [Geo Spatial](https://github.com/blevesearch/bleve/blob/master/geo/README.md)
* Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/) for human entry
* Simple [query string syntax](http://www.blevesearch.com/docs/Query-String-Query/)
* [Vector Search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
* [tf-idf](https://en.wikipedia.org/wiki/Tf-idf) Scoring
* Query time boosting
* Search result match highlighting with document fragments
@@ -101,7 +102,7 @@ Use "bleve [command] --help" for more information about a command.
Bleve includes general-purpose analyzers (customizable) as well as pre-built text analyzers for the following languages:
Arabic (ar), Bulgarian (bg), Catalan (ca), Chinese-Japanese-Korean (cjk), Kurdish (ckb), Danish (da), German (de), Greek (el), English (en), Spanish - Castilian (es), Basque (eu), Persian (fa), Finnish (fi), French (fr), Gaelic (ga), Spanish - Galician (gl), Hindi (hi), Croatian (hr), Hungarian (hu), Armenian (hy), Indonesian (id, in), Italian (it), Dutch (nl), Norwegian (no), Portuguese (pt), Romanian (ro), Russian (ru), Swedish (sv), Turkish (tr)
Arabic (ar), Bulgarian (bg), Catalan (ca), Chinese-Japanese-Korean (cjk), Kurdish (ckb), Danish (da), German (de), Greek (el), English (en), Spanish - Castilian (es), Basque (eu), Persian (fa), Finnish (fi), French (fr), Gaelic (ga), Spanish - Galician (gl), Hindi (hi), Croatian (hr), Hungarian (hu), Armenian (hy), Indonesian (id, in), Italian (it), Dutch (nl), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Swedish (sv), Turkish (tr)
## Text Analysis Wizard
+145
View File
@@ -0,0 +1,145 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package document
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)
var reflectStaticSizeVectorField int

// init pre-computes the reflect-derived static size of VectorField once,
// so Size() does not pay for reflection on every call.
func init() {
	var f VectorField
	reflectStaticSizeVectorField = int(reflect.TypeOf(f).Size())
}

// DefaultVectorIndexingOptions is the baseline indexing option always
// OR-ed into a VectorField's options (see NewVectorFieldWithIndexingOptions).
const DefaultVectorIndexingOptions = index.IndexField
// VectorField is a document field carrying a float32 vector to be indexed
// for vector (KNN) search. It implements the bleve_index_api Field
// interface; analysis-related methods are no-ops since vectors aren't
// analyzed.
type VectorField struct {
	name       string
	dims       int    // Dimensionality of the vector
	similarity string // Similarity metric to use for scoring
	options    index.FieldIndexingOptions
	value      []float32
	// reported via NumPlainTextBytes(); set to the byte size of value
	numPlainTextBytes uint64
	vectorIndexOptimizedFor string // Optimization applied to this index.
}
// Size reports an estimate of the memory consumed by this field in bytes:
// the reflect-derived static struct size plus the variable-length payloads.
// Previously the similarity and vectorIndexOptimizedFor strings were
// omitted, slightly under-reporting memory usage.
func (n *VectorField) Size() int {
	return reflectStaticSizeVectorField + size.SizeOfPtr +
		len(n.name) +
		len(n.similarity) +
		len(n.vectorIndexOptimizedFor) +
		int(numBytesFloat32s(n.value))
}
// Name returns the field's name.
func (n *VectorField) Name() string {
	return n.name
}

// ArrayPositions is always nil for vector fields.
func (n *VectorField) ArrayPositions() []uint64 {
	return nil
}

// Options returns the indexing options this field was created with.
func (n *VectorField) Options() index.FieldIndexingOptions {
	return n.options
}

// NumPlainTextBytes returns the byte size of the vector payload,
// computed at construction time.
func (n *VectorField) NumPlainTextBytes() uint64 {
	return n.numPlainTextBytes
}

// AnalyzedLength always returns 0.
func (n *VectorField) AnalyzedLength() int {
	// vectors aren't analyzed
	return 0
}

// EncodedFieldType identifies this field as a vector ('v') field.
func (n *VectorField) EncodedFieldType() byte {
	return 'v'
}

// AnalyzedTokenFrequencies always returns nil.
func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies {
	// vectors aren't analyzed
	return nil
}

// Analyze is a no-op.
func (n *VectorField) Analyze() {
	// vectors aren't analyzed
}

// Value returns nil; the raw vector is exposed via Vector() instead of
// a byte representation.
func (n *VectorField) Value() []byte {
	return nil
}
// GoString renders a Go-syntax-like debugging representation of the field.
func (n *VectorField) GoString() string {
	const format = "&document.VectorField{Name:%s, Options: %s, Value: %+v}"
	return fmt.Sprintf(format, n.name, n.options, n.value)
}
// NewVectorField constructs a VectorField carrying the given float32
// vector, using DefaultVectorIndexingOptions.
//
// For the sake of not polluting the API, we are keeping arrayPositions as a
// parameter, but it is not used.
func NewVectorField(name string, arrayPositions []uint64,
	vector []float32, dims int, similarity, vectorIndexOptimizedFor string) *VectorField {
	return NewVectorFieldWithIndexingOptions(name, arrayPositions,
		vector, dims, similarity, vectorIndexOptimizedFor,
		DefaultVectorIndexingOptions)
}
// NewVectorFieldWithIndexingOptions constructs a VectorField with explicit
// indexing options; DefaultVectorIndexingOptions is always OR-ed in.
//
// For the sake of not polluting the API, we are keeping arrayPositions as a
// parameter, but it is not used.
func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64,
	vector []float32, dims int, similarity, vectorIndexOptimizedFor string,
	options index.FieldIndexingOptions) *VectorField {
	rv := &VectorField{
		name:                    name,
		dims:                    dims,
		similarity:              similarity,
		options:                 options | DefaultVectorIndexingOptions,
		value:                   vector,
		numPlainTextBytes:       numBytesFloat32s(vector),
		vectorIndexOptimizedFor: vectorIndexOptimizedFor,
	}
	return rv
}
// numBytesFloat32s returns the in-memory byte footprint of a []float32.
func numBytesFloat32s(value []float32) uint64 {
	return uint64(size.SizeOfFloat32 * len(value))
}
// -----------------------------------------------------------------------------
// Following methods help in implementing the bleve_index_api's VectorField
// interface.

// Vector returns the raw float32 vector carried by this field.
func (n *VectorField) Vector() []float32 {
	return n.value
}

// Dims returns the dimensionality of the vector.
func (n *VectorField) Dims() int {
	return n.dims
}

// Similarity returns the similarity metric name used for scoring.
func (n *VectorField) Similarity() string {
	return n.similarity
}

// IndexOptimizedFor returns the optimization applied to this index.
func (n *VectorField) IndexOptimizedFor() string {
	return n.vectorIndexOptimizedFor
}
+12 -10
View File
@@ -26,6 +26,7 @@ const (
ErrorUnknownIndexType
ErrorEmptyID
ErrorIndexReadInconsistency
ErrorTwoPhaseSearchInconsistency
)
// Error represents a more strongly typed bleve error for detecting
@@ -37,14 +38,15 @@ func (e Error) Error() string {
}
var errorMessages = map[Error]string{
ErrorIndexPathExists: "cannot create new index, path already exists",
ErrorIndexPathDoesNotExist: "cannot open index, path does not exist",
ErrorIndexMetaMissing: "cannot open index, metadata missing",
ErrorIndexMetaCorrupt: "cannot open index, metadata corrupt",
ErrorIndexClosed: "index is closed",
ErrorAliasMulti: "cannot perform single index operation on multiple index alias",
ErrorAliasEmpty: "cannot perform operation on empty alias",
ErrorUnknownIndexType: "unknown index type",
ErrorEmptyID: "document ID cannot be empty",
ErrorIndexReadInconsistency: "index read inconsistency detected",
ErrorIndexPathExists: "cannot create new index, path already exists",
ErrorIndexPathDoesNotExist: "cannot open index, path does not exist",
ErrorIndexMetaMissing: "cannot open index, metadata missing",
ErrorIndexMetaCorrupt: "cannot open index, metadata corrupt",
ErrorIndexClosed: "index is closed",
ErrorAliasMulti: "cannot perform single index operation on multiple index alias",
ErrorAliasEmpty: "cannot perform operation on empty alias",
ErrorUnknownIndexType: "unknown index type",
ErrorEmptyID: "document ID cannot be empty",
ErrorIndexReadInconsistency: "index read inconsistency detected",
ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change",
}
+12 -29
View File
@@ -18,6 +18,8 @@ import (
"reflect"
"strconv"
"strings"
"github.com/blevesearch/bleve/v2/util"
)
// ExtractGeoPoint takes an arbitrary interface{} and tries it's best to
@@ -61,12 +63,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
first := thingVal.Index(0)
if first.CanInterface() {
firstVal := first.Interface()
lon, foundLon = extractNumericVal(firstVal)
lon, foundLon = util.ExtractNumericValFloat64(firstVal)
}
second := thingVal.Index(1)
if second.CanInterface() {
secondVal := second.Interface()
lat, foundLat = extractNumericVal(secondVal)
lat, foundLat = util.ExtractNumericValFloat64(secondVal)
}
}
}
@@ -105,12 +107,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
// is it a map
if l, ok := thing.(map[string]interface{}); ok {
if lval, ok := l["lon"]; ok {
lon, foundLon = extractNumericVal(lval)
lon, foundLon = util.ExtractNumericValFloat64(lval)
} else if lval, ok := l["lng"]; ok {
lon, foundLon = extractNumericVal(lval)
lon, foundLon = util.ExtractNumericValFloat64(lval)
}
if lval, ok := l["lat"]; ok {
lat, foundLat = extractNumericVal(lval)
lat, foundLat = util.ExtractNumericValFloat64(lval)
}
}
@@ -121,19 +123,19 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lon, foundLon = extractNumericVal(fieldVal)
lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
}
}
if strings.HasPrefix(strings.ToLower(fieldName), "lng") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lon, foundLon = extractNumericVal(fieldVal)
lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
}
}
if strings.HasPrefix(strings.ToLower(fieldName), "lat") {
if thingVal.Field(i).CanInterface() {
fieldVal := thingVal.Field(i).Interface()
lat, foundLat = extractNumericVal(fieldVal)
lat, foundLat = util.ExtractNumericValFloat64(fieldVal)
}
}
}
@@ -157,25 +159,6 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
return lon, lat, foundLon && foundLat
}
// extract numeric value (if possible) and returns a float64
func extractNumericVal(v interface{}) (float64, bool) {
val := reflect.ValueOf(v)
if !val.IsValid() {
return 0, false
}
typ := val.Type()
switch typ.Kind() {
case reflect.Float32, reflect.Float64:
return val.Float(), true
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return float64(val.Int()), true
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return float64(val.Uint()), true
}
return 0, false
}
// various support interfaces which can be used to find lat/lon
type loner interface {
Lon() float64
@@ -209,12 +192,12 @@ func extractCoordinates(thing interface{}) []float64 {
first := thingVal.Index(0)
if first.CanInterface() {
firstVal := first.Interface()
lon, foundLon = extractNumericVal(firstVal)
lon, foundLon = util.ExtractNumericValFloat64(firstVal)
}
second := thingVal.Index(1)
if second.CanInterface() {
secondVal := second.Interface()
lat, foundLat = extractNumericVal(secondVal)
lat, foundLat = util.ExtractNumericValFloat64(secondVal)
}
if !foundLon || !foundLat {
+21 -2
View File
@@ -30,6 +30,7 @@ type segmentIntroduction struct {
obsoletes map[uint64]*roaring.Bitmap
ids []string
internal map[string][]byte
stats *fieldStats
applied chan error
persisted chan error
@@ -146,7 +147,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
newss := &SegmentSnapshot{
id: root.segment[i].id,
segment: root.segment[i].segment,
stats: root.segment[i].stats,
cachedDocs: root.segment[i].cachedDocs,
cachedMeta: root.segment[i].cachedMeta,
creator: root.segment[i].creator,
}
@@ -154,7 +157,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
if root.segment[i].deleted == nil {
newss.deleted = delta
} else {
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
if delta.IsEmpty() {
newss.deleted = root.segment[i].deleted
} else {
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
}
}
if newss.deleted.IsEmpty() {
newss.deleted = nil
@@ -188,7 +195,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
newSegmentSnapshot := &SegmentSnapshot{
id: next.id,
segment: next.data, // take ownership of next.data's ref-count
stats: next.stats,
cachedDocs: &cachedDocs{cache: nil},
cachedMeta: &cachedMeta{meta: nil},
creator: "introduceSegment",
}
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
@@ -275,7 +284,9 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) {
id: segmentSnapshot.id,
segment: replacement,
deleted: segmentSnapshot.deleted,
stats: segmentSnapshot.stats,
cachedDocs: segmentSnapshot.cachedDocs,
cachedMeta: segmentSnapshot.cachedMeta,
creator: "introducePersist",
mmaped: 1,
}
@@ -374,7 +385,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
id: root.segment[i].id,
segment: root.segment[i].segment,
deleted: root.segment[i].deleted,
stats: root.segment[i].stats,
cachedDocs: root.segment[i].cachedDocs,
cachedMeta: root.segment[i].cachedMeta,
creator: root.segment[i].creator,
})
root.segment[i].segment.AddRef()
@@ -394,7 +407,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
}
}
}
// before the newMerge introduction, need to clean the newly
// merged segment wrt the current root segments, hence
// applying the obsolete segment contents to newly merged segment
@@ -415,12 +427,19 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
if nextMerge.new != nil &&
nextMerge.new.Count() > newSegmentDeleted.GetCardinality() {
stats := newFieldStats()
if fsr, ok := nextMerge.new.(segment.FieldStatsReporter); ok {
fsr.UpdateFieldStats(stats)
}
// put new segment at end
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
id: nextMerge.id,
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
deleted: newSegmentDeleted,
stats: stats,
cachedDocs: &cachedDocs{cache: nil},
cachedMeta: &cachedMeta{meta: nil},
creator: "introduceMerge",
mmaped: nextMerge.mmaped,
})
+4 -4
View File
@@ -290,7 +290,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
oldMap := make(map[uint64]*SegmentSnapshot)
oldMap := make(map[uint64]*SegmentSnapshot, len(task.Segments))
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
segmentsToMerge := make([]segment.Segment, 0, len(task.Segments))
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
@@ -357,7 +357,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
totalBytesRead := seg.BytesRead() + prevBytesReadTotal
seg.ResetBytesRead(totalBytesRead)
oldNewDocNums = make(map[uint64][]uint64)
oldNewDocNums = make(map[uint64][]uint64, len(newDocNums))
for i, segNewDocNums := range newDocNums {
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
}
@@ -485,8 +485,8 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
sm := &segmentMerge{
id: newSegmentID,
old: make(map[uint64]*SegmentSnapshot),
oldNewDocNums: make(map[uint64][]uint64),
old: make(map[uint64]*SegmentSnapshot, len(sbsIndexes)),
oldNewDocNums: make(map[uint64][]uint64, len(sbsIndexes)),
new: seg,
notifyCh: make(chan *mergeTaskIntroStatus),
}
+2 -1
View File
@@ -16,10 +16,11 @@ package scorch
import (
"fmt"
"sync/atomic"
"github.com/RoaringBitmap/roaring"
index "github.com/blevesearch/bleve_index_api"
segment "github.com/blevesearch/scorch_segment_api/v2"
"sync/atomic"
)
var OptimizeConjunction = true
+187
View File
@@ -0,0 +1,187 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package scorch
import (
"context"
"fmt"
"sync"
"sync/atomic"
"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
)
// OptimizeVR accumulates the vector readers participating in a KNN query
// against a single IndexSnapshot, so their per-segment searches can be
// batched and executed together in Finish().
type OptimizeVR struct {
	ctx      context.Context
	snapshot *IndexSnapshot
	// sum of the vector index sizes consulted by all registered readers;
	// reported to the searcher-end callback (see invokeSearcherEndCallback).
	totalCost uint64
	// maps field to vector readers
	vrs map[string][]*IndexSnapshotVectorReader
}

// This setting _MUST_ only be changed during init and not after.
var BleveMaxKNNConcurrency = 10
// invokeSearcherEndCallback notifies any searcher-end callback registered
// in the context of the total cost accumulated by this optimization, so
// the caller can release whatever budget it reserved. It is a no-op when
// no context, callback, or cost is present.
func (o *OptimizeVR) invokeSearcherEndCallback() {
	if o.ctx == nil || o.totalCost == 0 {
		return
	}
	cb := o.ctx.Value(search.SearcherEndCallbackKey)
	if cb == nil {
		return
	}
	cbF, ok := cb.(search.SearcherEndCallbackFn)
	if !ok {
		return
	}
	// notify the callback that the searcher creation etc. is finished
	// and report back the total cost for it to track and take actions
	// appropriately.
	_ = cbF(o.totalCost)
}
// Finish executes the batched KNN searches: for each segment that supports
// vectors, it interprets the per-field vector index once and runs every
// registered reader's search against it, populating the readers' postings
// and iterators in place. Work is fanned out across goroutines, bounded by
// BleveMaxKNNConcurrency. Returns the first error encountered, if any.
func (o *OptimizeVR) Finish() error {
	// for each field, get the vector index --> invoke the zap func.
	// for each VR, populate postings list and iterators
	// by passing the obtained vector index and getting similar vectors.
	// defer close index - just once.
	var errorsM sync.Mutex
	var errors []error

	// always report the accumulated cost back, even on error paths
	defer o.invokeSearcherEndCallback()

	wg := sync.WaitGroup{}
	semaphore := make(chan struct{}, BleveMaxKNNConcurrency)
	// Launch goroutines to get vector index for each segment.
	// NOTE: the closure parameters `index` and `segment` shadow the
	// package imports of the same names inside the goroutine body.
	for i, seg := range o.snapshot.segment {
		if sv, ok := seg.segment.(segment_api.VectorSegment); ok {
			wg.Add(1)
			semaphore <- struct{}{} // Acquire a semaphore slot
			go func(index int, segment segment_api.VectorSegment, origSeg *SegmentSnapshot) {
				defer func() {
					<-semaphore // Release the semaphore slot
					wg.Done()
				}()
				for field, vrs := range o.vrs {
					vecIndex, err := segment.InterpretVectorIndex(field)
					if err != nil {
						errorsM.Lock()
						errors = append(errors, err)
						errorsM.Unlock()
						return
					}
					// update the vector index size as a meta value in the segment snapshot
					// NOTE(review): VectorOptimize asserts this value to uint64 when
					// reading it back — confirm vecIndex.Size() returns uint64.
					vectorIndexSize := vecIndex.Size()
					origSeg.cachedMeta.updateMeta(field, vectorIndexSize)
					for _, vr := range vrs {
						// for each VR, populate postings list and iterators
						// by passing the obtained vector index and getting similar vectors.
						pl, err := vecIndex.Search(vr.vector, vr.k, origSeg.deleted)
						if err != nil {
							errorsM.Lock()
							errors = append(errors, err)
							errorsM.Unlock()
							// close asynchronously; we're abandoning this index
							go vecIndex.Close()
							return
						}
						atomic.AddUint64(&o.snapshot.parent.stats.TotKNNSearches, uint64(1))

						// postings and iterators are already alloc'ed when
						// IndexSnapshotVectorReader is created
						vr.postings[index] = pl
						vr.iterators[index] = pl.Iterator(vr.iterators[index])
					}
					go vecIndex.Close()
				}
			}(i, sv, seg)
		}
	}
	wg.Wait()
	close(semaphore)
	if len(errors) > 0 {
		return errors[0]
	}
	return nil
}
// VectorOptimize registers this reader with a shared OptimizeVR context so
// that KNN searches from multiple readers over the same snapshot can be
// batched in Finish(). Before admitting the reader it consults any
// searcher-start callback in ctx with the (cached) sum of per-segment
// vector index sizes, letting the caller veto the search up front.
// Returns an error for pre-vector segment versions or when octx belongs to
// a different snapshot.
func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context,
	octx index.VectorOptimizableContext) (index.VectorOptimizableContext, error) {

	if s.snapshot.parent.segPlugin.Version() < VectorSearchSupportedSegmentVersion {
		return nil, fmt.Errorf("vector search not supported for this index, "+
			"index's segment version %v, supported segment version for vector search %v",
			s.snapshot.parent.segPlugin.Version(), VectorSearchSupportedSegmentVersion)
	}

	if octx == nil {
		octx = &OptimizeVR{snapshot: s.snapshot,
			vrs: make(map[string][]*IndexSnapshotVectorReader),
		}
	}

	o, ok := octx.(*OptimizeVR)
	if !ok {
		// a foreign optimizable context; leave it untouched
		return octx, nil
	}
	o.ctx = ctx

	if o.snapshot != s.snapshot {
		o.invokeSearcherEndCallback()
		return nil, fmt.Errorf("tried to optimize KNN across different snapshots")
	}

	// for every searcher creation, consult the segment snapshot to see
	// what's the vector index size and since you're anyways going
	// to use this vector index to perform the search etc. as part of the Finish()
	// perform a check as to whether we allow the searcher creation (the downstream)
	// Finish() logic to even occur or not.
	var sumVectorIndexSize uint64
	for _, seg := range o.snapshot.segment {
		vecIndexSize := seg.cachedMeta.fetchMeta(s.field)
		if vecIndexSize != nil {
			sumVectorIndexSize += vecIndexSize.(uint64)
		}
	}

	if o.ctx != nil {
		if cb := o.ctx.Value(search.SearcherStartCallbackKey); cb != nil {
			if cbF, ok := cb.(search.SearcherStartCallbackFn); ok {
				err := cbF(sumVectorIndexSize)
				if err != nil {
					// it's important to invoke the end callback at this point since
					// if the earlier searchers of this optimize struct were successful
					// the cost corresponding to it would be incremented and if the
					// current searcher fails the check then we end up erroring out
					// the overall optimized searcher creation, the cost needs to be
					// handled appropriately.
					o.invokeSearcherEndCallback()
					return nil, err
				}
			}
		}
	}

	// total cost is essentially the sum of the vector indexes' size across all the
	// searchers - all of them end up reading and maintaining a vector index.
	// misaccounting this value would end up calling the "end" callback with a value
	// not equal to the value passed to "start" callback.
	o.totalCost += sumVectorIndexSize
	o.vrs[s.field] = append(o.vrs[s.field], s)
	return o, nil
}
+29 -1
View File
@@ -17,6 +17,7 @@ package scorch
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log"
@@ -424,6 +425,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
id: newSegmentID,
segment: segment.segment,
deleted: nil, // nil since merging handled deletions
stats: nil,
})
break
}
@@ -602,6 +604,18 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string,
return nil, nil, err
}
}
// store segment stats
if segmentSnapshot.stats != nil {
b, err := json.Marshal(segmentSnapshot.stats.Fetch())
if err != nil {
return nil, nil, err
}
err = snapshotSegmentBucket.Put(boltStatsKey, b)
if err != nil {
return nil, nil, err
}
}
}
return filenames, newSegmentPaths, nil
@@ -634,7 +648,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
// the newly populated boltdb snapshotBucket above
if len(newSegmentPaths) > 0 {
// now try to open all the new snapshots
newSegments := make(map[uint64]segment.Segment)
newSegments := make(map[uint64]segment.Segment, len(newSegmentPaths))
defer func() {
for _, s := range newSegments {
if s != nil {
@@ -704,6 +718,7 @@ var boltMetaDataKey = []byte{'m'}
var boltMetaDataSegmentTypeKey = []byte("type")
var boltMetaDataSegmentVersionKey = []byte("version")
var boltMetaDataTimeStamp = []byte("timeStamp")
var boltStatsKey = []byte("stats")
var TotBytesWrittenKey = []byte("TotBytesWritten")
func (s *Scorch) loadFromBolt() error {
@@ -858,6 +873,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
rv := &SegmentSnapshot{
segment: segment,
cachedDocs: &cachedDocs{cache: nil},
cachedMeta: &cachedMeta{meta: nil},
}
deletedBytes := segmentBucket.Get(boltDeletedKey)
if deletedBytes != nil {
@@ -872,6 +888,18 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
rv.deleted = deletedBitmap
}
}
statBytes := segmentBucket.Get(boltStatsKey)
if statBytes != nil {
var statsMap map[string]map[string]uint64
err := json.Unmarshal(statBytes, &statsMap)
stats := &fieldStats{statMap: statsMap}
if err != nil {
_ = segment.Close()
return nil, fmt.Errorf("error reading stat bytes: %v", err)
}
rv.stats = stats
}
return rv, nil
}
+73 -3
View File
@@ -428,6 +428,8 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
var newSegment segment.Segment
var bufBytes uint64
stats := newFieldStats()
if len(analysisResults) > 0 {
newSegment, bufBytes, err = s.segPlugin.New(analysisResults)
if err != nil {
@@ -438,11 +440,14 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
segB.BytesWritten())
}
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
if fsr, ok := newSegment.(segment.FieldStatsReporter); ok {
fsr.UpdateFieldStats(stats)
}
} else {
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
}
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback(), stats)
if err != nil {
if newSegment != nil {
_ = newSegment.Close()
@@ -462,15 +467,15 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
}
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats) error {
// new introduction
introduction := &segmentIntroduction{
id: atomic.AddUint64(&s.nextSegmentID, 1),
data: newSegment,
ids: ids,
obsoletes: make(map[uint64]*roaring.Bitmap),
internal: internalOps,
stats: stats,
applied: make(chan error),
persistedCallback: persistedCallback,
}
@@ -487,6 +492,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
defer func() { _ = root.DecRef() }()
introduction.obsoletes = make(map[uint64]*roaring.Bitmap, len(root.segment))
for _, seg := range root.segment {
delta, err := seg.segment.DocNumbers(ids)
if err != nil {
@@ -617,6 +624,8 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["index_time"] = m["TotIndexTime"]
m["term_searchers_started"] = m["TotTermSearchersStarted"]
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
m["knn_searches"] = m["TotKNNSearches"]
m["num_bytes_read_at_query_time"] = m["TotBytesReadAtQueryTime"]
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
m["num_bytes_written_at_index_time"] = m["TotBytesWrittenAtIndexTime"]
@@ -638,6 +647,20 @@ func (s *Scorch) StatsMap() map[string]interface{} {
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
// calculate the aggregate of all the segment's field stats
aggFieldStats := newFieldStats()
for _, segmentSnapshot := range indexSnapshot.Segments() {
if segmentSnapshot.stats != nil {
aggFieldStats.Aggregate(segmentSnapshot.stats)
}
}
aggFieldStatsMap := aggFieldStats.Fetch()
for statName, stats := range aggFieldStatsMap {
for fieldName, val := range stats {
m["field:"+fieldName+":"+statName] = val
}
}
return m
}
@@ -762,3 +785,50 @@ func parseToInteger(i interface{}) (int, error) {
return 0, fmt.Errorf("expects int or float64 value")
}
}
// fieldStats holds Zap's field-level stats at a segment level.
type fieldStats struct {
	// statName -> fieldName -> value
	statMap map[string]map[string]uint64
}

// Store records value for the given stat/field pair, lazily allocating
// the inner map on first use for that stat name. (The previous comment
// claimed the stat name was validated; no validation is performed.)
func (fs *fieldStats) Store(statName, fieldName string, value uint64) {
	if _, exists := fs.statMap[statName]; !exists {
		fs.statMap[statName] = make(map[string]uint64)
	}
	fs.statMap[statName][fieldName] = value
}

// Aggregate merges the given stats into the receiver, summing values for
// stat/field pairs present in both maps.
func (fs *fieldStats) Aggregate(stats segment.FieldStats) {
	fetched := stats.Fetch()
	if fetched == nil {
		return
	}
	for statName, fields := range fetched {
		if _, exists := fs.statMap[statName]; !exists {
			fs.statMap[statName] = make(map[string]uint64, len(fields))
		}
		for fieldName, val := range fields {
			// missing keys read as 0, so a separate zero-init is unnecessary
			fs.statMap[statName][fieldName] += val
		}
	}
}

// Fetch returns the underlying stats map (not a copy).
func (fs *fieldStats) Fetch() map[string]map[string]uint64 {
	return fs.statMap
}

// newFieldStats initializes an empty stats map.
func newFieldStats() *fieldStats {
	return &fieldStats{
		statMap: map[string]map[string]uint64{},
	}
}
+3 -1
View File
@@ -28,6 +28,7 @@ import (
zapv13 "github.com/blevesearch/zapx/v13"
zapv14 "github.com/blevesearch/zapx/v14"
zapv15 "github.com/blevesearch/zapx/v15"
zapv16 "github.com/blevesearch/zapx/v16"
)
// SegmentPlugin represents the essential functions required by a package to plug in
@@ -73,7 +74,8 @@ var defaultSegmentPlugin SegmentPlugin
func init() {
ResetSegmentPlugins()
RegisterSegmentPlugin(&zapv15.ZapPlugin{}, true)
RegisterSegmentPlugin(&zapv16.ZapPlugin{}, true)
RegisterSegmentPlugin(&zapv15.ZapPlugin{}, false)
RegisterSegmentPlugin(&zapv14.ZapPlugin{}, false)
RegisterSegmentPlugin(&zapv13.ZapPlugin{}, false)
RegisterSegmentPlugin(&zapv12.ZapPlugin{}, false)
@@ -0,0 +1,158 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package scorch
import (
"bytes"
"context"
"fmt"
"reflect"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
)
// VectorSearchSupportedSegmentVersion is the minimum segment plugin version
// required for vector search; VectorOptimize rejects older versions.
const VectorSearchSupportedSegmentVersion = 16

var reflectStaticSizeIndexSnapshotVectorReader int

// init pre-computes the reflect-derived static size of
// IndexSnapshotVectorReader, used by Size().
func init() {
	var istfr IndexSnapshotVectorReader
	reflectStaticSizeIndexSnapshotVectorReader = int(reflect.TypeOf(istfr).Size())
}
// IndexSnapshotVectorReader iterates the hits of a KNN search for a query
// vector over one IndexSnapshot. Its per-segment postings and iterators
// are populated by OptimizeVR.Finish().
type IndexSnapshotVectorReader struct {
	vector   []float32
	field    string
	k        int64
	snapshot *IndexSnapshot
	// one postings list / iterator per segment in the snapshot
	postings  []segment_api.VecPostingsList
	iterators []segment_api.VecPostingsIterator
	// index of the segment currently being iterated
	segmentOffset int
	currPosting   segment_api.VecPosting
	currID        index.IndexInternalID
	ctx           context.Context
}
// Size estimates the memory footprint of this reader in bytes, including
// its postings lists, iterators and current posting.
func (i *IndexSnapshotVectorReader) Size() int {
	total := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr +
		len(i.vector) + len(i.field) + len(i.currID)

	for _, pl := range i.postings {
		total += pl.Size()
	}
	for _, it := range i.iterators {
		total += it.Size()
	}
	if i.currPosting != nil {
		total += i.currPosting.Size()
	}
	return total
}
// Next returns the next vector hit, walking the per-segment iterators in
// order and translating segment-local doc numbers to snapshot-global IDs.
// preAlloced, when non-nil, is reused for the result. Returns (nil, nil)
// when all iterators are exhausted.
func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
	*index.VectorDoc, error) {
	rv := preAlloced
	if rv == nil {
		rv = &index.VectorDoc{}
	}

	for i.segmentOffset < len(i.iterators) {
		next, err := i.iterators[i.segmentOffset].Next()
		if err != nil {
			return nil, err
		}
		if next != nil {
			// make segment number into global number by adding offset
			globalOffset := i.snapshot.offsets[i.segmentOffset]
			nnum := next.Number()
			rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
			rv.Score = float64(next.Score())

			// remember the position for Advance()
			i.currID = rv.ID
			i.currPosting = next

			return rv, nil
		}
		// current segment exhausted; move to the next one
		i.segmentOffset++
	}

	return nil, nil
}
// Advance positions the reader at the first hit with internal ID >= ID and
// returns it. If the reader has already moved past ID, a fresh reader is
// built from the snapshot and this one is replaced in place before
// advancing. preAlloced, when non-nil, is reused for the result.
func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
	preAlloced *index.VectorDoc) (*index.VectorDoc, error) {

	if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
		i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k)
		if err != nil {
			return nil, err
		}
		// close the current term field reader before replacing it with a new one
		_ = i.Close()
		*i = *(i2.(*IndexSnapshotVectorReader))
	}

	num, err := docInternalToNumber(ID)
	if err != nil {
		return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
	}

	segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
	if segIndex >= len(i.snapshot.segment) {
		return nil, fmt.Errorf("computed segment index %d out of bounds %d",
			segIndex, len(i.snapshot.segment))
	}
	// skip directly to the target segment
	i.segmentOffset = segIndex

	next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
	if err != nil {
		return nil, err
	}
	if next == nil {
		// we jumped directly to the segment that should have contained it
		// but it wasn't there, so reuse Next() which should correctly
		// get the next hit after it (we moved i.segmentOffset)
		return i.Next(preAlloced)
	}

	if preAlloced == nil {
		preAlloced = &index.VectorDoc{}
	}
	// translate the segment-local hit back to a snapshot-global ID
	preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
		i.snapshot.offsets[segIndex])
	i.currID = preAlloced.ID
	i.currPosting = next
	return preAlloced, nil
}
// Count returns the total number of hits summed across all of the
// reader's per-segment postings lists.
func (i *IndexSnapshotVectorReader) Count() uint64 {
	var total uint64
	for _, pl := range i.postings {
		total += pl.Count()
	}
	return total
}
// Close releases resources held by the reader; currently a no-op that
// always returns nil.
func (i *IndexSnapshotVectorReader) Close() error {
	// TODO Consider if any scope of recycling here.
	return nil
}
@@ -39,6 +39,9 @@ type SegmentSnapshot struct {
segment segment.Segment
deleted *roaring.Bitmap
creator string
stats *fieldStats
cachedMeta *cachedMeta
cachedDocs *cachedDocs
}
@@ -282,3 +285,30 @@ func (c *cachedDocs) visitDoc(localDocNum uint64,
c.m.Unlock()
}
// cachedMeta lets the owner of a segment record and cache arbitrary
// segment-specific metadata so it can be reused across calls, saving
// recomputation of the same values.
// For example, searcher creations on the same index snapshot can use this
// struct to fetch the backing index size, which feeds into memory-usage
// estimation when deciding whether to admit a query.
type cachedMeta struct {
	m    sync.RWMutex
	meta map[string]interface{}
}
// updateMeta stores val under the given field key, lazily allocating the
// backing map on first use.
func (c *cachedMeta) updateMeta(field string, val interface{}) {
	c.m.Lock()
	defer c.m.Unlock()

	if c.meta == nil {
		c.meta = make(map[string]interface{})
	}
	c.meta[field] = val
}
// fetchMeta returns the value cached under field, or nil when absent
// (reading a missing key from the map yields the zero value).
func (c *cachedMeta) fetchMeta(field string) interface{} {
	c.m.RLock()
	defer c.m.RUnlock()
	return c.meta[field]
}
@@ -0,0 +1,48 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package scorch
import (
"context"
index "github.com/blevesearch/bleve_index_api"
segment_api "github.com/blevesearch/scorch_segment_api/v2"
)
// VectorReader returns a reader over the k nearest neighbours of the given
// vector within field, spanning every segment in this snapshot.
//
// The postings and iterators slices are sized here — one slot per segment —
// but remain unpopulated: they are filled in later by OptimizeVR's Finish().
func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
	field string, k int64) (
	index.VectorReader, error) {

	// rv is freshly constructed, so its postings/iterators slices can never
	// be non-nil at this point; allocate them directly in the literal rather
	// than guarding with always-true nil checks.
	rv := &IndexSnapshotVectorReader{
		vector:    vector,
		field:     field,
		k:         k,
		snapshot:  is,
		postings:  make([]segment_api.VecPostingsList, len(is.segment)),
		iterators: make([]segment_api.VecPostingsIterator, len(is.segment)),
	}

	return rv, nil
}
+2
View File
@@ -51,6 +51,8 @@ type Stats struct {
TotTermSearchersStarted uint64
TotTermSearchersFinished uint64
TotKNNSearches uint64
TotEventTriggerStarted uint64
TotEventTriggerCompleted uint64
+281 -39
View File
@@ -21,6 +21,8 @@ import (
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/collector"
"github.com/blevesearch/bleve/v2/search/query"
index "github.com/blevesearch/bleve_index_api"
)
@@ -160,13 +162,92 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
if len(i.indexes) < 1 {
return nil, ErrorAliasEmpty
}
if _, ok := ctx.Value(search.PreSearchKey).(bool); ok {
// since preSearchKey is set, it means that the request
// is being executed as part of a preSearch, which
// indicates that this index alias is set as an Index
// in another alias, so we need to do a preSearch search
// and NOT a real search
return preSearchDataSearch(ctx, req, i.indexes...)
}
// at this point we know we are doing a real search
// either after a preSearch is done, or directly
// on the alias
// check if request has preSearchData which would indicate that the
// request has already been preSearched and we can skip the
// preSearch step now, we call an optional function to
// redistribute the preSearchData to the individual indexes
// if necessary
var preSearchData map[string]map[string]interface{}
if req.PreSearchData != nil {
if requestHasKNN(req) {
var err error
preSearchData, err = redistributeKNNPreSearchData(req, i.indexes)
if err != nil {
return nil, err
}
}
}
// short circuit the simple case
if len(i.indexes) == 1 {
if preSearchData != nil {
req.PreSearchData = preSearchData[i.indexes[0].Name()]
}
return i.indexes[0].SearchInContext(ctx, req)
}
return MultiSearch(ctx, req, i.indexes...)
// at this stage we know we have multiple indexes
// check if preSearchData needs to be gathered from all indexes
// before executing the query
var err error
// only perform preSearch if
// - the request does not already have preSearchData
// - the request requires preSearch
var preSearchDuration time.Duration
var sr *SearchResult
if req.PreSearchData == nil && preSearchRequired(req) {
searchStart := time.Now()
preSearchResult, err := preSearch(ctx, req, i.indexes...)
if err != nil {
return nil, err
}
// check if the preSearch result has any errors and if so
// return the search result as is without executing the query
// so that the errors are not lost
if preSearchResult.Status.Failed > 0 || len(preSearchResult.Status.Errors) > 0 {
return preSearchResult, nil
}
// finalize the preSearch result now
finalizePreSearchResult(req, preSearchResult)
// if there are no errors, then merge the data in the preSearch result
// and construct the preSearchData to be used in the actual search
// if the request is satisfied by the preSearch result, then we can
// directly return the preSearch result as the final result
if requestSatisfiedByPreSearch(req) {
sr = finalizeSearchResult(req, preSearchResult)
// no need to run the 2nd phase MultiSearch(..)
} else {
preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes)
if err != nil {
return nil, err
}
}
preSearchDuration = time.Since(searchStart)
}
// check if search result was generated as part of preSearch itself
if sr == nil {
sr, err = MultiSearch(ctx, req, preSearchData, i.indexes...)
if err != nil {
return nil, err
}
}
sr.Took += preSearchDuration
return sr, nil
}
func (i *indexAliasImpl) Fields() ([]string, error) {
@@ -429,22 +510,8 @@ func (i *indexAliasImpl) Swap(in, out []Index) {
// the actual final results.
// Perhaps that part needs to be optional,
// could be slower in remote usages.
func createChildSearchRequest(req *SearchRequest) *SearchRequest {
rv := SearchRequest{
Query: req.Query,
Size: req.Size + req.From,
From: 0,
Highlight: req.Highlight,
Fields: req.Fields,
Facets: req.Facets,
Explain: req.Explain,
Sort: req.Sort.Copy(),
IncludeLocations: req.IncludeLocations,
Score: req.Score,
SearchAfter: req.SearchAfter,
SearchBefore: req.SearchBefore,
}
return &rv
func createChildSearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest {
return copySearchRequest(req, preSearchData)
}
type asyncSearchResult struct {
@@ -453,9 +520,195 @@ type asyncSearchResult struct {
Err error
}
// preSearchRequired reports whether the request needs a preSearch phase
// before the real search can run; currently only KNN requests do.
func preSearchRequired(req *SearchRequest) bool {
	return requestHasKNN(req)
}
// preSearch executes the preSearch phase against the given indexes. A dummy
// request carrying a match-none query is sent (only the preSearchData in the
// responses matters), with PreSearchKey set on the context so that child
// indexes/aliases respond with preSearch data rather than real hits.
func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
	// create a dummy request with a match none query
	// since we only care about the preSearchData in PreSearch
	dummyRequest := &SearchRequest{
		Query: query.NewMatchNoneQuery(),
	}
	newCtx := context.WithValue(ctx, search.PreSearchKey, true)
	if requestHasKNN(req) {
		// carry the KNN portion of the original request into the dummy one
		addKnnToDummyRequest(dummyRequest, req)
	}
	return preSearchDataSearch(newCtx, dummyRequest, indexes...)
}
// finalizeSearchResult converts a preSearch result into the final search
// result when the request is satisfied by the preSearch phase alone, so the
// second (real) MultiSearch phase can be skipped entirely: totals, max score
// and hit numbers are computed over all hits, then pagination is applied.
func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *SearchResult {
	if preSearchResult == nil {
		return nil
	}

	// global values across all hits irrespective of pagination settings
	preSearchResult.Total = uint64(preSearchResult.Hits.Len())
	maxScore := float64(0)
	for i, hit := range preSearchResult.Hits {
		// since we are now using the preSearch result as the final result
		// we can discard the indexNames from the hits as they are no longer
		// relevant.
		hit.IndexNames = nil
		if hit.Score > maxScore {
			maxScore = hit.Score
		}
		hit.HitNumber = uint64(i)
	}
	preSearchResult.MaxScore = maxScore
	// now apply pagination settings
	var reverseQueryExecution bool
	if req.SearchBefore != nil {
		// SearchBefore is implemented by reversing the sort order and
		// treating it as a SearchAfter; the original order is restored below.
		reverseQueryExecution = true
		req.Sort.Reverse()
		req.SearchAfter = req.SearchBefore
	}
	if req.SearchAfter != nil {
		preSearchResult.Hits = collector.FilterHitsBySearchAfter(preSearchResult.Hits, req.Sort, req.SearchAfter)
	}
	preSearchResult.Hits = hitsInCurrentPage(req, preSearchResult.Hits)
	if reverseQueryExecution {
		// reverse the sort back to the original
		req.Sort.Reverse()
		// resort using the original order
		mhs := newSearchHitSorter(req.Sort, preSearchResult.Hits)
		req.SortFunc()(mhs)
		req.SearchAfter = nil
	}
	if req.Explain {
		preSearchResult.Request = req
	}
	return preSearchResult
}
// requestSatisfiedByPreSearch reports whether the preSearch phase alone
// produces the final result set, making the second (real) search phase
// unnecessary.
func requestSatisfiedByPreSearch(req *SearchRequest) bool {
	return requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req)
}
// constructPreSearchData builds the per-index preSearchData map from the
// merged preSearch result: one (initially empty) entry per target index,
// populated with the KNN portion when the request carries KNN.
func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
	rv := make(map[string]map[string]interface{}, len(indexes))
	for _, in := range indexes {
		rv[in.Name()] = make(map[string]interface{})
	}
	if requestHasKNN(req) {
		var err error
		if rv, err = constructKnnPreSearchData(rv, preSearchResult, indexes); err != nil {
			return nil, err
		}
	}
	return rv, nil
}
// preSearchDataSearch fans the preSearch request out to every index
// concurrently and merges the responses via a preSearchResultProcessor.
// Partial results are not allowed in preSearch: any per-index error or
// failure invalidates the processed data, since using it would make the
// real search return incorrect results.
func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
	asyncResults := make(chan *asyncSearchResult, len(indexes))
	// run search on each index in separate go routine
	var waitGroup sync.WaitGroup
	var searchChildIndex = func(in Index, childReq *SearchRequest) {
		rv := asyncSearchResult{Name: in.Name()}
		rv.Result, rv.Err = in.SearchInContext(ctx, childReq)
		asyncResults <- &rv
		waitGroup.Done()
	}
	waitGroup.Add(len(indexes))
	for _, in := range indexes {
		go searchChildIndex(in, createChildSearchRequest(req, nil))
	}

	// on another go routine, close after finished
	go func() {
		waitGroup.Wait()
		close(asyncResults)
	}()

	// the final search result to be returned after combining the preSearch results
	var sr *SearchResult
	// the preSearch result processor
	var prp preSearchResultProcessor
	// error map
	indexErrors := make(map[string]error)

	for asr := range asyncResults {
		if asr.Err == nil {
			// a valid preSearch result
			if prp == nil {
				// first valid preSearch result
				// create a new preSearch result processor
				prp = createPreSearchResultProcessor(req)
			}
			prp.add(asr.Result, asr.Name)
			if sr == nil {
				// first result
				sr = &SearchResult{
					Status: asr.Result.Status,
					Cost:   asr.Result.Cost,
				}
			} else {
				// merge with previous
				sr.Status.Merge(asr.Result.Status)
				sr.Cost += asr.Result.Cost
			}
		} else {
			indexErrors[asr.Name] = asr.Err
		}
	}

	// handle case where no results were successful
	if sr == nil {
		sr = &SearchResult{
			Status: &SearchStatus{
				Errors: make(map[string]error),
			},
		}
	}

	// in preSearch, partial results are not allowed as it can lead to
	// the real search giving incorrect results, and hence the search
	// result is not populated with any of the processed data from
	// the preSearch result processor if there are any errors
	// or the preSearch result status has any failures
	if len(indexErrors) > 0 || sr.Status.Failed > 0 {
		if sr.Status.Errors == nil {
			sr.Status.Errors = make(map[string]error)
		}
		// record each per-index error in the merged status
		for indexName, indexErr := range indexErrors {
			sr.Status.Errors[indexName] = indexErr
			sr.Status.Total++
			sr.Status.Failed++
		}
	} else {
		// no failures anywhere: flush the processor's merged data into sr
		prp.finalize(sr)
	}
	return sr, nil
}
// hitsInCurrentPage sorts the hits (when a sort order is requested) and then
// applies the request's From/Size window, returning only the hits that fall
// inside the current page.
func hitsInCurrentPage(req *SearchRequest, hits []*search.DocumentMatch) []*search.DocumentMatch {
	// sort all hits with the requested order, if any
	if len(req.Sort) > 0 {
		req.SortFunc()(newSearchHitSorter(req.Sort, hits))
	}
	// skip over the first From hits
	if req.From > 0 {
		if len(hits) > req.From {
			hits = hits[req.From:]
		} else {
			hits = search.DocumentMatchCollection{}
		}
	}
	// trim down to at most Size hits
	if req.Size > 0 && len(hits) > req.Size {
		hits = hits[:req.Size]
	}
	return hits
}
// MultiSearch executes a SearchRequest across multiple Index objects,
// then merges the results. The indexes must honor any ctx deadline.
func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
func MultiSearch(ctx context.Context, req *SearchRequest, preSearchData map[string]map[string]interface{}, indexes ...Index) (*SearchResult, error) {
searchStart := time.Now()
asyncResults := make(chan *asyncSearchResult, len(indexes))
@@ -480,7 +733,11 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
waitGroup.Add(len(indexes))
for _, in := range indexes {
go searchChildIndex(in, createChildSearchRequest(req))
var payload map[string]interface{}
if preSearchData != nil {
payload = preSearchData[in.Name()]
}
go searchChildIndex(in, createChildSearchRequest(req, payload))
}
// on another go routine, close after finished
@@ -518,24 +775,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
}
}
sortFunc := req.SortFunc()
// sort all hits with the requested order
if len(req.Sort) > 0 {
sorter := newSearchHitSorter(req.Sort, sr.Hits)
sortFunc(sorter)
}
// now skip over the correct From
if req.From > 0 && len(sr.Hits) > req.From {
sr.Hits = sr.Hits[req.From:]
} else if req.From > 0 {
sr.Hits = search.DocumentMatchCollection{}
}
// now trim to the correct size
if req.Size > 0 && len(sr.Hits) > req.Size {
sr.Hits = sr.Hits[0:req.Size]
}
sr.Hits = hitsInCurrentPage(req, sr.Hits)
// fix up facets
for name, fr := range req.Facets {
@@ -547,14 +787,16 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
req.Sort.Reverse()
// resort using the original order
mhs := newSearchHitSorter(req.Sort, sr.Hits)
sortFunc(mhs)
req.SortFunc()(mhs)
// reset request
req.SearchBefore = req.SearchAfter
req.SearchAfter = nil
}
// fix up original request
sr.Request = req
if req.Explain {
sr.Request = req
}
searchDuration := time.Since(searchStart)
sr.Took = searchDuration
+77 -17
View File
@@ -433,6 +433,25 @@ func memNeededForSearch(req *SearchRequest,
return uint64(estimate)
}
// preSearch serves a request that is executing in preSearch mode against this
// index: only the data a later real search needs is gathered. Currently that
// is the KNN hits (collected with the preSearch flag set); no scoring query
// runs and the result carries only those hits.
func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader index.IndexReader) (*SearchResult, error) {
	var knnHits []*search.DocumentMatch
	var err error
	if requestHasKNN(req) {
		// run only the KNN collector; 'true' marks this as a preSearch pass
		knnHits, err = i.runKnnCollector(ctx, req, reader, true)
		if err != nil {
			return nil, err
		}
	}

	return &SearchResult{
		Status: &SearchStatus{
			Total:      1,
			Successful: 1,
		},
		Hits: knnHits,
	}, nil
}
// SearchInContext executes a search request operation within the provided
// Context. Returns a SearchResult object or an error.
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
@@ -445,6 +464,25 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
return nil, ErrorIndexClosed
}
// open a reader for this search
indexReader, err := i.i.Reader()
if err != nil {
return nil, fmt.Errorf("error opening index reader %v", err)
}
defer func() {
if cerr := indexReader.Close(); err == nil && cerr != nil {
err = cerr
}
}()
if _, ok := ctx.Value(search.PreSearchKey).(bool); ok {
preSearchResult, err := i.preSearch(ctx, req, indexReader)
if err != nil {
return nil, err
}
return preSearchResult, nil
}
var reverseQueryExecution bool
if req.SearchBefore != nil {
reverseQueryExecution = true
@@ -460,16 +498,31 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
coll = collector.NewTopNCollector(req.Size, req.From, req.Sort)
}
// open a reader for this search
indexReader, err := i.i.Reader()
if err != nil {
return nil, fmt.Errorf("error opening index reader %v", err)
}
defer func() {
if cerr := indexReader.Close(); err == nil && cerr != nil {
err = cerr
var knnHits []*search.DocumentMatch
var ok bool
var skipKnnCollector bool
if req.PreSearchData != nil {
for k, v := range req.PreSearchData {
switch k {
case search.KnnPreSearchDataKey:
if v != nil {
knnHits, ok = v.([]*search.DocumentMatch)
if !ok {
return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch")
}
}
skipKnnCollector = true
}
}
}()
}
if !skipKnnCollector && requestHasKNN(req) {
knnHits, err = i.runKnnCollector(ctx, req, indexReader, false)
if err != nil {
return nil, err
}
}
setKnnHitsInCollector(knnHits, req, coll)
// This callback and variable handles the tracking of bytes read
// 1. as part of creation of tfr and its Next() calls which is
@@ -540,14 +593,14 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
if dateTimeParser == nil {
return nil, fmt.Errorf("no date time parser named `%s` registered", dateTimeParserName)
}
start, end, startLayout, endLayout, err := dr.ParseDates(dateTimeParser)
start, end, err := dr.ParseDates(dateTimeParser)
if err != nil {
return nil, fmt.Errorf("ParseDates err: %v, using date time parser named %s", err, dateTimeParserName)
}
if start.IsZero() && end.IsZero() {
return nil, fmt.Errorf("date range query must specify either start, end or both for date range name '%s'", dr.Name)
}
facetBuilder.AddRange(dr.Name, start, end, startLayout, endLayout)
facetBuilder.AddRange(dr.Name, start, end)
}
facetsBuilder.Add(facetName, facetBuilder)
} else {
@@ -605,7 +658,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
var storedFieldsCost uint64
for _, hit := range hits {
if i.name != "" {
// KNN documents will already have their Index value set as part of the knn collector output
// so check if the index is empty and set it to the current index name
if i.name != "" && hit.Index == "" {
hit.Index = i.name
}
err, storedFieldsBytes := LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter)
@@ -638,18 +693,23 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
req.SearchAfter = nil
}
return &SearchResult{
rv := &SearchResult{
Status: &SearchStatus{
Total: 1,
Successful: 1,
},
Request: req,
Hits: hits,
Total: coll.Total(),
MaxScore: coll.MaxScore(),
Took: searchDuration,
Facets: coll.FacetResults(),
}, nil
}
if req.Explain {
rv.Request = req
}
return rv, nil
}
func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest,
@@ -658,9 +718,9 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest,
var totalStoredFieldsBytes uint64
if len(req.Fields) > 0 || highlighter != nil {
doc, err := r.Document(hit.ID)
totalStoredFieldsBytes = doc.StoredFieldsBytes()
if err == nil && doc != nil {
if len(req.Fields) > 0 {
if len(req.Fields) > 0 && hit.Fields == nil {
totalStoredFieldsBytes = doc.StoredFieldsBytes()
fieldsToLoad := deDuplicate(req.Fields)
for _, f := range fieldsToLoad {
doc.VisitFields(func(docF index.Field) {
+58 -15
View File
@@ -50,7 +50,8 @@ type DocumentMapping struct {
StructTagKey string `json:"struct_tag_key,omitempty"`
}
func (dm *DocumentMapping) Validate(cache *registry.Cache) error {
func (dm *DocumentMapping) Validate(cache *registry.Cache,
parentName string, fieldAliasCtx map[string]*FieldMapping) error {
var err error
if dm.DefaultAnalyzer != "" {
_, err := cache.AnalyzerNamed(dm.DefaultAnalyzer)
@@ -58,8 +59,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache) error {
return err
}
}
for _, property := range dm.Properties {
err = property.Validate(cache)
for propertyName, property := range dm.Properties {
newParent := propertyName
if parentName != "" {
newParent = fmt.Sprintf("%s.%s", parentName, propertyName)
}
err = property.Validate(cache, newParent, fieldAliasCtx)
if err != nil {
return err
}
@@ -77,15 +82,25 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache) error {
return err
}
}
switch field.Type {
case "text", "datetime", "number", "boolean", "geopoint", "geoshape", "IP":
default:
return fmt.Errorf("unknown field type: '%s'", field.Type)
err := validateFieldMapping(field, parentName, fieldAliasCtx)
if err != nil {
return err
}
}
return nil
}
// validateFieldType checks that the mapping declares one of the known
// non-vector field types, returning a descriptive error otherwise.
func validateFieldType(field *FieldMapping) error {
	switch field.Type {
	case "text", "datetime", "number", "boolean", "geopoint", "geoshape", "IP":
		return nil
	}
	return fmt.Errorf("field: '%s', unknown field type: '%s'",
		field.Name, field.Type)
}
// analyzerNameForPath attempts to first find the field
// described by this path, then returns the analyzer
// configured for that field
@@ -141,15 +156,20 @@ func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping {
return nil
}
// documentMappingForPath returns the EXACT and closest matches for a sub
// documentMappingForPathElements returns the EXACT and closest matches for a sub
// document or for an explicitly mapped field; the closest most specific
// document mapping could be one that matches part of the provided path.
func (dm *DocumentMapping) documentMappingForPath(path string) (
func (dm *DocumentMapping) documentMappingForPathElements(pathElements []string) (
*DocumentMapping, *DocumentMapping) {
pathElements := decodePath(path)
var pathElementsCopy []string
if len(pathElements) == 0 {
pathElementsCopy = []string{""}
} else {
pathElementsCopy = pathElements
}
current := dm
OUTER:
for i, pathElement := range pathElements {
for i, pathElement := range pathElementsCopy {
if subDocMapping, exists := current.Properties[pathElement]; exists {
current = subDocMapping
continue OUTER
@@ -157,7 +177,7 @@ OUTER:
// no subDocMapping matches this pathElement
// only if this is the last element check for field name
if i == len(pathElements)-1 {
if i == len(pathElementsCopy)-1 {
for _, field := range current.Fields {
if field.Name == pathElement {
break
@@ -170,6 +190,15 @@ OUTER:
return current, current
}
// documentMappingForPath returns the EXACT and closest matches for a sub
// document or for an explicitly mapped field; the closest most specific
// document mapping could be one that matches part of the provided path.
func (dm *DocumentMapping) documentMappingForPath(path string) (
	*DocumentMapping, *DocumentMapping) {
	return dm.documentMappingForPathElements(decodePath(path))
}
// NewDocumentMapping returns a new document mapping
// with all the default values.
func NewDocumentMapping() *DocumentMapping {
@@ -388,9 +417,8 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes
}
func (dm *DocumentMapping) processProperty(property interface{}, path []string, indexes []uint64, context *walkContext) {
pathString := encodePath(path)
// look to see if there is a mapping for this field
subDocMapping, closestDocMapping := dm.documentMappingForPath(pathString)
subDocMapping, closestDocMapping := dm.documentMappingForPathElements(path)
// check to see if we even need to do further processing
if subDocMapping != nil && !subDocMapping.Enabled {
@@ -402,6 +430,8 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
// cannot do anything with the zero value
return
}
pathString := encodePath(path)
propertyType := propertyValue.Type()
switch propertyType.Kind() {
case reflect.String:
@@ -502,9 +532,20 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
dm.walkDocument(property, path, indexes, context)
}
case reflect.Map, reflect.Slice:
var isPropertyVector bool
var isPropertyVectorInitialized bool
if subDocMapping != nil {
for _, fieldMapping := range subDocMapping.Fields {
switch fieldMapping.Type {
case "vector":
processed := fieldMapping.processVector(property, pathString, path,
indexes, context)
if !isPropertyVectorInitialized {
isPropertyVector = processed
isPropertyVectorInitialized = true
} else {
isPropertyVector = isPropertyVector && processed
}
case "geopoint":
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
case "IP":
@@ -517,7 +558,9 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
}
}
}
dm.walkDocument(property, path, indexes, context)
if !isPropertyVector {
dm.walkDocument(property, path, indexes, context)
}
case reflect.Ptr:
if !propertyValue.IsNil() {
switch property := property.(type) {
+26
View File
@@ -69,6 +69,17 @@ type FieldMapping struct {
// the processing of freq/norm details when the default score based relevancy
// isn't needed.
SkipFreqNorm bool `json:"skip_freq_norm,omitempty"`
// Dimensionality of the vector
Dims int `json:"dims,omitempty"`
// Similarity is the similarity algorithm used for scoring
// vector fields.
// See: index.DefaultSimilarityMetric & index.SupportedSimilarityMetrics
Similarity string `json:"similarity,omitempty"`
// Applicable to vector fields only - optimization string
VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"`
}
// NewTextFieldMapping returns a default field mapping for text
@@ -448,6 +459,21 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
case "dims":
err := json.Unmarshal(v, &fm.Dims)
if err != nil {
return err
}
case "similarity":
err := json.Unmarshal(v, &fm.Similarity)
if err != nil {
return err
}
case "vector_index_optimized_for":
err := json.Unmarshal(v, &fm.VectorIndexOptimizedFor)
if err != nil {
return err
}
default:
invalidKeys = append(invalidKeys, k)
}
+31 -2
View File
@@ -174,12 +174,14 @@ func (im *IndexMappingImpl) Validate() error {
if err != nil {
return err
}
err = im.DefaultMapping.Validate(im.cache)
fieldAliasCtx := make(map[string]*FieldMapping)
err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx)
if err != nil {
return err
}
for _, docMapping := range im.TypeMapping {
err = docMapping.Validate(im.cache)
err = docMapping.Validate(im.cache, "", fieldAliasCtx)
if err != nil {
return err
}
@@ -431,6 +433,33 @@ func (im *IndexMappingImpl) FieldAnalyzer(field string) string {
return im.AnalyzerNameForPath(field)
}
// FieldMappingForPath returns the mapping for a specific field 'path'.
// Type mappings are consulted first, then the default mapping; the zero
// FieldMapping is returned when nothing matches.
//
// NOTE(review): only top-level property names are compared against path —
// nested paths such as "a.b" will not match. When a property carries several
// field mappings, the one returned depends on map/slice iteration here;
// confirm callers only rely on this for single-field properties.
func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping {
	if im.TypeMapping != nil {
		for _, v := range im.TypeMapping {
			for field, property := range v.Properties {
				for _, v1 := range property.Fields {
					if field == path {
						// Return field mapping if the name matches the path param.
						return *v1
					}
				}
			}
		}
	}

	for field, property := range im.DefaultMapping.Properties {
		for _, v1 := range property.Fields {
			if field == path {
				// Return field mapping if the name matches the path param.
				return *v1
			}
		}
	}

	return FieldMapping{}
}
// wrapper to satisfy new interface
func (im *IndexMappingImpl) DefaultSearchField() string {
+2
View File
@@ -55,4 +55,6 @@ type IndexMapping interface {
AnalyzerNameForPath(path string) string
AnalyzerNamed(name string) analysis.Analyzer
FieldMappingForPath(path string) FieldMapping
}
+35
View File
@@ -0,0 +1,35 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !vectors
// +build !vectors
package mapping
// NewVectorFieldMapping returns nil in non-vector builds: vector fields are
// only available when compiled with the "vectors" build tag.
func NewVectorFieldMapping() *FieldMapping {
	return nil
}
// processVector is a no-op in non-vector builds; it reports false so the
// caller falls back to regular document walking for this property.
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
	pathString string, path []string, indexes []uint64, context *walkContext) bool {
	return false
}
// -----------------------------------------------------------------------------
// document validation functions
// validateFieldMapping validates a field mapping; without the "vectors"
// build tag only the standard (non-vector) field types are accepted.
func validateFieldMapping(field *FieldMapping, parentName string,
	fieldAliasCtx map[string]*FieldMapping) error {
	return validateFieldType(field)
}
+220
View File
@@ -0,0 +1,220 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package mapping
import (
"fmt"
"reflect"
"github.com/blevesearch/bleve/v2/document"
"github.com/blevesearch/bleve/v2/util"
index "github.com/blevesearch/bleve_index_api"
)
// Min and Max allowed dimensions for a vector field
const (
MinVectorDims = 1
MaxVectorDims = 2048
)
// NewVectorFieldMapping returns a default field mapping for vector fields.
// Vectors are index-only: not stored, excluded from _all, no doc values,
// and freq/norm bookkeeping is skipped.
func NewVectorFieldMapping() *FieldMapping {
	return &FieldMapping{
		Type:         "vector",
		Store:        false,
		Index:        true,
		IncludeInAll: false,
		DocValues:    false,
		SkipFreqNorm: true,
	}
}
// processFlatVector validates a flat (1-D) vector value and converts it to a
// []float32 of exactly dims entries; the bool result is false on any length
// or element-type mismatch.
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
	n := vecV.Len()
	if n != dims {
		return nil, false
	}

	out := make([]float32, n)
	for idx := 0; idx < n; idx++ {
		elem := vecV.Index(idx)
		if !elem.CanInterface() {
			return nil, false
		}
		f, ok := util.ExtractNumericValFloat32(elem.Interface())
		if !ok {
			return nil, false
		}
		out[idx] = f
	}

	return out, true
}
// validate and process a vector
// max supported depth of nesting is 2 ([][]float32)
// Returns the value flattened into a single []float32 — nested vectors are
// concatenated sub-vector by sub-vector — with ok=false on any shape or
// element-type violation.
func processVector(vecI interface{}, dims int) ([]float32, bool) {
	vecV := reflect.ValueOf(vecI)
	if !vecV.IsValid() || vecV.Kind() != reflect.Slice || vecV.Len() == 0 {
		return nil, false
	}

	// Let's examine the first element (head) of the vector.
	// If head is a slice, then vector is nested, otherwise flat.
	head := vecV.Index(0)
	if !head.CanInterface() {
		return nil, false
	}
	headI := head.Interface()
	headV := reflect.ValueOf(headI)
	if !headV.IsValid() {
		return nil, false
	}
	if headV.Kind() != reflect.Slice { // vector is flat
		return processFlatVector(vecV, dims)
	}

	// # process nested vector

	// pre-allocate memory for the flattened vector
	// so that we can use copy() later
	rv := make([]float32, dims*vecV.Len())
	for i := 0; i < vecV.Len(); i++ {
		subVec := vecV.Index(i)
		if !subVec.CanInterface() {
			return nil, false
		}
		subVecI := subVec.Interface()
		subVecV := reflect.ValueOf(subVecI)
		if !subVecV.IsValid() {
			return nil, false
		}
		if subVecV.Kind() != reflect.Slice {
			return nil, false
		}
		// each sub-vector must itself be a valid flat vector of dims entries
		flatVector, ok := processFlatVector(subVecV, dims)
		if !ok {
			return nil, false
		}
		copy(rv[i*dims:(i+1)*dims], flatVector)
	}
	return rv, true
}
// processVector attempts to interpret property as a vector value for this
// field mapping and, when valid, adds a vector field to the document being
// built. It reports whether the property was consumed as a vector; false
// means the caller should fall back to regular traversal.
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
	pathString string, path []string, indexes []uint64, context *walkContext) bool {
	vector, ok := processVector(propertyMightBeVector, fm.Dims)
	// Don't add field to document if vector is invalid
	if !ok {
		return false
	}

	fieldName := getFieldName(pathString, path, fm)
	options := fm.Options()
	field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector,
		fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
	context.doc.AddField(field)

	// "_all" composite field is not applicable for vector field
	context.excludedFromAll = append(context.excludedFromAll, fieldName)
	return true
}
// -----------------------------------------------------------------------------
// document validation functions
// validateFieldMapping dispatches validation by field type: vector fields go
// through the vector/alias validation path, everything else through the
// standard type check.
func validateFieldMapping(field *FieldMapping, parentName string,
	fieldAliasCtx map[string]*FieldMapping) error {
	if field.Type == "vector" {
		return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
	}
	// non-vector field
	return validateFieldType(field)
}
// validateVectorFieldAlias applies defaults to a vector field mapping,
// validates its options, and records it by name in fieldAliasCtx so that any
// later mapping sharing the same name (an alias) must agree on dims and
// similarity. Options that don't apply to vectors (store, _all inclusion,
// term vectors, doc values) are forced to their vector defaults.
func validateVectorFieldAlias(field *FieldMapping, parentName string,
	fieldAliasCtx map[string]*FieldMapping) error {

	if field.Name == "" {
		field.Name = parentName
	}

	if field.Similarity == "" {
		field.Similarity = index.DefaultSimilarityMetric
	}

	if field.VectorIndexOptimizedFor == "" {
		field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
	}
	if _, exists := index.SupportedVectorIndexOptimizations[field.VectorIndexOptimizedFor]; !exists {
		// if an unsupported config is provided, override to default
		field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
	}

	// following fields are not applicable for vector
	// thus, we set them to default values
	field.IncludeInAll = false
	field.IncludeTermVectors = false
	field.Store = false
	field.DocValues = false
	field.SkipFreqNorm = true

	// # If alias is present, validate the field options as per the alias
	// note: reading from a nil map is safe
	if fieldAlias, ok := fieldAliasCtx[field.Name]; ok {
		if field.Dims != fieldAlias.Dims {
			return fmt.Errorf("field: '%s', invalid alias "+
				"(different dimensions %d and %d)", fieldAlias.Name, field.Dims,
				fieldAlias.Dims)
		}

		if field.Similarity != fieldAlias.Similarity {
			return fmt.Errorf("field: '%s', invalid alias "+
				"(different similarity values %s and %s)", fieldAlias.Name,
				field.Similarity, fieldAlias.Similarity)
		}

		return nil
	}

	// # Validate field options

	// bounds are inclusive, so report them as a closed interval;
	// the previous "(%d, %d)" wording wrongly suggested an exclusive range
	if field.Dims < MinVectorDims || field.Dims > MaxVectorDims {
		return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+
			" value should be in range [%d, %d]", field.Name, field.Dims,
			MinVectorDims, MaxVectorDims)
	}

	if _, ok := index.SupportedSimilarityMetrics[field.Similarity]; !ok {
		return fmt.Errorf("field: '%s', invalid similarity "+
			"metric: '%s', valid metrics are: %+v", field.Name, field.Similarity,
			reflect.ValueOf(index.SupportedSimilarityMetrics).MapKeys())
	}

	if fieldAliasCtx != nil { // writing to a nil map is unsafe
		fieldAliasCtx[field.Name] = field
	}

	return nil
}
+24
View File
@@ -0,0 +1,24 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package bleve
import "github.com/blevesearch/bleve/v2/mapping"
// NewVectorFieldMapping returns a new field mapping for vector fields,
// delegating to the mapping package's constructor.
func NewVectorFieldMapping() *mapping.FieldMapping {
	return mapping.NewVectorFieldMapping()
}
+59
View File
@@ -0,0 +1,59 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bleve
// A preSearchResultProcessor processes the data in
// the preSearch result from multiple
// indexes in an alias and merges them together to
// create the final preSearch result.
type preSearchResultProcessor interface {
	// add accumulates the preSearch result obtained from the
	// index with the given name into the processor.
	add(*SearchResult, string)
	// finalize updates the given search result with the finalized
	// data accumulated by the processor.
	finalize(*SearchResult)
}
// knnPreSearchResultProcessor implements preSearchResultProcessor via
// optional callbacks; a nil callback turns the corresponding step into a
// no-op, so the zero value is a do-nothing processor.
type knnPreSearchResultProcessor struct {
	addFn      func(sr *SearchResult, indexName string)
	finalizeFn func(sr *SearchResult)
}

// add forwards to addFn when one is configured.
func (k *knnPreSearchResultProcessor) add(sr *SearchResult, indexName string) {
	if k.addFn == nil {
		return
	}
	k.addFn(sr, indexName)
}

// finalize forwards to finalizeFn when one is configured.
func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) {
	if k.finalizeFn == nil {
		return
	}
	k.finalizeFn(sr)
}
// -----------------------------------------------------------------------------
// finalizePreSearchResult post-processes a preSearch result; currently this
// only finalizes KNN hits, and only when the request actually contains KNN.
func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) {
	if !requestHasKNN(req) {
		return
	}
	preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
}
// createPreSearchResultProcessor picks the processor matching the request:
// a KNN-aware one for KNN requests, otherwise an empty no-op processor.
func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor {
	if !requestHasKNN(req) {
		return &knnPreSearchResultProcessor{} // equivalent to nil
	}
	return newKnnPreSearchResultProcessor(req)
}
+27 -131
View File
@@ -15,7 +15,6 @@
package bleve
import (
"encoding/json"
"fmt"
"reflect"
"sort"
@@ -32,19 +31,19 @@ import (
"github.com/blevesearch/bleve/v2/util"
)
const defaultDateTimeParser = optional.Name
var reflectStaticSizeSearchResult int
var reflectStaticSizeSearchStatus int
func init() {
var sr SearchResult
reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size())
var ss SearchStatus
reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size())
}
var cache = registry.NewCache()
var (
reflectStaticSizeSearchResult int
reflectStaticSizeSearchStatus int
)
func init() {
reflectStaticSizeSearchResult = int(reflect.TypeOf(SearchResult{}).Size())
reflectStaticSizeSearchStatus = int(reflect.TypeOf(SearchStatus{}).Size())
}
const defaultDateTimeParser = optional.Name
type dateTimeRange struct {
Name string `json:"name,omitempty"`
@@ -55,28 +54,24 @@ type dateTimeRange struct {
endString *string
}
func (dr *dateTimeRange) ParseDates(dateTimeParser analysis.DateTimeParser) (start, end time.Time, startLayout, endLayout string, err error) {
func (dr *dateTimeRange) ParseDates(dateTimeParser analysis.DateTimeParser) (start, end time.Time, err error) {
start = dr.Start
startLayout = time.RFC3339Nano
if dr.Start.IsZero() && dr.startString != nil {
s, layout, parseError := dateTimeParser.ParseDateTime(*dr.startString)
s, _, parseError := dateTimeParser.ParseDateTime(*dr.startString)
if parseError != nil {
return start, end, startLayout, endLayout, fmt.Errorf("error parsing start date '%s' for date range name '%s': %v", *dr.startString, dr.Name, parseError)
return start, end, fmt.Errorf("error parsing start date '%s' for date range name '%s': %v", *dr.startString, dr.Name, parseError)
}
start = s
startLayout = layout
}
end = dr.End
endLayout = time.RFC3339Nano
if dr.End.IsZero() && dr.endString != nil {
e, layout, parseError := dateTimeParser.ParseDateTime(*dr.endString)
e, _, parseError := dateTimeParser.ParseDateTime(*dr.endString)
if parseError != nil {
return start, end, startLayout, endLayout, fmt.Errorf("error parsing end date '%s' for date range name '%s': %v", *dr.endString, dr.Name, parseError)
return start, end, fmt.Errorf("error parsing end date '%s' for date range name '%s': %v", *dr.endString, dr.Name, parseError)
}
end = e
endLayout = layout
}
return start, end, startLayout, endLayout, err
return start, end, err
}
func (dr *dateTimeRange) UnmarshalJSON(input []byte) error {
@@ -187,7 +182,7 @@ func (fr *FacetRequest) Validate() error {
if dr.DateTimeParser == "" {
// cannot parse the date range dates as the defaultDateTimeParser is overridden
// so perform this validation at query time
start, end, _, _, err := dr.ParseDates(dateTimeParser)
start, end, err := dr.ParseDates(dateTimeParser)
if err != nil {
return fmt.Errorf("ParseDates err: %v, using date time parser named %s", err, defaultDateTimeParser)
}
@@ -285,51 +280,10 @@ func (h *HighlightRequest) AddField(field string) {
h.Fields = append(h.Fields, field)
}
// A SearchRequest describes all the parameters
// needed to search the index.
// Query is required.
// Size/From describe how much and which part of the
// result set to return.
// Highlight describes optional search result
// highlighting.
// Fields describes a list of field values which
// should be retrieved for result documents, provided they
// were stored while indexing.
// Facets describe the set of facets to be computed.
// Explain triggers inclusion of additional search
// result score explanations.
// Sort describes the desired order for the results to be returned.
// Score controls the kind of scoring performed
// SearchAfter supports deep paging by providing a minimum sort key
// SearchBefore supports deep paging by providing a maximum sort key
// sortFunc specifies the sort implementation to use for sorting results.
//
// A special field named "*" can be used to return all fields.
type SearchRequest struct {
ClientContextID string `json:"client_context_id,omitempty"`
Query query.Query `json:"query"`
Size int `json:"size"`
From int `json:"from"`
Highlight *HighlightRequest `json:"highlight"`
Fields []string `json:"fields"`
Facets FacetsRequest `json:"facets"`
Explain bool `json:"explain"`
Sort search.SortOrder `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score,omitempty"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
sortFunc func(sort.Interface)
}
func (r *SearchRequest) SetClientContextID(id string) {
r.ClientContextID = id
}
func (r *SearchRequest) Validate() error {
if srq, ok := r.Query.(query.ValidatableQuery); ok {
if err := srq.Validate(); err != nil {
err := srq.Validate()
if err != nil {
return err
}
}
@@ -355,6 +309,10 @@ func (r *SearchRequest) Validate() error {
}
}
err := validateKNN(r)
if err != nil {
return err
}
return r.Facets.Validate()
}
@@ -393,69 +351,6 @@ func (r *SearchRequest) SetSearchBefore(before []string) {
r.SearchBefore = before
}
// UnmarshalJSON deserializes a JSON representation of
// a SearchRequest
func (r *SearchRequest) UnmarshalJSON(input []byte) error {
var (
temp struct {
ClientContextID string `json:"client_context_id"`
Q json.RawMessage `json:"query"`
Size *int `json:"size"`
From int `json:"from"`
Highlight *HighlightRequest `json:"highlight"`
Fields []string `json:"fields"`
Facets FacetsRequest `json:"facets"`
Explain bool `json:"explain"`
Sort []json.RawMessage `json:"sort"`
IncludeLocations bool `json:"includeLocations"`
Score string `json:"score"`
SearchAfter []string `json:"search_after"`
SearchBefore []string `json:"search_before"`
}
err error
)
if err = util.UnmarshalJSON(input, &temp); err != nil {
return err
}
if temp.Size == nil {
r.Size = 10
} else {
r.Size = *temp.Size
}
if temp.Sort == nil {
r.Sort = search.SortOrder{&search.SortScore{Desc: true}}
} else {
if r.Sort, err = search.ParseSortOrderJSON(temp.Sort); err != nil {
return err
}
}
r.ClientContextID = temp.ClientContextID
r.From = temp.From
r.Explain = temp.Explain
r.Highlight = temp.Highlight
r.Fields = temp.Fields
r.Facets = temp.Facets
r.IncludeLocations = temp.IncludeLocations
r.Score = temp.Score
r.SearchAfter = temp.SearchAfter
r.SearchBefore = temp.SearchBefore
if r.Query, err = query.ParseQuery(temp.Q); err != nil {
return err
}
if r.Size < 0 {
r.Size = 10
}
if r.From < 0 {
r.From = 0
}
return nil
}
// NewSearchRequest creates a new SearchRequest
// for the Query, using default values for all
// other search parameters.
@@ -491,7 +386,8 @@ func (iem IndexErrMap) MarshalJSON() ([]byte, error) {
func (iem IndexErrMap) UnmarshalJSON(data []byte) error {
var tmp map[string]string
if err := util.UnmarshalJSON(data, &tmp); err != nil {
err := util.UnmarshalJSON(data, &tmp)
if err != nil {
return err
}
for k, v := range tmp {
@@ -541,7 +437,7 @@ func (ss *SearchStatus) Merge(other *SearchStatus) {
// Facets - The facet results for the search.
type SearchResult struct {
Status *SearchStatus `json:"status"`
Request *SearchRequest `json:"request"`
Request *SearchRequest `json:"request,omitempty"`
Hits search.DocumentMatchCollection `json:"hits"`
Total uint64 `json:"total_hits"`
Cost uint64 `json:"cost"`
@@ -571,7 +467,7 @@ func (sr *SearchResult) Size() int {
func (sr *SearchResult) String() string {
rv := ""
if sr.Total > 0 {
if sr.Request.Size > 0 {
if sr.Request != nil && sr.Request.Size > 0 {
rv = fmt.Sprintf("%d matches, showing %d through %d, took %s\n", sr.Total, sr.Request.From+1, sr.Request.From+len(sr.Hits), sr.Took)
for i, hit := range sr.Hits {
rv += fmt.Sprintf("%5d. %s (%f)\n", i+sr.Request.From+1, hit.ID, hit.Score)
+6
View File
@@ -44,9 +44,15 @@ type MakeDocumentMatchHandlerKeyType string
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
"MakeDocumentMatchHandlerKey")
var MakeKNNDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
"MakeKNNDocumentMatchHandlerKey")
// MakeDocumentMatchHandler is an optional DocumentMatchHandler
// builder function which the applications can pass to bleve.
// These builder methods gives a DocumentMatchHandler function
// to bleve, which it will invoke on every document matches.
type MakeDocumentMatchHandler func(ctx *SearchContext) (
callback DocumentMatchHandler, loadID bool, err error)
type MakeKNNDocumentMatchHandler func(ctx *SearchContext) (
callback DocumentMatchHandler, err error)
+4
View File
@@ -69,6 +69,10 @@ func (c *collectStoreHeap) Final(skip int, fixup collectorFixup) (search.Documen
return rv, nil
}
func (c *collectStoreHeap) Internal() search.DocumentMatchCollection {
return c.heap
}
// heap interface implementation
func (c *collectStoreHeap) Len() int {
+262
View File
@@ -0,0 +1,262 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package collector
import (
"context"
"time"
"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
)
// collectStoreKNN maintains one heap per KNN query, each capped at its own
// K value, plus scratch sets used while collecting and finalizing hits.
type collectStoreKNN struct {
	internalHeaps []collectorStore
	kValues       []int64
	allHits       map[*search.DocumentMatch]struct{}
	ejectedDocs   map[*search.DocumentMatch]struct{}
}

// newStoreKNN builds a collectStoreKNN over the given per-query heaps and
// their corresponding K values.
func newStoreKNN(internalHeaps []collectorStore, kValues []int64) *collectStoreKNN {
	store := &collectStoreKNN{
		internalHeaps: internalHeaps,
		kValues:       kValues,
	}
	store.allHits = make(map[*search.DocumentMatch]struct{})
	store.ejectedDocs = make(map[*search.DocumentMatch]struct{})
	return store
}
// AddDocument offers doc to every heap for which it carries a score entry,
// honoring each heap's K limit. It returns the documents that, as a result,
// no longer rank in the top K of ANY heap — those are safe for the caller
// to recycle into the DocumentMatch pool.
func (c *collectStoreKNN) AddDocument(doc *search.DocumentMatch) []*search.DocumentMatch {
	for idx, heap := range c.internalHeaps {
		if _, scored := doc.ScoreBreakdown[idx]; !scored {
			continue
		}
		if ejected := heap.AddNotExceedingSize(doc, int(c.kValues[idx])); ejected != nil {
			// the ejected doc is no longer in the top K for this heap
			delete(ejected.ScoreBreakdown, idx)
			c.ejectedDocs[ejected] = struct{}{}
		}
	}
	var released []*search.DocumentMatch
	for ejected := range c.ejectedDocs {
		// only docs evicted from every heap they were in may be released
		if len(ejected.ScoreBreakdown) == 0 {
			released = append(released, ejected)
		}
		// clear the scratch set so it can be reused by the next call
		delete(c.ejectedDocs, ejected)
	}
	return released
}
// Final deduplicates the hits across all internal heaps (the same document
// may rank in the top K of several KNN queries), applies fixup to each
// surviving hit, and returns the combined collection.
func (c *collectStoreKNN) Final(fixup collectorFixup) (search.DocumentMatchCollection, error) {
	for _, heap := range c.internalHeaps {
		for _, doc := range heap.Internal() {
			// set membership collapses duplicates across heaps
			c.allHits[doc] = struct{}{}
		}
	}
	if len(c.allHits) == 0 {
		return make(search.DocumentMatchCollection, 0), nil
	}
	rv := make(search.DocumentMatchCollection, 0, len(c.allHits))
	for doc := range c.allHits {
		if fixup != nil {
			if err := fixup(doc); err != nil {
				return nil, err
			}
		}
		rv = append(rv, doc)
	}
	return rv, nil
}
// MakeKNNDocMatchHandler returns a DocumentMatchHandler that feeds matches
// into the KNN collector's store and recycles fully-ejected documents back
// into the context's pool. It yields a nil handler (and nil error) when the
// context's collector is not a *KNNCollector.
func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHandler, error) {
	hc, ok := ctx.Collector.(*KNNCollector)
	if !ok {
		return nil, nil
	}
	return func(d *search.DocumentMatch) error {
		if d == nil {
			return nil
		}
		for _, released := range hc.knnStore.AddDocument(d) {
			ctx.DocumentMatchPool.Put(released)
		}
		return nil
	}, nil
}
// GetNewKNNCollectorStore constructs a collectStoreKNN with one store per
// KNN query, each sized by the corresponding entry in kArray and ordered so
// the lowest-scoring document for that query is ejected first.
func GetNewKNNCollectorStore(kArray []int64) *collectStoreKNN {
	// TODO - Check if the datatype of k can be made into an int instead of int64
	heaps := make([]collectorStore, len(kArray))
	for i, k := range kArray {
		pos := i // per-iteration copy captured by the comparator closure
		heaps[pos] = getOptimalCollectorStore(int(k), 0, func(a, b *search.DocumentMatch) int {
			if a.ScoreBreakdown[pos] < b.ScoreBreakdown[pos] {
				return 1
			}
			return -1
		})
	}
	return newStoreKNN(heaps, kArray)
}
// KNNCollector implements the Collector interface for KNN (vector) search.
type KNNCollector struct {
	knnStore *collectStoreKNN // per-KNN-query top-K storage
	size     int              // requested result size; hints pool preallocation
	total    uint64           // number of documents seen by Collect
	took     time.Duration    // wall-clock duration of the Collect call
	results  search.DocumentMatchCollection // finalized hits after Collect
	maxScore float64 // reported by MaxScore; not updated in the code visible here
}
// NewKNNCollector returns a KNNCollector that keeps the top kArray[i] hits
// for the i-th KNN query; size hints DocumentMatch pool preallocation.
func NewKNNCollector(kArray []int64, size int64) *KNNCollector {
	collector := &KNNCollector{}
	collector.knnStore = GetNewKNNCollectorStore(kArray)
	collector.size = int(size)
	return collector
}
// Collect drives the searcher to exhaustion, feeding every match into the
// KNN document match handler, checking for context cancellation along the
// way, and finally resolving external document IDs for the surviving hits.
func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
	startTime := time.Now()
	var err error
	var next *search.DocumentMatch
	// pre-allocate enough space in the DocumentMatchPool
	// unless the sum of K is too large, then cap it
	// everything should still work, just allocates DocumentMatches on demand
	backingSize := hc.size
	if backingSize > PreAllocSizeSkipCap {
		backingSize = PreAllocSizeSkipCap + 1
	}
	searchContext := &search.SearchContext{
		DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), 0),
		Collector:         hc,
		IndexReader:       reader,
	}
	// the application may override the handler builder via the context
	dmHandlerMakerKNN := MakeKNNDocMatchHandler
	if cv := ctx.Value(search.MakeKNNDocumentMatchHandlerKey); cv != nil {
		dmHandlerMakerKNN = cv.(search.MakeKNNDocumentMatchHandler)
	}
	// use the application given builder for making the custom document match
	// handler and perform callbacks/invocations on the newly made handler.
	dmHandler, err := dmHandlerMakerKNN(searchContext)
	if err != nil {
		return err
	}
	// check cancellation once before pulling the first match
	select {
	case <-ctx.Done():
		search.RecordSearchCost(ctx, search.AbortM, 0)
		return ctx.Err()
	default:
		next, err = searcher.Next(searchContext)
	}
	for err == nil && next != nil {
		// re-check the context deadline only every CheckDoneEvery documents
		// to keep the per-document overhead low
		if hc.total%CheckDoneEvery == 0 {
			select {
			case <-ctx.Done():
				search.RecordSearchCost(ctx, search.AbortM, 0)
				return ctx.Err()
			default:
			}
		}
		hc.total++
		err = dmHandler(next)
		if err != nil {
			break
		}
		next, err = searcher.Next(searchContext)
	}
	if err != nil {
		return err
	}
	// help finalize/flush the results in case
	// of custom document match handlers.
	err = dmHandler(nil)
	if err != nil {
		return err
	}
	// compute search duration
	hc.took = time.Since(startTime)
	// finalize actual results
	err = hc.finalizeResults(reader)
	if err != nil {
		return err
	}
	return nil
}
// finalizeResults extracts the final hit collection from the KNN store,
// resolving each hit's external document ID if it wasn't looked up yet.
func (hc *KNNCollector) finalizeResults(r index.IndexReader) error {
	var err error
	hc.results, err = hc.knnStore.Final(func(doc *search.DocumentMatch) error {
		if doc.ID != "" {
			return nil
		}
		// look up the external id since we need it for lookup
		var lookupErr error
		doc.ID, lookupErr = r.ExternalID(doc.IndexInternalID)
		return lookupErr
	})
	return err
}
// Results returns the finalized collection of KNN hits.
func (hc *KNNCollector) Results() search.DocumentMatchCollection {
	return hc.results
}

// Total returns the number of documents the collector has seen.
func (hc *KNNCollector) Total() uint64 {
	return hc.total
}

// MaxScore returns the collector's recorded maximum score.
func (hc *KNNCollector) MaxScore() float64 {
	return hc.maxScore
}

// Took returns the duration of the Collect call.
func (hc *KNNCollector) Took() time.Duration {
	return hc.took
}

// SetFacetsBuilder is a no-op for this collector.
func (hc *KNNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
	// facet unsupported for vector search
}

// FacetResults always returns nil for this collector.
func (hc *KNNCollector) FacetResults() search.FacetResults {
	// facet unsupported for vector search
	return nil
}
+10
View File
@@ -81,6 +81,16 @@ func (c *collectStoreList) Final(skip int, fixup collectorFixup) (search.Documen
return search.DocumentMatchCollection{}, nil
}
func (c *collectStoreList) Internal() search.DocumentMatchCollection {
rv := make(search.DocumentMatchCollection, c.results.Len())
i := 0
for e := c.results.Front(); e != nil; e = e.Next() {
rv[i] = e.Value.(*search.DocumentMatch)
i++
}
return rv
}
func (c *collectStoreList) len() int {
return c.results.Len()
}
+4
View File
@@ -72,6 +72,10 @@ func (c *collectStoreSlice) Final(skip int, fixup collectorFixup) (search.Docume
return search.DocumentMatchCollection{}, nil
}
func (c *collectStoreSlice) Internal() search.DocumentMatchCollection {
return c.slice
}
func (c *collectStoreSlice) len() int {
return len(c.slice)
}
+143 -39
View File
@@ -39,6 +39,9 @@ type collectorStore interface {
AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch
Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)
// Provide access the internal heap implementation
Internal() search.DocumentMatchCollection
}
// PreAllocSizeSkipCap will cap preallocation to this amount when
@@ -72,6 +75,9 @@ type TopNCollector struct {
updateFieldVisitor index.DocValueVisitor
dvReader index.DocValueReader
searchAfter *search.DocumentMatch
knnHits map[string]*search.DocumentMatch
computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc
}
// CheckDoneEvery controls how frequently we check the context deadline
@@ -89,44 +95,16 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
// ordering hits by the provided sort order
func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector {
rv := newTopNCollector(size, 0, sort)
rv.searchAfter = &search.DocumentMatch{
Sort: after,
}
for pos, ss := range sort {
if ss.RequiresDocID() {
rv.searchAfter.ID = after[pos]
}
if ss.RequiresScoring() {
if score, err := strconv.ParseFloat(after[pos], 64); err == nil {
rv.searchAfter.Score = score
}
}
}
rv.searchAfter = createSearchAfterDocument(sort, after)
return rv
}
func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
hc := &TopNCollector{size: size, skip: skip, sort: sort}
// pre-allocate space on the store to avoid reslicing
// unless the size + skip is too large, then cap it
// everything should still work, just reslices as necessary
backingSize := size + skip + 1
if size+skip > PreAllocSizeSkipCap {
backingSize = PreAllocSizeSkipCap + 1
}
if size+skip > 10 {
hc.store = newStoreHeap(backingSize, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
} else {
hc.store = newStoreSlice(backingSize, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
}
hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int {
return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
})
// these lookups traverse an interface, so do once up-front
if sort.RequiresDocID() {
@@ -139,6 +117,59 @@ func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
return hc
}
// createSearchAfterDocument builds a synthetic DocumentMatch representing
// the "search after" cursor: its Sort keys are the given values, and its
// ID/Score are populated from whichever sort positions require them.
func createSearchAfterDocument(sort search.SortOrder, after []string) *search.DocumentMatch {
	doc := &search.DocumentMatch{Sort: after}
	for pos, ss := range sort {
		if ss.RequiresDocID() {
			doc.ID = after[pos]
		}
		if !ss.RequiresScoring() {
			continue
		}
		// unparseable score values are simply left at zero
		if score, err := strconv.ParseFloat(after[pos], 64); err == nil {
			doc.Score = score
		}
	}
	return doc
}
// FilterHitsBySearchAfter drops, in place, every hit that does not sort
// strictly after the position described by `after`, returning the trimmed
// slice. The relative order of the surviving hits is preserved.
func FilterHitsBySearchAfter(hits []*search.DocumentMatch, sort search.SortOrder, after []string) []*search.DocumentMatch {
	if len(hits) == 0 {
		return hits
	}
	// build the synthetic cursor document once
	searchAfter := createSearchAfterDocument(sort, after)
	cachedScoring := sort.CacheIsScore()
	cachedDesc := sort.CacheDescending()
	// compact the slice in place, keeping only hits past the cursor
	kept := hits[:0]
	for _, hit := range hits {
		if sort.Compare(cachedScoring, cachedDesc, hit, searchAfter) > 0 {
			kept = append(kept, hit)
		}
	}
	return kept
}
// getOptimalCollectorStore picks a backing store for a collector: a slice
// for small result windows (size+skip <= 10), a heap otherwise. Backing
// capacity is preallocated but capped at PreAllocSizeSkipCap so oversized
// requests just reslice/allocate on demand.
func getOptimalCollectorStore(size, skip int, comparator collectorCompare) collectorStore {
	window := size + skip
	backingSize := window + 1
	if window > PreAllocSizeSkipCap {
		backingSize = PreAllocSizeSkipCap + 1
	}
	if window <= 10 {
		return newStoreSlice(backingSize, comparator)
	}
	return newStoreHeap(backingSize, comparator)
}
func (hc *TopNCollector) Size() int {
sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr
@@ -215,7 +246,12 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
}
}
err = hc.prepareDocumentMatch(searchContext, reader, next)
err = hc.adjustDocumentMatch(searchContext, reader, next)
if err != nil {
break
}
err = hc.prepareDocumentMatch(searchContext, reader, next, false)
if err != nil {
break
}
@@ -227,6 +263,23 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
next, err = searcher.Next(searchContext)
}
if err != nil {
return err
}
if hc.knnHits != nil {
// we may have some knn hits left that did not match any of the top N tf-idf hits
// we need to add them to the collector store to consider them as well.
for _, knnDoc := range hc.knnHits {
err = hc.prepareDocumentMatch(searchContext, reader, knnDoc, true)
if err != nil {
return err
}
err = dmHandler(knnDoc)
if err != nil {
return err
}
}
}
statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey)
if statsCallbackFn != nil {
@@ -258,12 +311,40 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
var sortByScoreOpt = []string{"_score"}
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext,
reader index.IndexReader, d *search.DocumentMatch) (err error) {
if hc.knnHits != nil {
d.ID, err = reader.ExternalID(d.IndexInternalID)
if err != nil {
return err
}
if knnHit, ok := hc.knnHits[d.ID]; ok {
d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit)
delete(hc.knnHits, d.ID)
}
}
return nil
}
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
reader index.IndexReader, d *search.DocumentMatch, isKnnDoc bool) (err error) {
// visit field terms for features that require it (sort, facets)
if len(hc.neededFields) > 0 {
err = hc.visitFieldTerms(reader, d)
if !isKnnDoc && len(hc.neededFields) > 0 {
err = hc.visitFieldTerms(reader, d, hc.updateFieldVisitor)
if err != nil {
return err
}
} else if isKnnDoc && hc.facetsBuilder != nil {
// we need to visit the field terms for the knn document
// only for those fields that are required for faceting
// and not for sorting. This is because the knn document's
// sort value is already computed in the knn collector.
err = hc.visitFieldTerms(reader, d, func(field string, term []byte) {
if hc.facetsBuilder != nil {
hc.facetsBuilder.UpdateVisitor(field, term)
}
})
if err != nil {
return err
}
@@ -277,9 +358,14 @@ func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
if d.Score > hc.maxScore {
hc.maxScore = d.Score
}
// early exit as the document match had its sort value calculated in the knn
// collector itself
if isKnnDoc {
return nil
}
// see if we need to load ID (at this early stage, for example to sort on it)
if hc.needDocIds {
if hc.needDocIds && d.ID == "" {
d.ID, err = reader.ExternalID(d.IndexInternalID)
if err != nil {
return err
@@ -314,6 +400,7 @@ func MakeTopNDocumentMatchHandler(
// but we want to allow for exact match, so we pretend
hc.searchAfter.HitNumber = d.HitNumber
if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 {
ctx.DocumentMatchPool.Put(d)
return nil
}
}
@@ -353,12 +440,21 @@ func MakeTopNDocumentMatchHandler(
// visitFieldTerms is responsible for visiting the field terms of the
// search hit, and passing visited terms to the sort and facet builder
func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error {
func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch, v index.DocValueVisitor) error {
if hc.facetsBuilder != nil {
hc.facetsBuilder.StartDoc()
}
if d.ID != "" && d.IndexInternalID == nil {
// this document may have been sent over as preSearchData and
// we need to look up the internal id to visit the doc values for it
var err error
d.IndexInternalID, err = reader.InternalID(d.ID)
if err != nil {
return err
}
}
err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
err := hc.dvReader.VisitDocValues(d.IndexInternalID, v)
if hc.facetsBuilder != nil {
hc.facetsBuilder.EndDoc()
}
@@ -435,3 +531,11 @@ func (hc *TopNCollector) FacetResults() search.FacetResults {
}
return nil
}
func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) {
hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits))
for _, hit := range knnHits {
hc.knnHits[hit.ID] = hit
}
hc.computeNewScoreExpl = newScoreExplComputer
}
@@ -17,7 +17,6 @@ package facet
import (
"reflect"
"sort"
"strconv"
"time"
"github.com/blevesearch/bleve/v2/numeric"
@@ -36,10 +35,8 @@ func init() {
}
type dateTimeRange struct {
start time.Time
end time.Time
startLayout string
endLayout string
start time.Time
end time.Time
}
type DateTimeFacetBuilder struct {
@@ -78,12 +75,10 @@ func (fb *DateTimeFacetBuilder) Size() int {
return sizeInBytes
}
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time, startLayout string, endLayout string) {
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
r := dateTimeRange{
start: start,
end: end,
startLayout: startLayout,
endLayout: endLayout,
start: start,
end: end,
}
fb.ranges[name] = &r
}
@@ -139,23 +134,11 @@ func (fb *DateTimeFacetBuilder) Result() *search.FacetResult {
Count: count,
}
if !dateRange.start.IsZero() {
var start string
if dateRange.startLayout == "" {
// layout not set probably means it is probably a timestamp
start = strconv.FormatInt(dateRange.start.UnixNano(), 10)
} else {
start = dateRange.start.Format(dateRange.startLayout)
}
start := dateRange.start.Format(time.RFC3339Nano)
tf.Start = &start
}
if !dateRange.end.IsZero() {
var end string
if dateRange.endLayout == "" {
// layout not set probably means it is probably a timestamp
end = strconv.FormatInt(dateRange.end.UnixNano(), 10)
} else {
end = dateRange.end.Format(dateRange.endLayout)
}
end := dateRange.end.Format(time.RFC3339Nano)
tf.End = &end
}
rv.DateRanges = append(rv.DateRanges, tf)
+15 -3
View File
@@ -321,17 +321,29 @@ func (fr *FacetResult) Merge(other *FacetResult) {
fr.Total += other.Total
fr.Missing += other.Missing
fr.Other += other.Other
if fr.Terms != nil && other.Terms != nil {
if other.Terms != nil {
if fr.Terms == nil {
fr.Terms = other.Terms
return
}
for _, term := range other.Terms.termFacets {
fr.Terms.Add(term)
}
}
if fr.NumericRanges != nil && other.NumericRanges != nil {
if other.NumericRanges != nil {
if fr.NumericRanges == nil {
fr.NumericRanges = other.NumericRanges
return
}
for _, nr := range other.NumericRanges {
fr.NumericRanges = fr.NumericRanges.Add(nr)
}
}
if fr.DateRanges != nil && other.DateRanges != nil {
if other.DateRanges != nil {
if fr.DateRanges == nil {
fr.DateRanges = other.DateRanges
return
}
for _, dr := range other.DateRanges {
fr.DateRanges = fr.DateRanges.Add(dr)
}
+18 -9
View File
@@ -27,10 +27,15 @@ import (
)
type DisjunctionQuery struct {
Disjuncts []Query `json:"disjuncts"`
BoostVal *Boost `json:"boost,omitempty"`
Min float64 `json:"min"`
queryStringMode bool
Disjuncts []Query `json:"disjuncts"`
BoostVal *Boost `json:"boost,omitempty"`
Min float64 `json:"min"`
retrieveScoreBreakdown bool
queryStringMode bool
}
func (q *DisjunctionQuery) RetrieveScoreBreakdown(b bool) {
q.retrieveScoreBreakdown = b
}
// NewDisjunctionQuery creates a new compound Query.
@@ -73,18 +78,22 @@ func (q *DisjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m
}
return nil, err
}
if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode {
// in query string mode, skip match none
continue
if sr != nil {
if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode {
// in query string mode, skip match none
continue
}
ss = append(ss, sr)
}
ss = append(ss, sr)
}
if len(ss) < 1 {
return searcher.NewMatchNoneSearcher(i)
}
return searcher.NewDisjunctionSearcher(ctx, i, ss, q.Min, options)
nctx := context.WithValue(ctx, search.IncludeScoreBreakdownKey, q.retrieveScoreBreakdown)
return searcher.NewDisjunctionSearcher(nctx, i, ss, q.Min, options)
}
func (q *DisjunctionQuery) Validate() error {
+74
View File
@@ -0,0 +1,74 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package query
import (
"context"
"fmt"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/searcher"
index "github.com/blevesearch/bleve_index_api"
)
// KNNQuery is a k-nearest-neighbour query against a single vector field.
type KNNQuery struct {
	// VectorField is the name of the indexed vector field to search.
	VectorField string `json:"field"`
	// Vector is the query vector; must be non-empty (see Searcher).
	Vector []float32 `json:"vector"`
	// K is the number of nearest neighbours to retrieve; must be > 0.
	K int64 `json:"k"`
	// BoostVal optionally boosts this query's score contribution.
	BoostVal *Boost `json:"boost,omitempty"`
}
// NewKNNQuery creates a KNN query for the given query vector.
// K, the target field and an optional boost are set via the setters.
func NewKNNQuery(vector []float32) *KNNQuery {
	return &KNNQuery{Vector: vector}
}
// Field returns the vector field this query targets.
func (q *KNNQuery) Field() string {
	return q.VectorField
}

// SetK sets the number of nearest neighbours to retrieve.
func (q *KNNQuery) SetK(k int64) {
	q.K = k
}

// SetFieldVal sets the vector field this query targets.
func (q *KNNQuery) SetFieldVal(field string) {
	q.VectorField = field
}

// SetBoost sets the boost applied to this query's scores.
func (q *KNNQuery) SetBoost(b float64) {
	boost := Boost(b)
	q.BoostVal = &boost
}

// Boost returns the effective boost value.
// NOTE(review): assumes (*Boost).Value tolerates a nil receiver when no
// boost was set — confirm against the Boost implementation.
func (q *KNNQuery) Boost() float64 {
	return q.BoostVal.Value()
}
// Searcher builds the KNN searcher for this query.
//
// The field's similarity metric is taken from the index mapping, falling
// back to index.DefaultSimilarityMetric when the mapping does not specify
// one. Returns an error when K <= 0 or the query vector is empty.
func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader,
	m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
	// Validate the cheap invariants up front, before consulting the
	// mapping — previously an invalid query still paid for the mapping
	// lookup before being rejected.
	if q.K <= 0 || len(q.Vector) == 0 {
		return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty")
	}
	fieldMapping := m.FieldMappingForPath(q.VectorField)
	similarityMetric := fieldMapping.Similarity
	if similarityMetric == "" {
		// fall back to the index-wide default metric
		similarityMetric = index.DefaultSimilarityMetric
	}
	return searcher.NewKNNSearcher(ctx, i, m, options, q.VectorField,
		q.Vector, q.K, q.BoostVal.Value(), similarityMetric)
}
+41
View File
@@ -65,14 +65,55 @@ type ValidatableQuery interface {
Validate() error
}
// ParsePreSearchData deserializes a JSON representation of
// a PreSearchData object.
// Only keys recognized below (currently just search.KnnPreSearchDataKey)
// are copied into the result; unknown keys are silently ignored.
func ParsePreSearchData(input []byte) (map[string]interface{}, error) {
	var rv map[string]interface{}

	var tmp map[string]json.RawMessage
	err := util.UnmarshalJSON(input, &tmp)
	if err != nil {
		return nil, err
	}

	for k, v := range tmp {
		switch k {
		case search.KnnPreSearchDataKey:
			// the value is the list of document matches produced by the
			// KNN pre-search phase
			var value []*search.DocumentMatch
			if v != nil {
				err := util.UnmarshalJSON(v, &value)
				if err != nil {
					return nil, err
				}
			}

			// lazily allocate the result map only once a known key is seen,
			// so unrecognized-only input yields a nil map
			if rv == nil {
				rv = make(map[string]interface{})
			}
			rv[search.KnnPreSearchDataKey] = value
		}
	}
	return rv, nil
}
// ParseQuery deserializes a JSON representation of
// a Query object.
func ParseQuery(input []byte) (Query, error) {
if len(input) == 0 {
// interpret as a match_none query
return NewMatchNoneQuery(), nil
}
var tmp map[string]interface{}
err := util.UnmarshalJSON(input, &tmp)
if err != nil {
return nil, err
}
if len(tmp) == 0 {
// interpret as a match_none query
return NewMatchNoneQuery(), nil
}
_, hasFuzziness := tmp["fuzziness"]
_, isMatchQuery := tmp["match"]
_, isMatchPhraseQuery := tmp["match_phrase"]
+36 -31
View File
@@ -37,6 +37,7 @@ type ConstantScorer struct {
queryNorm float64
queryWeight float64
queryWeightExplanation *search.Explanation
includeScore bool
}
func (s *ConstantScorer) Size() int {
@@ -51,10 +52,11 @@ func (s *ConstantScorer) Size() int {
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
rv := ConstantScorer{
options: options,
queryWeight: 1.0,
constant: constant,
boost: boost,
options: options,
queryWeight: 1.0,
constant: constant,
boost: boost,
includeScore: options.Score != "none",
}
return &rv
@@ -92,35 +94,38 @@ func (s *ConstantScorer) SetQueryNorm(qnorm float64) {
func (s *ConstantScorer) Score(ctx *search.SearchContext, id index.IndexInternalID) *search.DocumentMatch {
var scoreExplanation *search.Explanation
score := s.constant
if s.options.Explain {
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("ConstantScore()"),
}
}
// if the query weight isn't 1, multiply
if s.queryWeight != 1.0 {
score = score * s.queryWeight
if s.options.Explain {
childExplanations := make([]*search.Explanation, 2)
childExplanations[0] = s.queryWeightExplanation
childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(^%f), product of:", s.boost),
Children: childExplanations,
}
}
}
rv := ctx.DocumentMatchPool.Get()
rv.IndexInternalID = id
rv.Score = score
if s.options.Explain {
rv.Expl = scoreExplanation
if s.includeScore {
score := s.constant
if s.options.Explain {
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("ConstantScore()"),
}
}
// if the query weight isn't 1, multiply
if s.queryWeight != 1.0 {
score = score * s.queryWeight
if s.options.Explain {
childExplanations := make([]*search.Explanation, 2)
childExplanations[0] = s.queryWeightExplanation
childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(^%f), product of:", s.boost),
Children: childExplanations,
}
}
}
rv.Score = score
if s.options.Explain {
rv.Expl = scoreExplanation
}
}
return rv
@@ -81,3 +81,43 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
return rv
}
// ScoreAndExplBreakdown is used only when a disjunction searcher is built
// over multiple KNN searchers, where only the per-sub-searcher score
// breakdown (and optional explanation breakdown) is required. The final
// score and explanation are set later, when the KNN hits are finalized.
//
// matchingIdxs gives the position of each constituent within the searcher
// list; originalPositions (non-nil only for the slice searcher) maps those
// sorted positions back to the caller-visible, pre-sort positions.
// countTotal is the total number of sub-searchers in the disjunction.
func (s *DisjunctionQueryScorer) ScoreAndExplBreakdown(ctx *search.SearchContext, constituents []*search.DocumentMatch,
	matchingIdxs []int, originalPositions []int, countTotal int) *search.DocumentMatch {

	scoreBreakdown := make(map[int]float64)
	var childrenExplanations []*search.Explanation
	if s.options.Explain {
		// since we want to notify which expl belongs to which matched searcher within the disjunction searcher
		childrenExplanations = make([]*search.Explanation, countTotal)
	}

	for i, docMatch := range constituents {
		var index int
		if originalPositions != nil {
			// scorer used in disjunction slice searcher: translate the sorted
			// position to the original sub-query position
			index = originalPositions[matchingIdxs[i]]
		} else {
			// scorer used in disjunction heap searcher
			index = matchingIdxs[i]
		}
		scoreBreakdown[index] = docMatch.Score
		if s.options.Explain {
			childrenExplanations[index] = docMatch.Expl
		}
	}
	var explBreakdown *search.Explanation
	if s.options.Explain {
		explBreakdown = &search.Explanation{Children: childrenExplanations}
	}
	// reuse the first constituent as the returned match, merging locations
	// from the remaining constituents into it
	rv := constituents[0]
	rv.ScoreBreakdown = scoreBreakdown
	rv.Expl = explBreakdown
	rv.FieldTermLocations = search.MergeFieldTermLocations(
		rv.FieldTermLocations, constituents[1:])
	return rv
}
+156
View File
@@ -0,0 +1,156 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package scorer
import (
"fmt"
"math"
"reflect"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)
// reflectStaticSizeKNNQueryScorer caches the static struct size of
// KNNQueryScorer, computed once at init, for use in Size().
var reflectStaticSizeKNNQueryScorer int

func init() {
	var sqs KNNQueryScorer
	reflectStaticSizeKNNQueryScorer = int(reflect.TypeOf(sqs).Size())
}
// KNNQueryScorer scores hits produced by a KNN (vector) searcher,
// translating the reader-reported value into a tf-idf-compatible score.
type KNNQueryScorer struct {
	queryVector []float32 // the query vector (sized into Size(), not logged)
	queryField  string    // vector field being searched
	queryWeight float64   // boost * queryNorm; 1.0 until SetQueryNorm runs
	queryBoost  float64
	queryNorm   float64
	options     search.SearcherOptions
	// similarityMetric selects how raw reader scores are translated in
	// Score (euclidean distances are inverted; see index.EuclideanDistance).
	similarityMetric       string
	queryWeightExplanation *search.Explanation
}
// Size estimates the scorer's memory footprint in bytes, including the
// query vector, field name and any cached weight explanation.
func (s *KNNQueryScorer) Size() int {
	sizeInBytes := reflectStaticSizeKNNQueryScorer + size.SizeOfPtr +
		(len(s.queryVector) * size.SizeOfFloat32) + len(s.queryField)

	if s.queryWeightExplanation != nil {
		sizeInBytes += s.queryWeightExplanation.Size()
	}

	return sizeInBytes
}
// NewKNNQueryScorer creates a scorer for hits of a KNN search over
// queryField. similarityMetric controls how raw reader scores are
// translated in Score; queryWeight starts at 1.0 until SetQueryNorm runs.
func NewKNNQueryScorer(queryVector []float32, queryField string, queryBoost float64,
	options search.SearcherOptions,
	similarityMetric string) *KNNQueryScorer {
	return &KNNQueryScorer{
		queryVector:      queryVector,
		queryField:       queryField,
		queryBoost:       queryBoost,
		queryWeight:      1.0,
		options:          options,
		similarityMetric: similarityMetric,
	}
}
// maxKNNScore is the score used when knnMatch.Score = 0 ->
// the query and indexed vector are exactly the same (a euclidean distance
// of zero cannot be inverted, so an effectively-infinite score is used).
const maxKNNScore = math.MaxFloat32

// Score converts a raw KNN match into a DocumentMatch drawn from the
// context's pool, translating euclidean distances into tf-idf-compatible
// scores and attaching an explanation when requested.
func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext,
	knnMatch *index.VectorDoc) *search.DocumentMatch {
	rv := ctx.DocumentMatchPool.Get()

	var scoreExplanation *search.Explanation
	score := knnMatch.Score
	if sqs.similarityMetric == index.EuclideanDistance {
		// in case of euclidean distance being the distance metric,
		// an exact vector (perfect match), would return distance = 0
		if score == 0 {
			score = maxKNNScore
		} else {
			// euclidean distances need to be inverted to work with
			// tf-idf scoring
			score = 1.0 / score
		}
	}

	if sqs.options.Explain {
		scoreExplanation = &search.Explanation{
			Value: score,
			Message: fmt.Sprintf("fieldWeight(%s in doc %s), score of:",
				sqs.queryField, knnMatch.ID),
			Children: []*search.Explanation{
				{
					Value: score,
					Message: fmt.Sprintf("vector(field(%s:%s) with similarity_metric(%s)=%e",
						sqs.queryField, knnMatch.ID, sqs.similarityMetric, score),
				},
			},
		}
	}

	// if the query weight isn't 1, multiply
	// (an exact-match maxKNNScore is deliberately left un-scaled)
	if sqs.queryWeight != 1.0 && score != maxKNNScore {
		score = score * sqs.queryWeight
		if sqs.options.Explain {
			scoreExplanation = &search.Explanation{
				Value: score,
				// Product of score * weight
				// Avoid adding the query vector to the explanation since vectors
				// can get quite large.
				Message: fmt.Sprintf("weight(%s:query Vector^%f in %s), product of:",
					sqs.queryField, sqs.queryBoost, knnMatch.ID),
				Children: []*search.Explanation{sqs.queryWeightExplanation, scoreExplanation},
			}
		}
	}

	rv.Score = score
	if sqs.options.Explain {
		rv.Expl = scoreExplanation
	}
	rv.IndexInternalID = append(rv.IndexInternalID, knnMatch.ID...)

	return rv
}
// Weight returns boost², the scorer's raw contribution used during query
// normalization.
func (sqs *KNNQueryScorer) Weight() float64 {
	return sqs.queryBoost * sqs.queryBoost
}

// SetQueryNorm records the query normalization factor, recomputes
// queryWeight = boost * norm, and builds the weight explanation when
// explanations were requested.
func (sqs *KNNQueryScorer) SetQueryNorm(qnorm float64) {
	sqs.queryNorm = qnorm

	// update the query weight
	sqs.queryWeight = sqs.queryBoost * sqs.queryNorm

	if sqs.options.Explain {
		childrenExplanations := make([]*search.Explanation, 2)
		childrenExplanations[0] = &search.Explanation{
			Value:   sqs.queryBoost,
			Message: "boost",
		}
		childrenExplanations[1] = &search.Explanation{
			Value:   sqs.queryNorm,
			Message: "queryNorm",
		}
		sqs.queryWeightExplanation = &search.Explanation{
			Value: sqs.queryWeight,
			Message: fmt.Sprintf("queryWeight(%s:query Vector^%f), product of:",
				sqs.queryField, sqs.queryBoost),
			Children: childrenExplanations,
		}
	}
}
+18 -2
View File
@@ -147,7 +147,7 @@ type DocumentMatch struct {
Index string `json:"index,omitempty"`
ID string `json:"id"`
IndexInternalID index.IndexInternalID `json:"-"`
Score float64 `json:"score"`
Score float64 `json:"score,omitempty"`
Expl *Explanation `json:"explanation,omitempty"`
Locations FieldTermLocationMap `json:"locations,omitempty"`
Fragments FieldFragmentMap `json:"fragments,omitempty"`
@@ -173,6 +173,22 @@ type DocumentMatch struct {
// not all sub-queries matched
// if false, all the sub-queries matched
PartialMatch bool `json:"partial_match,omitempty"`
// used to indicate the sub-scores that combined to form the
// final score for this document match. This is only populated
// when the search request's query is a DisjunctionQuery
// or a ConjunctionQuery. The map key is the index of the sub-query
// in the DisjunctionQuery or ConjunctionQuery. The map value is the
// sub-score for that sub-query.
ScoreBreakdown map[int]float64 `json:"score_breakdown,omitempty"`
// internal variable used in PreSearch phase of search in alias
// to indicate the name of the index that this match came from.
// used in knn search.
// it is a stack of index names, the top of the stack is the name
// of the index that this match came from
// of the current alias view, used in alias of aliases scenario
IndexNames []string `json:"index_names,omitempty"`
}
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
@@ -334,7 +350,7 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
}
func (dm *DocumentMatch) String() string {
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score)
}
type DocumentMatchCollection []*DocumentMatch
+53
View File
@@ -0,0 +1,53 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package searcher
import (
"context"
"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
)
// optimizeKNN threads a single VectorOptimizableContext through every KNN
// searcher in qsearchers so that they can share one open vector index
// rather than each opening and searching it individually. Non-KNN
// searchers are skipped.
// indexReader is unused here; the parameter keeps the signature identical
// to the no-op non-"vectors" build of this function.
func optimizeKNN(ctx context.Context, indexReader index.IndexReader,
	qsearchers []search.Searcher) error {
	var octx index.VectorOptimizableContext
	var err error

	for _, searcher := range qsearchers {
		// Only applicable to KNN Searchers.
		o, ok := searcher.(index.VectorOptimizable)
		if !ok {
			continue
		}

		octx, err = o.VectorOptimize(ctx, octx)
		if err != nil {
			return err
		}
	}

	// No KNN searchers.
	if octx == nil {
		return nil
	}

	// Postings lists and iterators replaced in the pointer to the
	// vector reader
	return octx.Finish()
}
@@ -0,0 +1,31 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !vectors
// +build !vectors
package searcher
import (
"context"
"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
)
// optimizeKNN is a no-op in builds without the "vectors" tag; it exists so
// callers (e.g. the disjunction searcher) can invoke it unconditionally.
func optimizeKNN(ctx context.Context, indexReader index.IndexReader,
	qsearchers []search.Searcher) error {
	// No-op
	return nil
}
@@ -33,3 +33,23 @@ func (otrl OrderedSearcherList) Less(i, j int) bool {
func (otrl OrderedSearcherList) Swap(i, j int) {
otrl[i], otrl[j] = otrl[j], otrl[i]
}
// OrderedPositionalSearcherList sorts searchers (by ascending Count, via
// sort.Interface) while tracking each searcher's original pre-sort position
// in index, so per-searcher results can later be mapped back to the
// caller-visible sub-query positions.
type OrderedPositionalSearcherList struct {
	searchers []search.Searcher
	index     []int
}

// sort.Interface
// (receiver renamed from "otrl" — copy-paste residue from
// OrderedSearcherList — to match the type's own initials)

// Len reports the number of searchers in the list.
func (opsl OrderedPositionalSearcherList) Len() int {
	return len(opsl.searchers)
}

// Less orders searchers by ascending match count.
func (opsl OrderedPositionalSearcherList) Less(i, j int) bool {
	return opsl.searchers[i].Count() < opsl.searchers[j].Count()
}

// Swap exchanges both the searchers and their tracked original positions,
// keeping the two slices in lock-step.
func (opsl OrderedPositionalSearcherList) Swap(i, j int) {
	opsl.searchers[i], opsl.searchers[j] = opsl.searchers[j], opsl.searchers[i]
	opsl.index[i], opsl.index[j] = opsl.index[j], opsl.index[i]
}
+15 -15
View File
@@ -35,7 +35,7 @@ func init() {
type ConjunctionSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
searchers []search.Searcher
queryNorm float64
currs []*search.DocumentMatch
maxIDIdx int
@@ -88,6 +88,20 @@ func NewConjunctionSearcher(ctx context.Context, indexReader index.IndexReader,
return &rv, nil
}
// computeQueryNorm computes queryNorm = 1/sqrt(sum of sub-searcher weights)
// and pushes it down to every sub-searcher (standard tf-idf query
// normalization). Note: a zero weight sum yields queryNorm = +Inf.
func (s *ConjunctionSearcher) computeQueryNorm() {
	// first calculate sum of squared weights
	sumOfSquaredWeights := 0.0
	for _, searcher := range s.searchers {
		sumOfSquaredWeights += searcher.Weight()
	}

	// now compute query norm from this
	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
	// finally tell all the downstream searchers the norm
	for _, searcher := range s.searchers {
		searcher.SetQueryNorm(s.queryNorm)
	}
}
func (s *ConjunctionSearcher) Size() int {
sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr +
s.scorer.Size()
@@ -105,20 +119,6 @@ func (s *ConjunctionSearcher) Size() int {
return sizeInBytes
}
func (s *ConjunctionSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
@@ -46,15 +46,31 @@ func optionsDisjunctionOptimizable(options search.SearcherOptions) bool {
func newDisjunctionSearcher(ctx context.Context, indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (search.Searcher, error) {
// attempt the "unadorned" disjunction optimization only when we
// do not need extra information like freq-norm's or term vectors
// and the requested min is simple
if len(qsearchers) > 1 && min <= 1 &&
optionsDisjunctionOptimizable(options) {
rv, err := optimizeCompositeSearcher(ctx, "disjunction:unadorned",
indexReader, qsearchers, options)
if err != nil || rv != nil {
return rv, err
var disjOverKNN bool
if ctx != nil {
disjOverKNN, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool)
}
if disjOverKNN {
// The KNN Searcher optimization is a necessary pre-req for the KNN Searchers,
// not an optional optimization like for, say term searchers.
// It's an optimization to repeat search an open vector index when applicable,
// rather than individually opening and searching a vector index.
err := optimizeKNN(ctx, indexReader, qsearchers)
if err != nil {
return nil, err
}
} else {
// attempt the "unadorned" disjunction optimization only when we
// do not need extra information like freq-norm's or term vectors
// and the requested min is simple
if len(qsearchers) > 1 && min <= 1 &&
optionsDisjunctionOptimizable(options) {
rv, err := optimizeCompositeSearcher(ctx, "disjunction:unadorned",
indexReader, qsearchers, options)
if err != nil || rv != nil {
return rv, err
}
}
}
@@ -39,22 +39,25 @@ func init() {
}
type SearcherCurr struct {
searcher search.Searcher
curr *search.DocumentMatch
searcher search.Searcher
curr *search.DocumentMatch
matchingIdx int
}
type DisjunctionHeapSearcher struct {
indexReader index.IndexReader
numSearchers int
scorer *scorer.DisjunctionQueryScorer
min int
queryNorm float64
initialized bool
searchers []search.Searcher
heap []*SearcherCurr
numSearchers int
scorer *scorer.DisjunctionQueryScorer
min int
queryNorm float64
retrieveScoreBreakdown bool
initialized bool
searchers []search.Searcher
heap []*SearcherCurr
matching []*search.DocumentMatch
matchingIdxs []int
matchingCurrs []*SearcherCurr
bytesRead uint64
@@ -67,22 +70,42 @@ func newDisjunctionHeapSearcher(ctx context.Context, indexReader index.IndexRead
if limit && tooManyClauses(len(searchers)) {
return nil, tooManyClausesErr("", len(searchers))
}
var retrieveScoreBreakdown bool
if ctx != nil {
retrieveScoreBreakdown, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool)
}
// build our searcher
rv := DisjunctionHeapSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingCurrs: make([]*SearcherCurr, len(searchers)),
heap: make([]*SearcherCurr, 0, len(searchers)),
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingCurrs: make([]*SearcherCurr, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
retrieveScoreBreakdown: retrieveScoreBreakdown,
heap: make([]*SearcherCurr, 0, len(searchers)),
}
rv.computeQueryNorm()
return &rv, nil
}
// computeQueryNorm computes queryNorm = 1/sqrt(sum of sub-searcher weights)
// and pushes it down to every sub-searcher (standard tf-idf query
// normalization). Note: a zero weight sum yields queryNorm = +Inf.
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
	// first calculate sum of squared weights
	sumOfSquaredWeights := 0.0
	for _, searcher := range s.searchers {
		sumOfSquaredWeights += searcher.Weight()
	}

	// now compute query norm from this
	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
	// finally tell all the downstream searchers the norm
	for _, searcher := range s.searchers {
		searcher.SetQueryNorm(s.queryNorm)
	}
}
func (s *DisjunctionHeapSearcher) Size() int {
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr +
s.scorer.Size()
@@ -101,24 +124,11 @@ func (s *DisjunctionHeapSearcher) Size() int {
// since searchers and document matches already counted above
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
return sizeInBytes
}
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
// alloc a single block of SearcherCurrs
block := make([]SearcherCurr, len(s.searchers))
@@ -132,6 +142,7 @@ func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error
if curr != nil {
block[i].searcher = searcher
block[i].curr = curr
block[i].matchingIdx = i
heap.Push(s, &block[i])
}
}
@@ -147,6 +158,7 @@ func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error
func (s *DisjunctionHeapSearcher) updateMatches() error {
matching := s.matching[:0]
matchingCurrs := s.matchingCurrs[:0]
matchingIdxs := s.matchingIdxs[:0]
if len(s.heap) > 0 {
@@ -154,17 +166,20 @@ func (s *DisjunctionHeapSearcher) updateMatches() error {
next := heap.Pop(s).(*SearcherCurr)
matching = append(matching, next.curr)
matchingCurrs = append(matchingCurrs, next)
matchingIdxs = append(matchingIdxs, next.matchingIdx)
// now as long as top of heap matches, keep popping
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 {
next = heap.Pop(s).(*SearcherCurr)
matching = append(matching, next.curr)
matchingCurrs = append(matchingCurrs, next)
matchingIdxs = append(matchingIdxs, next.matchingIdx)
}
}
s.matching = matching
s.matchingCurrs = matchingCurrs
s.matchingIdxs = matchingIdxs
return nil
}
@@ -197,10 +212,16 @@ func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
partialMatch := len(s.matching) != len(s.searchers)
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
rv.PartialMatch = partialMatch
if s.retrieveScoreBreakdown {
// just return score and expl breakdown here, since it is a disjunction over knn searchers,
// and the final score and expl is calculated in the knn collector
rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, nil, s.numSearchers)
} else {
// score this match
partialMatch := len(s.matching) != len(s.searchers)
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
rv.PartialMatch = partialMatch
}
}
// invoke next on all the matching searchers
@@ -34,17 +34,19 @@ func init() {
}
type DisjunctionSliceSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
numSearchers int
queryNorm float64
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
bytesRead uint64
indexReader index.IndexReader
searchers []search.Searcher
originalPos []int
numSearchers int
queryNorm float64
retrieveScoreBreakdown bool
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
bytesRead uint64
}
func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexReader,
@@ -54,21 +56,45 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexRea
if limit && tooManyClauses(len(qsearchers)) {
return nil, tooManyClausesErr("", len(qsearchers))
}
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
var searchers OrderedSearcherList
var originalPos []int
var retrieveScoreBreakdown bool
if ctx != nil {
retrieveScoreBreakdown, _ = ctx.Value(search.IncludeScoreBreakdownKey).(bool)
}
// sort the searchers
sort.Sort(sort.Reverse(searchers))
// build our searcher
if retrieveScoreBreakdown {
// needed only when kNN is in picture
sortedSearchers := &OrderedPositionalSearcherList{
searchers: make([]search.Searcher, len(qsearchers)),
index: make([]int, len(qsearchers)),
}
for i, searcher := range qsearchers {
sortedSearchers.searchers[i] = searcher
sortedSearchers.index[i] = i
}
sort.Sort(sortedSearchers)
searchers = sortedSearchers.searchers
originalPos = sortedSearchers.index
} else {
searchers = make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
searchers[i] = searcher
}
sort.Sort(searchers)
}
rv := DisjunctionSliceSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
indexReader: indexReader,
searchers: searchers,
originalPos: originalPos,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
retrieveScoreBreakdown: retrieveScoreBreakdown,
matching: make([]*search.DocumentMatch, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
}
@@ -76,6 +102,20 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexRea
return &rv, nil
}
// computeQueryNorm computes queryNorm = 1/sqrt(sum of sub-searcher weights)
// and pushes it down to every sub-searcher (standard tf-idf query
// normalization). Note: a zero weight sum yields queryNorm = +Inf.
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
	// first calculate sum of squared weights
	sumOfSquaredWeights := 0.0
	for _, searcher := range s.searchers {
		sumOfSquaredWeights += searcher.Weight()
	}

	// now compute query norm from this
	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
	// finally tell all the downstream searchers the norm
	for _, searcher := range s.searchers {
		searcher.SetQueryNorm(s.queryNorm)
	}
}
func (s *DisjunctionSliceSearcher) Size() int {
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
s.scorer.Size()
@@ -97,24 +137,11 @@ func (s *DisjunctionSliceSearcher) Size() int {
}
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
sizeInBytes += len(s.originalPos) * size.SizeOfInt
return sizeInBytes
}
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
}
}
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
var err error
// get all searchers pointing at their first match
@@ -197,10 +224,16 @@ func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
found = true
partialMatch := len(s.matching) != len(s.searchers)
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
rv.PartialMatch = partialMatch
if s.retrieveScoreBreakdown {
// just return score and expl breakdown here, since it is a disjunction over knn searchers,
// and the final score and expl is calculated in the knn collector
rv = s.scorer.ScoreAndExplBreakdown(ctx, s.matching, s.matchingIdxs, s.originalPos, s.numSearchers)
} else {
// score this match
partialMatch := len(s.matching) != len(s.searchers)
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
rv.PartialMatch = partialMatch
}
}
// invoke next on all the matching searchers
+142
View File
@@ -0,0 +1,142 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package searcher
import (
"context"
"reflect"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/scorer"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)
// reflectStaticSizeKNNSearcher caches the static struct size of
// KNNSearcher, computed once at init, for use in Size().
var reflectStaticSizeKNNSearcher int

func init() {
	var ks KNNSearcher
	reflectStaticSizeKNNSearcher = int(reflect.TypeOf(ks).Size())
}
// KNNSearcher streams the nearest neighbours of a query vector from a
// single vector field, scoring each hit with a KNNQueryScorer.
type KNNSearcher struct {
	field        string    // vector field being searched
	vector       []float32 // the query vector
	k            int64     // number of neighbours requested from the reader
	indexReader  index.IndexReader
	vectorReader index.VectorReader // source of the raw KNN matches
	scorer       *scorer.KNNQueryScorer
	count        uint64
	vd           index.VectorDoc // scratch doc, Reset() and reused each Next
}
// NewKNNSearcher builds a searcher over the given vector field using the
// index's vector reader, with hits scored by a KNNQueryScorer.
// NOTE(review): when the index reader does not implement
// index.VectorIndexReader this returns (nil, nil) — a nil searcher with a
// nil error; confirm all callers tolerate that.
func NewKNNSearcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping,
	options search.SearcherOptions, field string, vector []float32, k int64,
	boost float64, similarityMetric string) (search.Searcher, error) {
	if vr, ok := i.(index.VectorIndexReader); ok {
		vectorReader, err := vr.VectorReader(ctx, vector, field, k)
		if err != nil {
			return nil, err
		}

		knnScorer := scorer.NewKNNQueryScorer(vector, field, boost,
			options, similarityMetric)
		return &KNNSearcher{
			indexReader:  i,
			vectorReader: vectorReader,
			field:        field,
			vector:       vector,
			k:            k,
			scorer:       knnScorer,
		}, nil
	}
	return nil, nil
}
// VectorOptimize participates in the shared vector-reader optimization
// (see optimizeKNN), delegating to the underlying vector reader.
// NOTE(review): when the reader is not VectorOptimizable this returns
// (nil, nil), which discards any octx accumulated by previous searchers —
// confirm callers tolerate that.
func (s *KNNSearcher) VectorOptimize(ctx context.Context, octx index.VectorOptimizableContext) (
	index.VectorOptimizableContext, error) {
	o, ok := s.vectorReader.(index.VectorOptimizable)
	if ok {
		return o.VectorOptimize(ctx, octx)
	}

	return nil, nil
}
// Advance returns the next scored hit from the vector reader.
// NOTE(review): the target ID is ignored — this behaves exactly like Next;
// presumably intentional because KNN results are a fixed stream that cannot
// be skipped forward by internal ID, but confirm.
func (s *KNNSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (
	*search.DocumentMatch, error) {
	knnMatch, err := s.vectorReader.Next(s.vd.Reset())
	if err != nil {
		return nil, err
	}

	if knnMatch == nil {
		return nil, nil
	}

	docMatch := s.scorer.Score(ctx, knnMatch)

	return docMatch, nil
}
// Close releases the underlying vector reader.
func (s *KNNSearcher) Close() error {
	return s.vectorReader.Close()
}

// Count reports the number of matches the vector reader will produce.
func (s *KNNSearcher) Count() uint64 {
	return s.vectorReader.Count()
}

// DocumentMatchPoolSize reports the DocumentMatch pool capacity this
// searcher requires (a single match).
func (s *KNNSearcher) DocumentMatchPoolSize() int {
	return 1
}

// Min returns 0: this searcher imposes no minimum-should-match constraint.
func (s *KNNSearcher) Min() int {
	return 0
}
// Next returns the next scored KNN hit from the vector reader, or
// (nil, nil) once the stream is exhausted. The scratch VectorDoc is Reset
// and reused across calls to avoid per-hit allocation.
func (s *KNNSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
	knnMatch, err := s.vectorReader.Next(s.vd.Reset())
	if err != nil {
		return nil, err
	}

	if knnMatch == nil {
		return nil, nil
	}

	docMatch := s.scorer.Score(ctx, knnMatch)

	return docMatch, nil
}
// SetQueryNorm forwards the query normalization factor to the scorer.
func (s *KNNSearcher) SetQueryNorm(qnorm float64) {
	s.scorer.SetQueryNorm(qnorm)
}

// Size estimates the memory footprint of this searcher: the static
// struct size plus the dynamic sizes of the vector reader, the scratch
// doc and the scorer.
func (s *KNNSearcher) Size() int {
	return reflectStaticSizeKNNSearcher + size.SizeOfPtr +
		s.vectorReader.Size() +
		s.vd.Size() +
		s.scorer.Size()
}

// Weight returns the scorer's query weight.
func (s *KNNSearcher) Weight() float64 {
	return s.scorer.Weight()
}
+13
View File
@@ -106,6 +106,7 @@ const (
// Context keys used to communicate search-time options and callbacks
// through a context.Context.
const SearchIncrementalCostKey = "_search_incremental_cost_key"
const QueryTypeKey = "_query_type_key"
const FuzzyMatchPhraseKey = "_fuzzy_match_phrase_key"

// IncludeScoreBreakdownKey, when present in the search context, requests
// that document matches retain a per-subquery score breakdown (used by
// the KNN scoring path).
const IncludeScoreBreakdownKey = "_include_score_breakdown_key"
func RecordSearchCost(ctx context.Context,
msg SearchIncrementalCostCallbackMsg, bytes uint64) {
@@ -133,3 +134,15 @@ const MaxGeoBufPoolSize = 24 * 1024
const MinGeoBufPoolSize = 24

// GeoBufferPoolCallbackFunc supplies the s2 geo buffer pool to use for
// geo computations during a search.
type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool

// Context keys for two-phase (presearch) searches: KnnPreSearchDataKey
// carries the KNN hits gathered in phase one; PreSearchKey flags that a
// request is the presearch phase.
const KnnPreSearchDataKey = "_knn_pre_search_data_key"
const PreSearchKey = "_presearch_key"

// ScoreExplCorrectionCallbackFunc merges a query-phase match with its
// corresponding KNN match, returning the combined score and, optionally,
// a combined explanation.
type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation)

// SearcherStartCallbackFn / SearcherEndCallbackFn are invoked at searcher
// start/end with a size value (presumably the searcher's estimated memory
// footprint — confirm with callers) to allow external accounting.
type SearcherStartCallbackFn func(size uint64) error
type SearcherEndCallbackFn func(size uint64) error

// Context keys under which the above callbacks are supplied.
const SearcherStartCallbackKey = "_searcher_start_callback_key"
const SearcherEndCallbackKey = "_searcher_end_callback_key"
+524
View File
@@ -0,0 +1,524 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build vectors
// +build vectors
package bleve
import (
"context"
"encoding/json"
"fmt"
"sort"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/collector"
"github.com/blevesearch/bleve/v2/search/query"
index "github.com/blevesearch/bleve_index_api"
)
// knnOperator selects how multiple KNN sub-queries are combined; see
// knnOperatorAnd / knnOperatorOr.
type knnOperator string

// BleveMaxK caps the k value accepted for any single KNN request.
// Must be updated only at init
var BleveMaxK = int64(10000)
// A SearchRequest describes all the parameters needed to search the
// index. This is the "vectors" build variant: it matches the non-vector
// variant field-for-field and additionally carries the KNN parameters.
//
// Query is required. Size/From describe how much and which part of the
// result set to return. Highlight describes optional search result
// highlighting. Fields lists stored field values to retrieve. Facets
// describe the set of facets to be computed. Explain triggers inclusion
// of score explanations. Sort describes the desired result order. Score
// controls the kind of scoring performed. SearchAfter/SearchBefore
// support deep paging. KNN lists the vector sub-queries, combined
// according to KNNOperator ("and" / "or").
type SearchRequest struct {
	// ClientContextID mirrors the same field in the non-vector build of
	// SearchRequest (search_no_knn.go), keeping the exported API
	// identical under both build tags; it was previously missing here.
	ClientContextID  string            `json:"client_context_id,omitempty"`
	Query            query.Query       `json:"query"`
	Size             int               `json:"size"`
	From             int               `json:"from"`
	Highlight        *HighlightRequest `json:"highlight"`
	Fields           []string          `json:"fields"`
	Facets           FacetsRequest     `json:"facets"`
	Explain          bool              `json:"explain"`
	Sort             search.SortOrder  `json:"sort"`
	IncludeLocations bool              `json:"includeLocations"`
	Score            string            `json:"score,omitempty"`
	SearchAfter      []string          `json:"search_after"`
	SearchBefore     []string          `json:"search_before"`
	KNN              []*KNNRequest     `json:"knn"`
	KNNOperator      knnOperator       `json:"knn_operator"`

	// PreSearchData will be a map that will be used
	// in the second phase of any 2-phase search, to provide additional
	// context to the second phase. This is useful in the case of index
	// aliases where the first phase will gather the PreSearchData from all
	// the indexes in the alias, and the second phase will use that
	// PreSearchData to perform the actual search.
	// The currently accepted map configuration is:
	//
	//  "_knn_pre_search_data_key": []*search.DocumentMatch
	PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"`

	sortFunc func(sort.Interface)
}
// KNNRequest represents a single k-nearest-neighbour sub-query: find the
// K documents whose vectors in Field are nearest to Vector, with an
// optional Boost applied to the resulting scores.
type KNNRequest struct {
	Field  string       `json:"field"`
	Vector []float32    `json:"vector"`
	K      int64        `json:"k"`
	Boost  *query.Boost `json:"boost,omitempty"`
}
// AddKNN appends a KNN sub-query to the request, asking for the k
// documents whose vectors in field are nearest to vector, scored with
// the given boost.
func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) {
	boostVal := query.Boost(boost)
	knnReq := &KNNRequest{
		Field:  field,
		Vector: vector,
		K:      k,
		Boost:  &boostVal,
	}
	r.KNN = append(r.KNN, knnReq)
}
// AddKNNOperator sets how the KNN sub-queries combine ("and" / "or").
func (r *SearchRequest) AddKNNOperator(operator knnOperator) {
	r.KNNOperator = operator
}
// UnmarshalJSON deserializes a JSON representation of
// a SearchRequest.
//
// Defaults applied when fields are absent or out of range:
//   - Size: 10 (also when negative)
//   - From: clamped to 0 when negative
//   - Sort: descending by score
//   - KNNOperator: "or"
func (r *SearchRequest) UnmarshalJSON(input []byte) error {
	// temp mirrors SearchRequest, but keeps the query, sort entries and
	// pre-search data as raw JSON for the dedicated parsers below, and
	// uses *int for Size so "absent" can be told apart from zero.
	var temp struct {
		Q                json.RawMessage   `json:"query"`
		Size             *int              `json:"size"`
		From             int               `json:"from"`
		Highlight        *HighlightRequest `json:"highlight"`
		Fields           []string          `json:"fields"`
		Facets           FacetsRequest     `json:"facets"`
		Explain          bool              `json:"explain"`
		Sort             []json.RawMessage `json:"sort"`
		IncludeLocations bool              `json:"includeLocations"`
		Score            string            `json:"score"`
		SearchAfter      []string          `json:"search_after"`
		SearchBefore     []string          `json:"search_before"`
		KNN              []*KNNRequest     `json:"knn"`
		KNNOperator      knnOperator       `json:"knn_operator"`
		PreSearchData    json.RawMessage   `json:"pre_search_data"`
	}
	err := json.Unmarshal(input, &temp)
	if err != nil {
		return err
	}
	if temp.Size == nil {
		r.Size = 10
	} else {
		r.Size = *temp.Size
	}
	if temp.Sort == nil {
		// Default ordering: best score first.
		r.Sort = search.SortOrder{&search.SortScore{Desc: true}}
	} else {
		r.Sort, err = search.ParseSortOrderJSON(temp.Sort)
		if err != nil {
			return err
		}
	}
	r.From = temp.From
	r.Explain = temp.Explain
	r.Highlight = temp.Highlight
	r.Fields = temp.Fields
	r.Facets = temp.Facets
	r.IncludeLocations = temp.IncludeLocations
	r.Score = temp.Score
	r.SearchAfter = temp.SearchAfter
	r.SearchBefore = temp.SearchBefore
	r.Query, err = query.ParseQuery(temp.Q)
	if err != nil {
		return err
	}
	// Clamp out-of-range paging values rather than returning an error.
	if r.Size < 0 {
		r.Size = 10
	}
	if r.From < 0 {
		r.From = 0
	}
	r.KNN = temp.KNN
	r.KNNOperator = temp.KNNOperator
	if r.KNNOperator == "" {
		// Default to "or": a hit may match any of the KNN clauses.
		r.KNNOperator = knnOperatorOr
	}
	if temp.PreSearchData != nil {
		r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData)
		if err != nil {
			return err
		}
	}
	return nil
}
// -----------------------------------------------------------------------------
// copySearchRequest makes a shallow working copy of req for use in a
// sub-search (e.g. against one index of an alias), substituting the
// given preSearchData. Size is widened to Size+From with From reset to 0
// so the sub-search returns enough hits for the caller to re-apply
// pagination; Sort is deep-copied because sort state is mutated during
// collection, while the remaining fields (Query, Facets, KNN, ...) are
// shared with the original request.
func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest {
	rv := SearchRequest{
		Query:            req.Query,
		Size:             req.Size + req.From,
		From:             0,
		Highlight:        req.Highlight,
		Fields:           req.Fields,
		Facets:           req.Facets,
		Explain:          req.Explain,
		Sort:             req.Sort.Copy(),
		IncludeLocations: req.IncludeLocations,
		Score:            req.Score,
		SearchAfter:      req.SearchAfter,
		SearchBefore:     req.SearchBefore,
		KNN:              req.KNN,
		KNNOperator:      req.KNNOperator,
		PreSearchData:    preSearchData,
	}
	return &rv
}
// Supported values for SearchRequest.KNNOperator: "and" requires a hit
// to match every KNN sub-query, "or" (the default) any of them.
var (
	knnOperatorAnd = knnOperator("and")
	knnOperatorOr  = knnOperator("or")
)
// createKNNQuery validates the KNN clauses of req and, when present,
// builds a disjunction query with one KNN sub-query per clause. It also
// returns the per-clause k values (in clause order) and their sum, which
// size the KNN collector. All results are zero-valued when the request
// has no KNN clauses.
func createKNNQuery(req *SearchRequest) (query.Query, []int64, int64, error) {
	if requestHasKNN(req) {
		// first perform validation
		err := validateKNN(req)
		if err != nil {
			return nil, nil, 0, err
		}
		var subQueries []query.Query
		kArray := make([]int64, 0, len(req.KNN))
		sumOfK := int64(0)
		for _, knn := range req.KNN {
			knnQuery := query.NewKNNQuery(knn.Vector)
			knnQuery.SetFieldVal(knn.Field)
			knnQuery.SetK(knn.K)
			knnQuery.SetBoost(knn.Boost.Value())
			subQueries = append(subQueries, knnQuery)
			kArray = append(kArray, knn.K)
			sumOfK += knn.K
		}
		rv := query.NewDisjunctionQuery(subQueries)
		// Per-subquery score breakdowns are required downstream (see
		// finalizeKNNResults) to apply operator semantics and scoring.
		rv.RetrieveScoreBreakdown(true)
		return rv, kArray, sumOfK, nil
	}
	return nil, nil, 0, nil
}
// validateKNN checks the KNN portion of a search request: the operator
// must be "and", "or" or empty (treated as "or"), and every KNN clause
// must be non-nil with a non-empty vector and 0 < k <= BleveMaxK.
func validateKNN(req *SearchRequest) error {
	// Validate the operator exactly once. The previous version performed
	// this check twice (an if-chain gated on req.KNN != nil, then a
	// switch) with diverging error messages; the switch subsumed the if.
	switch req.KNNOperator {
	case knnOperatorAnd, knnOperatorOr, "":
		// Valid cases, do nothing
	default:
		return fmt.Errorf("unknown knn operator: %s", req.KNNOperator)
	}
	for _, q := range req.KNN {
		if q == nil {
			return fmt.Errorf("knn query cannot be nil")
		}
		if q.K <= 0 || len(q.Vector) == 0 {
			return fmt.Errorf("k must be greater than 0 and vector must be non-empty")
		}
		if q.K > BleveMaxK {
			// k == BleveMaxK is accepted; the old message ("must be less
			// than") contradicted the > check above.
			return fmt.Errorf("k must not exceed %d", BleveMaxK)
		}
	}
	return nil
}
// addSortAndFieldsToKNNHits decorates each KNN hit in place with the
// doc-value-derived sort keys required by req.Sort, the requested stored
// fields / highlighting, and the given index name — so the hits remain
// fully formed even if a later phase exits early.
func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMatch, reader index.IndexReader, name string) (err error) {
	requiredSortFields := req.Sort.RequiredFields()
	var dvReader index.DocValueReader
	var updateFieldVisitor index.DocValueVisitor
	if len(requiredSortFields) > 0 {
		// One doc-values reader serves all hits.
		dvReader, err = reader.DocValueReader(requiredSortFields)
		if err != nil {
			return err
		}
		updateFieldVisitor = func(field string, term []byte) {
			req.Sort.UpdateVisitor(field, term)
		}
	}
	for _, hit := range knnHits {
		if len(requiredSortFields) > 0 {
			err = dvReader.VisitDocValues(hit.IndexInternalID, updateFieldVisitor)
			if err != nil {
				return err
			}
		}
		req.Sort.Value(hit)
		// Note: the error is the FIRST return value of
		// LoadAndHighlightFields; its second return value is discarded.
		err, _ = LoadAndHighlightFields(hit, req, "", reader, nil)
		if err != nil {
			return err
		}
		hit.Index = name
	}
	return nil
}
// runKnnCollector executes the KNN portion of req against reader: it
// builds the disjunction KNN query, runs it through a dedicated KNN
// collector, and returns the collected hits decorated with sort keys,
// fields and this index's name.
//
// NOTE(review): callers must ensure requestHasKNN(req) holds — with no
// KNN clauses, createKNNQuery returns a nil query and the Searcher call
// below would panic on the nil interface.
func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) {
	KNNQuery, kArray, sumOfK, err := createKNNQuery(req)
	if err != nil {
		return nil, err
	}
	knnSearcher, err := KNNQuery.Searcher(ctx, reader, i.m, search.SearcherOptions{
		Explain: req.Explain,
	})
	if err != nil {
		return nil, err
	}
	knnCollector := collector.NewKNNCollector(kArray, sumOfK)
	err = knnCollector.Collect(ctx, knnSearcher, reader)
	if err != nil {
		return nil, err
	}
	knnHits := knnCollector.Results()
	// For a presearch, operator semantics and score collapsing are
	// deferred until hits from all member indexes have been merged.
	if !preSearch {
		knnHits = finalizeKNNResults(req, knnHits)
	}
	// at this point, irrespective of whether it is a preSearch or not,
	// the knn hits are populated with Sort and Fields.
	// it must be ensured downstream that the Sort and Fields are not
	// re-evaluated, for these hits.
	// also add the index names to the hits, so that when early
	// exit takes place after the first phase, the hits will have
	// a valid value for Index.
	err = addSortAndFieldsToKNNHits(req, knnHits, reader, i.name)
	if err != nil {
		return nil, err
	}
	return knnHits, nil
}
// setKnnHitsInCollector registers the finalized KNN hits with the TopN
// collector, together with a correction callback that merges a
// query-phase match and its KNN counterpart by summing their scores; a
// combined "sum of:" explanation is built only when req.Explain is set.
func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
	if len(knnHits) > 0 {
		newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) {
			totalScore := queryMatch.Score + knnMatch.Score
			if !req.Explain {
				// exit early as we don't need to compute the explanation
				return totalScore, nil
			}
			return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}}
		}
		coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer))
	}
}
// finalizeKNNResults applies the KNN operator semantics to the collected
// hits and collapses each hit's per-subquery score breakdown into its
// final score (and, when requested, its explanation).
func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch {
	// if the KNN operator is AND, then we need to filter out the hits that
	// do not have match the KNN queries.
	if req.KNNOperator == knnOperatorAnd {
		// In-place filter: a hit survives only if its breakdown has an
		// entry for every KNN clause (i.e. it matched all of them).
		idx := 0
		for _, hit := range knnHits {
			if len(hit.ScoreBreakdown) == len(req.KNN) {
				knnHits[idx] = hit
				idx++
			}
		}
		knnHits = knnHits[:idx]
	}
	// fix the score using score breakdown now
	// if the score is none, then we need to set the score to 0.0
	// if req.Explain is true, then we need to use the expl breakdown to
	// finalize the correct explanation.
	for _, hit := range knnHits {
		hit.Score = 0.0
		if req.Score != "none" {
			for _, score := range hit.ScoreBreakdown {
				hit.Score += score
			}
		}
		if req.Explain {
			// NOTE(review): ScoreBreakdown is iterated in map order, so
			// the order of explanation children is nondeterministic —
			// confirm whether a stable ordering is required here.
			childrenExpl := make([]*search.Explanation, 0, len(hit.ScoreBreakdown))
			for i := range hit.ScoreBreakdown {
				childrenExpl = append(childrenExpl, hit.Expl.Children[i])
			}
			hit.Expl = &search.Explanation{Value: hit.Score, Message: "sum of:", Children: childrenExpl}
		}
		// we don't need the score breakdown anymore
		// so we can set it to nil
		hit.ScoreBreakdown = nil
	}
	return knnHits
}
// when we are setting KNN hits in the preSearchData, we need to make sure that
// the KNN hit goes to the right index. This is because the KNN hits are
// collected from all the indexes in the alias, but the preSearchData is
// specific to each index. If alias A1 contains indexes I1 and I2 and
// the KNN hits collected from both I1 and I2, and merged to get top K
// hits, then the top K hits need to be distributed to I1 and I2,
// so that the preSearchData for I1 contains the top K hits from I1 and
// the preSearchData for I2 contains the top K hits from I2.
//
// Each hit carries a stack of index names (hit.IndexNames) recording its
// path down the alias tree; the top of that stack is popped here to
// route the hit one level further down.
func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) {
	// create a set of all the index names of this alias
	indexNames := make(map[string]struct{}, len(indexes))
	for _, index := range indexes {
		indexNames[index.Name()] = struct{}{}
	}
	segregatedKnnHits := make(map[string][]*search.DocumentMatch)
	for _, hit := range knnHits {
		// for each hit, we need to perform a validation check to ensure that the stack
		// is still valid.
		//
		// if the stack is empty, then we have an inconsistency/abnormality
		// since any hit with an empty stack is supposed to land on a leaf index,
		// and not an alias. This cannot happen in normal circumstances. But
		// performing this check to be safe. Since we extract the stack top
		// in the following steps.
		if len(hit.IndexNames) == 0 {
			return nil, ErrorTwoPhaseSearchInconsistency
		}
		// since the stack is not empty, we need to check if the top of the stack
		// is a valid index name, of an index that is part of this alias. If not,
		// then we have an inconsistency that could be caused due to a topology
		// change.
		stackTopIdx := len(hit.IndexNames) - 1
		top := hit.IndexNames[stackTopIdx]
		if _, exists := indexNames[top]; !exists {
			return nil, ErrorTwoPhaseSearchInconsistency
		}
		if stackTopIdx == 0 {
			// if the stack consists of only one index, then popping the top
			// would result in an empty slice, and handle this case by setting
			// indexNames to nil. So that the final search results will not
			// contain the indexNames field.
			hit.IndexNames = nil
		} else {
			hit.IndexNames = hit.IndexNames[:stackTopIdx]
		}
		segregatedKnnHits[top] = append(segregatedKnnHits[top], hit)
	}
	return segregatedKnnHits, nil
}
// requestHasKNN reports whether the request contains any KNN clauses.
func requestHasKNN(req *SearchRequest) bool {
	return len(req.KNN) > 0
}
// returns true if the search request contains a KNN request that can be
// satisfied by just performing a preSearch, completely bypassing the
// actual search.
func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool {
	// if req.Query is not match_none => then we need to go to phase 2
	// to perform the actual query. (A nil Query also fails this type
	// assertion and forces phase 2.)
	if _, ok := req.Query.(*query.MatchNoneQuery); !ok {
		return false
	}
	// req.Query is a match_none query
	//
	// if request contains facets, we need to perform phase 2 to calculate
	// the facet result. Since documents were removed as part of the
	// merging process after phase 1, if the facet results were to be calculated
	// during phase 1, then they will be now be incorrect, since merging would
	// remove some documents.
	if req.Facets != nil {
		return false
	}
	// the request is a match_none query and does not contain any facets
	// so we can satisfy the request using just the preSearch result.
	return true
}
// constructKnnPreSearchData routes the merged presearch KNN hits back to
// their source indexes, storing each index's share in mergedOut under
// search.KnnPreSearchDataKey.
//
// NOTE(review): assumes mergedOut already holds a non-nil inner map for
// every index in indexes; a missing entry would panic on assignment —
// confirm callers always pre-populate it.
func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult,
	indexes []Index) (map[string]map[string]interface{}, error) {
	distributedHits, err := validateAndDistributeKNNHits([]*search.DocumentMatch(preSearchResult.Hits), indexes)
	if err != nil {
		return nil, err
	}
	for _, index := range indexes {
		mergedOut[index.Name()][search.KnnPreSearchDataKey] = distributedHits[index.Name()]
	}
	return mergedOut, nil
}
// addKnnToDummyRequest copies the KNN-relevant parts of realReq onto the
// presearch "dummy" request. The operator is forced to "or" so phase one
// gathers hits matching any clause; stricter operator semantics are
// applied after merging (see finalizeKNNResults).
func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
	dummyReq.KNN = realReq.KNN
	dummyReq.KNNOperator = knnOperatorOr
	dummyReq.Explain = realReq.Explain
	dummyReq.Fields = realReq.Fields
	dummyReq.Sort = realReq.Sort
}
// redistributeKNNPreSearchData routes the KNN preSearchData of req one
// level further down an alias tree. The preSearchData is a list of
// DocumentMatch objects specific to each leaf index, so at every level
// of the tree it must be split among the indexes/aliases at that level;
// all other preSearchData entries are passed through unchanged.
func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
	knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch)
	if !ok {
		return nil, fmt.Errorf("request does not have knn preSearchData for redistribution")
	}
	segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
	if err != nil {
		return nil, err
	}
	rv := make(map[string]map[string]interface{}, len(indexes))
	for _, idx := range indexes {
		// Build this index's preSearchData: its own share of the KNN
		// hits, plus every other key copied through as-is.
		data := make(map[string]interface{}, len(req.PreSearchData))
		for k, v := range req.PreSearchData {
			if k == search.KnnPreSearchDataKey {
				data[k] = segregatedKnnHits[idx.Name()]
			} else {
				data[k] = v
			}
		}
		rv[idx.Name()] = data
	}
	return rv, nil
}
// newKnnPreSearchResultProcessor builds the processor that merges KNN
// hits arriving from the member indexes during phase one of a two-phase
// search: addFn tags each incoming hit with its source index name and
// feeds it into a KNN collector store sized by the request's per-clause
// k values; finalizeFn replaces the search result's hits with the merged
// top hits.
func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor {
	kArray := make([]int64, len(req.KNN))
	for i, knnReq := range req.KNN {
		kArray[i] = knnReq.K
	}
	knnStore := collector.GetNewKNNCollectorStore(kArray)
	return &knnPreSearchResultProcessor{
		addFn: func(sr *SearchResult, indexName string) {
			for _, hit := range sr.Hits {
				// tag the hit with the index name, so that when the
				// final search result is constructed, the hit will have
				// a valid path to follow along the alias tree to reach
				// the index.
				hit.IndexNames = append(hit.IndexNames, indexName)
				knnStore.AddDocument(hit)
			}
		},
		finalizeFn: func(sr *SearchResult) {
			// passing nil as the document fixup function, because we don't need to
			// fixup the document, since this was already done in the first phase,
			// hence error is always nil.
			// the merged knn hits are finalized and set in the search result.
			sr.Hits, _ = knnStore.Final(nil)
		},
	}
}
+207
View File
@@ -0,0 +1,207 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !vectors
// +build !vectors
package bleve
import (
"context"
"encoding/json"
"sort"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/collector"
"github.com/blevesearch/bleve/v2/search/query"
index "github.com/blevesearch/bleve_index_api"
)
// A SearchRequest describes all the parameters
// needed to search the index.
// Query is required.
// Size/From describe how much and which part of the
// result set to return.
// Highlight describes optional search result
// highlighting.
// Fields describes a list of field values which
// should be retrieved for result documents, provided they
// were stored while indexing.
// Facets describe the set of facets to be computed.
// Explain triggers inclusion of additional search
// result score explanations.
// Sort describes the desired order for the results to be returned.
// Score controls the kind of scoring performed
// SearchAfter supports deep paging by providing a minimum sort key
// SearchBefore supports deep paging by providing a maximum sort key
// sortFunc specifies the sort implementation to use for sorting results.
//
// A special field named "*" can be used to return all fields.
type SearchRequest struct {
	// ClientContextID is an opaque caller-supplied identifier.
	// NOTE(review): it is not populated by the custom UnmarshalJSON in
	// this file — confirm whether that is intentional.
	ClientContextID  string            `json:"client_context_id,omitempty"`
	Query            query.Query       `json:"query"`
	Size             int               `json:"size"`
	From             int               `json:"from"`
	Highlight        *HighlightRequest `json:"highlight"`
	Fields           []string          `json:"fields"`
	Facets           FacetsRequest     `json:"facets"`
	Explain          bool              `json:"explain"`
	Sort             search.SortOrder  `json:"sort"`
	IncludeLocations bool              `json:"includeLocations"`
	Score            string            `json:"score,omitempty"`
	SearchAfter      []string          `json:"search_after"`
	SearchBefore     []string          `json:"search_before"`

	// PreSearchData will be a map that will be used
	// in the second phase of any 2-phase search, to provide additional
	// context to the second phase. This is useful in the case of index
	// aliases where the first phase will gather the PreSearchData from all
	// the indexes in the alias, and the second phase will use that
	// PreSearchData to perform the actual search.
	// The currently accepted map configuration is:
	//
	//  "_knn_pre_search_data_key": []*search.DocumentMatch
	PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"`

	sortFunc func(sort.Interface)
}
// UnmarshalJSON deserializes a JSON representation of
// a SearchRequest.
//
// Defaults applied when fields are absent or out of range:
//   - Size: 10 (also when negative)
//   - From: clamped to 0 when negative
//   - Sort: descending by score
func (r *SearchRequest) UnmarshalJSON(input []byte) error {
	// temp mirrors SearchRequest, but keeps the query, sort entries and
	// pre-search data as raw JSON for the dedicated parsers below, and
	// uses *int for Size so "absent" can be told apart from zero.
	var temp struct {
		Q                json.RawMessage   `json:"query"`
		Size             *int              `json:"size"`
		From             int               `json:"from"`
		Highlight        *HighlightRequest `json:"highlight"`
		Fields           []string          `json:"fields"`
		Facets           FacetsRequest     `json:"facets"`
		Explain          bool              `json:"explain"`
		Sort             []json.RawMessage `json:"sort"`
		IncludeLocations bool              `json:"includeLocations"`
		Score            string            `json:"score"`
		SearchAfter      []string          `json:"search_after"`
		SearchBefore     []string          `json:"search_before"`
		PreSearchData    json.RawMessage   `json:"pre_search_data"`
	}
	err := json.Unmarshal(input, &temp)
	if err != nil {
		return err
	}
	if temp.Size == nil {
		r.Size = 10
	} else {
		r.Size = *temp.Size
	}
	if temp.Sort == nil {
		// Default ordering: best score first.
		r.Sort = search.SortOrder{&search.SortScore{Desc: true}}
	} else {
		r.Sort, err = search.ParseSortOrderJSON(temp.Sort)
		if err != nil {
			return err
		}
	}
	r.From = temp.From
	r.Explain = temp.Explain
	r.Highlight = temp.Highlight
	r.Fields = temp.Fields
	r.Facets = temp.Facets
	r.IncludeLocations = temp.IncludeLocations
	r.Score = temp.Score
	r.SearchAfter = temp.SearchAfter
	r.SearchBefore = temp.SearchBefore
	r.Query, err = query.ParseQuery(temp.Q)
	if err != nil {
		return err
	}
	// Clamp out-of-range paging values rather than returning an error.
	if r.Size < 0 {
		r.Size = 10
	}
	if r.From < 0 {
		r.From = 0
	}
	if temp.PreSearchData != nil {
		r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData)
		if err != nil {
			return err
		}
	}
	return nil
}
// -----------------------------------------------------------------------------
// copySearchRequest makes a shallow working copy of req for use in a
// sub-search (e.g. against one index of an alias), substituting the
// given preSearchData. Size is widened to Size+From with From reset to 0
// so the sub-search returns enough hits for the caller to re-apply
// pagination; Sort is deep-copied because sort state is mutated during
// collection, while the remaining fields are shared with the original.
func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest {
	rv := SearchRequest{
		// ClientContextID was previously dropped by the copy while every
		// other exported field was carried over; preserve it so the
		// sub-search retains the caller's context identifier.
		ClientContextID:  req.ClientContextID,
		Query:            req.Query,
		Size:             req.Size + req.From,
		From:             0,
		Highlight:        req.Highlight,
		Fields:           req.Fields,
		Facets:           req.Facets,
		Explain:          req.Explain,
		Sort:             req.Sort.Copy(),
		IncludeLocations: req.IncludeLocations,
		Score:            req.Score,
		SearchAfter:      req.SearchAfter,
		SearchBefore:     req.SearchBefore,
		PreSearchData:    preSearchData,
	}
	return &rv
}
// The functions below are the non-vector (build tag "!vectors") variants
// of the KNN helpers defined in search_knn.go. They are no-op stubs so
// that shared search code can call them unconditionally in builds
// without vector support.

// validateKNN always succeeds: this build accepts no KNN clauses.
func validateKNN(req *SearchRequest) error {
	return nil
}

// runKnnCollector returns no hits: there is no KNN phase in this build.
func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) {
	return nil, nil
}

// setKnnHitsInCollector is a no-op in this build.
func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
}

// requestHasKNN always reports false: requests carry no KNN clauses.
func requestHasKNN(req *SearchRequest) bool {
	return false
}

// addKnnToDummyRequest is a no-op in this build.
func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
}

// redistributeKNNPreSearchData has no KNN data to redistribute.
func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
	return nil, nil
}

// isKNNrequestSatisfiedByPreSearch always reports false in this build.
func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool {
	return false
}

// constructKnnPreSearchData returns mergedOut unchanged.
func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult,
	indexes []Index) (map[string]map[string]interface{}, error) {
	return mergedOut, nil
}

// finalizeKNNResults returns knnHits unchanged.
func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch {
	return knnHits
}

// newKnnPreSearchResultProcessor returns an empty processor.
func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor {
	return &knnPreSearchResultProcessor{} // equivalent to nil
}
+62
View File
@@ -0,0 +1,62 @@
// Copyright (c) 2023 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"math"
"reflect"
)
// ExtractNumericValFloat64 extracts the numeric value held by v, if any,
// and returns it as a float64. The second return value reports whether
// v was a (possibly named) float, signed integer or unsigned integer
// type; for anything else — including nil — it returns (0, false).
func ExtractNumericValFloat64(v interface{}) (float64, bool) {
	rv := reflect.ValueOf(v)
	if !rv.IsValid() {
		// v was nil: there is no value to extract.
		return 0, false
	}
	// Dispatch on the underlying kind so named numeric types are handled
	// exactly as their builtin equivalents.
	switch rv.Kind() {
	case reflect.Float32, reflect.Float64:
		return rv.Float(), true
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return float64(rv.Int()), true
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
		return float64(rv.Uint()), true
	default:
		return 0, false
	}
}
// ExtractNumericValFloat32 extracts the numeric value held by v, if any,
// and returns it as a float32. The second return value reports whether
// extraction succeeded; it is false for nil, non-numeric types, and
// float values whose magnitude exceeds the float32 range.
func ExtractNumericValFloat32(v interface{}) (float32, bool) {
	val := reflect.ValueOf(v)
	if !val.IsValid() {
		// v was nil: there is no value to extract.
		return 0, false
	}
	switch {
	case val.CanFloat():
		floatVal := val.Float()
		// Reject values whose magnitude cannot be represented as a
		// float32. The original only checked positive overflow, letting
		// large negative values silently convert to -Inf. (NaN still
		// passes through unchanged, as before, since NaN comparisons
		// are always false.)
		if math.Abs(floatVal) > math.MaxFloat32 {
			return 0, false
		}
		return float32(floatVal), true
	case val.CanInt():
		return float32(val.Int()), true
	case val.CanUint():
		return float32(val.Uint()), true
	}
	return 0, false
}