mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-05-13 06:49:38 -05:00
2404eff48e
* experimental search backport fix basic extractor resource name move escapeQuery regex into global variable minor pr review changes rename DebounceDuration env variable add document title and content when rebuilding bleve resource Co-authored-by: David Christofas <dchristofas@owncloud.com>
87 lines
2.1 KiB
Go
87 lines
2.1 KiB
Go
package content
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/bbalet/stopwords"
|
|
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
|
|
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
|
|
"github.com/google/go-tika/tika"
|
|
"github.com/owncloud/ocis/v2/ocis-pkg/log"
|
|
"github.com/owncloud/ocis/v2/services/search/pkg/config"
|
|
)
|
|
|
|
// Tika is used to extract content from a resource,
|
|
// it uses apache tika to retrieve all the data.
|
|
type Tika struct {
|
|
*Basic
|
|
Retriever
|
|
tika *tika.Client
|
|
}
|
|
|
|
// NewTikaExtractor creates a new Tika instance.
|
|
func NewTikaExtractor(gw gateway.GatewayAPIClient, logger log.Logger, cfg *config.Config) (*Tika, error) {
|
|
basic, err := NewBasicExtractor(logger)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
tk := tika.NewClient(nil, cfg.Extractor.Tika.TikaURL)
|
|
tkv, err := tk.Version(context.Background())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
logger.Info().Msgf("Tika version: %s", tkv)
|
|
|
|
return &Tika{
|
|
Basic: basic,
|
|
Retriever: newCS3Retriever(gw, logger, cfg.Extractor.CS3AllowInsecure),
|
|
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
|
|
}, nil
|
|
}
|
|
|
|
// Extract loads a resource from its underlying storage, passes it to tika and processes the result into a Document.
|
|
func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, error) {
|
|
doc, err := t.Basic.Extract(ctx, ri)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
|
|
if ri.Size == 0 {
|
|
return doc, nil
|
|
}
|
|
|
|
if ri.Type != provider.ResourceType_RESOURCE_TYPE_FILE {
|
|
return doc, nil
|
|
}
|
|
|
|
data, err := t.Retrieve(ctx, ri.Id)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
defer data.Close()
|
|
|
|
metas, err := t.tika.MetaRecursive(ctx, data)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
|
|
for _, meta := range metas {
|
|
if title, err := getFirstValue(meta, "title"); err == nil {
|
|
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
|
|
}
|
|
|
|
if content, err := getFirstValue(meta, "X-TIKA:content"); err == nil {
|
|
doc.Content = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Content, content))
|
|
}
|
|
}
|
|
|
|
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
|
|
doc.Content = stopwords.CleanString(doc.Content, lang, true)
|
|
}
|
|
|
|
return doc, nil
|
|
}
|