Files
opencloud/services/search/pkg/content/tika.go
T
Florian Schade 2404eff48e [full-ci] experimental search backport (#5221)
* experimental search backport
fix basic extractor resource name
move escapeQuery regex into global variable
minor pr review changes
rename DebounceDuration env variable
add document title and content when rebuilding bleve resource

Co-authored-by: David Christofas <dchristofas@owncloud.com>
2022-12-13 14:22:41 +01:00

87 lines
2.1 KiB
Go

package content
import (
"context"
"fmt"
"strings"
"github.com/bbalet/stopwords"
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
"github.com/google/go-tika/tika"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/config"
)
// Tika is used to extract content from a resource,
// it uses apache tika to retrieve all the data.
type Tika struct {
*Basic
Retriever
tika *tika.Client
}
// NewTikaExtractor creates a new Tika instance.
func NewTikaExtractor(gw gateway.GatewayAPIClient, logger log.Logger, cfg *config.Config) (*Tika, error) {
basic, err := NewBasicExtractor(logger)
if err != nil {
return nil, err
}
tk := tika.NewClient(nil, cfg.Extractor.Tika.TikaURL)
tkv, err := tk.Version(context.Background())
if err != nil {
return nil, err
}
logger.Info().Msgf("Tika version: %s", tkv)
return &Tika{
Basic: basic,
Retriever: newCS3Retriever(gw, logger, cfg.Extractor.CS3AllowInsecure),
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
}, nil
}
// Extract loads a resource from its underlying storage, passes it to tika and processes the result into a Document.
func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, error) {
doc, err := t.Basic.Extract(ctx, ri)
if err != nil {
return doc, err
}
if ri.Size == 0 {
return doc, nil
}
if ri.Type != provider.ResourceType_RESOURCE_TYPE_FILE {
return doc, nil
}
data, err := t.Retrieve(ctx, ri.Id)
if err != nil {
return doc, err
}
defer data.Close()
metas, err := t.tika.MetaRecursive(ctx, data)
if err != nil {
return doc, err
}
for _, meta := range metas {
if title, err := getFirstValue(meta, "title"); err == nil {
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
}
if content, err := getFirstValue(meta, "X-TIKA:content"); err == nil {
doc.Content = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Content, content))
}
}
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
doc.Content = stopwords.CleanString(doc.Content, lang, true)
}
return doc, nil
}