Files
opencloud/services/search/pkg/content/tika.go
T
Florian Schade 4f26424db6 [full-ci] enhancement: use reva client pool selectors (#6452)
* enhancement: use reva client pool selectors

register mock service to registry and pass tests

* enhancement: bump reva

* Fix a couple of linter issues

---------

Co-authored-by: Ralf Haferkamp <rhaferkamp@owncloud.com>
2023-06-08 12:41:04 +02:00

95 lines
2.6 KiB
Go

package content
import (
"context"
"fmt"
"strings"
"github.com/bbalet/stopwords"
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
"github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool"
"github.com/google/go-tika/tika"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/config"
)
// Tika is used to extract content from a resource,
// it uses apache tika to retrieve all the data.
type Tika struct {
*Basic
Retriever
tika *tika.Client
contentExtractionSizeLimit uint64
}
// NewTikaExtractor creates a new Tika instance.
func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient], logger log.Logger, cfg *config.Config) (*Tika, error) {
basic, err := NewBasicExtractor(logger)
if err != nil {
return nil, err
}
tk := tika.NewClient(nil, cfg.Extractor.Tika.TikaURL)
tkv, err := tk.Version(context.Background())
if err != nil {
return nil, err
}
logger.Info().Msgf("Tika version: %s", tkv)
return &Tika{
Basic: basic,
Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure),
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
}, nil
}
// Extract loads a resource from its underlying storage, passes it to tika and processes the result into a Document.
func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, error) {
doc, err := t.Basic.Extract(ctx, ri)
if err != nil {
return doc, err
}
if ri.Size == 0 {
return doc, nil
}
if ri.Size > t.contentExtractionSizeLimit {
t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.")
return doc, nil
}
if ri.Type != provider.ResourceType_RESOURCE_TYPE_FILE {
return doc, nil
}
data, err := t.Retrieve(ctx, ri.Id)
if err != nil {
return doc, err
}
defer data.Close()
metas, err := t.tika.MetaRecursive(ctx, data)
if err != nil {
return doc, err
}
for _, meta := range metas {
if title, err := getFirstValue(meta, "title"); err == nil {
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
}
if content, err := getFirstValue(meta, "X-TIKA:content"); err == nil {
doc.Content = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Content, content))
}
}
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
doc.Content = stopwords.CleanString(doc.Content, lang, true)
}
return doc, nil
}