mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-04-23 12:38:21 -05:00
4f26424db6
* enhancement: use reva client pool selectors register mock service to registry and pass tests * enhancement: bump reva * Fix a couple of linter issues --------- Co-authored-by: Ralf Haferkamp <rhaferkamp@owncloud.com>
95 lines
2.6 KiB
Go
95 lines
2.6 KiB
Go
package content
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/bbalet/stopwords"
|
|
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
|
|
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
|
|
"github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool"
|
|
"github.com/google/go-tika/tika"
|
|
"github.com/owncloud/ocis/v2/ocis-pkg/log"
|
|
"github.com/owncloud/ocis/v2/services/search/pkg/config"
|
|
)
|
|
|
|
// Tika is used to extract content from a resource,
|
|
// it uses apache tika to retrieve all the data.
|
|
type Tika struct {
|
|
*Basic
|
|
Retriever
|
|
tika *tika.Client
|
|
contentExtractionSizeLimit uint64
|
|
}
|
|
|
|
// NewTikaExtractor creates a new Tika instance.
|
|
func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient], logger log.Logger, cfg *config.Config) (*Tika, error) {
|
|
basic, err := NewBasicExtractor(logger)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
tk := tika.NewClient(nil, cfg.Extractor.Tika.TikaURL)
|
|
tkv, err := tk.Version(context.Background())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
logger.Info().Msgf("Tika version: %s", tkv)
|
|
|
|
return &Tika{
|
|
Basic: basic,
|
|
Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure),
|
|
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
|
|
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
|
|
}, nil
|
|
}
|
|
|
|
// Extract loads a resource from its underlying storage, passes it to tika and processes the result into a Document.
|
|
func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, error) {
|
|
doc, err := t.Basic.Extract(ctx, ri)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
|
|
if ri.Size == 0 {
|
|
return doc, nil
|
|
}
|
|
|
|
if ri.Size > t.contentExtractionSizeLimit {
|
|
t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.")
|
|
return doc, nil
|
|
}
|
|
|
|
if ri.Type != provider.ResourceType_RESOURCE_TYPE_FILE {
|
|
return doc, nil
|
|
}
|
|
|
|
data, err := t.Retrieve(ctx, ri.Id)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
defer data.Close()
|
|
|
|
metas, err := t.tika.MetaRecursive(ctx, data)
|
|
if err != nil {
|
|
return doc, err
|
|
}
|
|
|
|
for _, meta := range metas {
|
|
if title, err := getFirstValue(meta, "title"); err == nil {
|
|
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
|
|
}
|
|
|
|
if content, err := getFirstValue(meta, "X-TIKA:content"); err == nil {
|
|
doc.Content = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Content, content))
|
|
}
|
|
}
|
|
|
|
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
|
|
doc.Content = stopwords.CleanString(doc.Content, lang, true)
|
|
}
|
|
|
|
return doc, nil
|
|
}
|