Files
opencloud/services/search/pkg/content/tika.go
T
Andre Duffeck c0665975b3 Do not try to fulltext-index large files. (#6395)
* Do not try to extract the content of large files.

Both tika and ocis do not handle large files/archives very well.

* Apply suggestions from code review

Co-authored-by: Martin <github@diemattels.at>

---------

Co-authored-by: Martin <github@diemattels.at>
2023-05-27 19:03:58 +02:00

94 lines
2.5 KiB
Go

package content
import (
"context"
"fmt"
"strings"
"github.com/bbalet/stopwords"
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
"github.com/google/go-tika/tika"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/config"
)
// Tika is used to extract content from a resource,
// it uses apache tika to retrieve all the data.
type Tika struct {
*Basic
Retriever
tika *tika.Client
contentExtractionSizeLimit uint64
}
// NewTikaExtractor creates a new Tika instance.
func NewTikaExtractor(gw gateway.GatewayAPIClient, logger log.Logger, cfg *config.Config) (*Tika, error) {
basic, err := NewBasicExtractor(logger)
if err != nil {
return nil, err
}
tk := tika.NewClient(nil, cfg.Extractor.Tika.TikaURL)
tkv, err := tk.Version(context.Background())
if err != nil {
return nil, err
}
logger.Info().Msgf("Tika version: %s", tkv)
return &Tika{
Basic: basic,
Retriever: newCS3Retriever(gw, logger, cfg.Extractor.CS3AllowInsecure),
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
}, nil
}
// Extract loads a resource from its underlying storage, passes it to tika and processes the result into a Document.
func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, error) {
doc, err := t.Basic.Extract(ctx, ri)
if err != nil {
return doc, err
}
if ri.Size == 0 {
return doc, nil
}
if ri.Size > t.contentExtractionSizeLimit {
t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.")
return doc, nil
}
if ri.Type != provider.ResourceType_RESOURCE_TYPE_FILE {
return doc, nil
}
data, err := t.Retrieve(ctx, ri.Id)
if err != nil {
return doc, err
}
defer data.Close()
metas, err := t.tika.MetaRecursive(ctx, data)
if err != nil {
return doc, err
}
for _, meta := range metas {
if title, err := getFirstValue(meta, "title"); err == nil {
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
}
if content, err := getFirstValue(meta, "X-TIKA:content"); err == nil {
doc.Content = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Content, content))
}
}
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
doc.Content = stopwords.CleanString(doc.Content, lang, true)
}
return doc, nil
}