Files
opencloud/vendor/github.com/bbalet/stopwords/simhash.go
2023-04-19 20:24:34 +02:00

159 lines
4.2 KiB
Go

// Copyright 2015 Benjamin BALET. All rights reserved.
// Use of this source code is governed by the BSD license
// that can be found in the LICENSE file.
// Package stopwords implements Charikar's simhash algorithm to generate a 64-bit
// fingerprint of a given document.
package stopwords
import (
"bytes"
"hash/fnv"
"html"
"golang.org/x/text/language"
"golang.org/x/text/unicode/norm"
)
// vector holds one signed accumulator per bit position of the 64-bit
// fingerprint. For every hashed word, entry i is increased by the word's
// weight when bit i of the word's hash is set and decreased by it otherwise.
type vector [64]int
// feature is an internal pairing of the 64-bit hash of a word with the weight
// that word carries when it is accumulated into a vector.
type feature struct {
// Sum is the 64-bit hash of the word.
Sum uint64
// Weight is the amount added to or subtracted from each vector entry for
// this word.
Weight int
}
// Simhash returns a 64-bit simhash fingerprint of content.
//
// Stop words of the selected language are ignored; every other word
// contributes to the fingerprint (see removeStopWordsAndHash).
//
// langCode is a BCP 47 or ISO 639-1 language code, of which only the base
// language is considered. When the code cannot be parsed, or when no stop-word
// list exists for its base language, the English list is applied.
//
// If cleanHTML is true, every match of remTags is replaced by a space and HTML
// entities are unescaped before the content is hashed.
func Simhash(content []byte, langCode string, cleanHTML bool) uint64 {
	// Strip HTML markup first so that tags and entities are not hashed as words.
	if cleanHTML {
		content = remTags.ReplaceAll(content, []byte(" "))
		content = []byte(html.UnescapeString(string(content)))
	}

	// Reduce the language tag to its base language (e.g. "en-US" -> "en").
	// The confidence returned by Base is deliberately ignored: an unusable
	// code simply ends up in the default branch below.
	tag := language.Make(langCode)
	base, _ := tag.Base()

	// Select the stop-word list of the most frequent words for the language.
	var dict map[string]string
	switch base.String() {
	case "ar":
		dict = arabic
	case "bg":
		dict = bulgarian
	case "cs":
		dict = czech
	case "da":
		dict = danish
	case "de":
		dict = german
	case "el":
		dict = greek
	case "en":
		dict = english
	case "es":
		dict = spanish
	case "fa":
		dict = persian
	case "fr":
		dict = french
	case "fi":
		dict = finnish
	case "hu":
		dict = hungarian
	case "it":
		dict = italian
	case "ja":
		dict = japanese
	case "km":
		dict = khmer
	case "lv":
		dict = latvian
	case "nl":
		dict = dutch
	case "no":
		dict = norwegian
	case "pl":
		dict = polish
	case "pt":
		dict = portuguese
	case "ro":
		dict = romanian
	case "ru":
		dict = russian
	case "sk":
		dict = slovak
	case "sv":
		dict = swedish
	case "th":
		dict = thai
	case "tr":
		dict = turkish
	default:
		// Unknown or unsupported language: fall back to the English filters,
		// as documented, instead of silently returning a zero fingerprint.
		dict = english
	}

	return removeStopWordsAndHash(content, dict)
}
// removeStopWords iterates through a list of words and removes stop words.
func removeStopWordsAndHash(content []byte, dict map[string]string) uint64 {
var v vector
var i int
content = norm.NFC.Bytes(content)
content = bytes.ToLower(content)
words := wordSegmenter.FindAll(content, -1)
for _, w := range words {
if _, ok := dict[string(w)]; !ok {
aFeature := newFeature(w)
sum := aFeature.Sum
weight := aFeature.Weight
for i := uint8(0); i < 64; i++ {
bit := ((sum >> i) & 1)
if bit == 1 {
v[i] += weight
} else {
v[i] -= weight
}
}
i++
}
}
// compute and return the fingerprint of the content
// The fingerprint f of a given 64-dimension vector v is defined as follows:
// f[j] = 1 if v[j] >= 0
// f[j] = 0 if v[j] < 0
var fingerprint uint64
for j := uint8(0); j < 64; j++ {
if v[j] >= 0 {
fingerprint |= (1 << j)
}
}
return fingerprint
}
// newFeature builds the feature of a word: its 64-bit FNV hash paired with a
// fixed weight of 1.
func newFeature(f []byte) feature {
	hasher := fnv.New64()
	// hash.Hash documents that Write never returns an error.
	_, _ = hasher.Write(f)
	return feature{
		Sum:    hasher.Sum64(),
		Weight: 1,
	}
}
// CompareSimhash returns the Hamming distance between a and b, i.e. the number
// of bit positions (0 to 64) in which the two 64-bit fingerprints differ.
//
// The differing bits are counted with Kernighan's method: each iteration
// clears the lowest set bit, so the loop runs once per differing bit.
func CompareSimhash(a uint64, b uint64) uint8 {
	var distance uint8
	diff := a ^ b
	for diff != 0 {
		diff &= diff - 1 // drop the lowest set bit
		distance++
	}
	return distance
}