mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-01-05 03:40:01 -06:00
159 lines
4.2 KiB
Go
159 lines
4.2 KiB
Go
// Copyright 2015 Benjamin BALET. All rights reserved.
|
|
// Use of this source code is governed by the BSD license
|
|
// that can be found in the LICENSE file.
|
|
|
|
// Package stopwords implements Charikar's simhash algorithm to generate a 64-bit
|
|
// fingerprint of a given document.
|
|
package stopwords
|
|
|
|
import (
|
|
"bytes"
|
|
"hash/fnv"
|
|
"html"
|
|
|
|
"golang.org/x/text/language"
|
|
"golang.org/x/text/unicode/norm"
|
|
)
|
|
|
|
// vector is the accumulator used while building a simhash: it holds one
// signed counter for each of the 64 bit positions of the fingerprint.
type vector [64]int
|
|
|
|
// feature is the internal representation of a single word: its 64-bit hash
// together with the weight the word carries.
type feature struct {
	Sum    uint64 // 64-bit hash of the word
	Weight int    // weight of the word
}
|
|
|
|
// Simhash returns a 64-bit simhash representing the content of the string
|
|
// removes useless spaces and stop words from a byte slice.
|
|
// BCP 47 or ISO 639-1 language code (if unknown, we'll apply english filters).
|
|
// If cleanHTML is TRUE, remove HTML tags from content and unescape HTML entities.
|
|
func Simhash(content []byte, langCode string, cleanHTML bool) uint64 {
|
|
//Remove HTML tags
|
|
if cleanHTML {
|
|
content = remTags.ReplaceAll(content, []byte(" "))
|
|
content = []byte(html.UnescapeString(string(content)))
|
|
}
|
|
|
|
//Parse language
|
|
tag := language.Make(langCode)
|
|
base, _ := tag.Base()
|
|
langCode = base.String()
|
|
var hash uint64
|
|
|
|
//Remove stop words by using a list of most frequent words
|
|
switch langCode {
|
|
case "ar":
|
|
hash = removeStopWordsAndHash(content, arabic)
|
|
case "bg":
|
|
hash = removeStopWordsAndHash(content, bulgarian)
|
|
case "cs":
|
|
hash = removeStopWordsAndHash(content, czech)
|
|
case "da":
|
|
hash = removeStopWordsAndHash(content, danish)
|
|
case "de":
|
|
hash = removeStopWordsAndHash(content, german)
|
|
case "el":
|
|
hash = removeStopWordsAndHash(content, greek)
|
|
case "en":
|
|
hash = removeStopWordsAndHash(content, english)
|
|
case "es":
|
|
hash = removeStopWordsAndHash(content, spanish)
|
|
case "fa":
|
|
hash = removeStopWordsAndHash(content, persian)
|
|
case "fr":
|
|
hash = removeStopWordsAndHash(content, french)
|
|
case "fi":
|
|
hash = removeStopWordsAndHash(content, finnish)
|
|
case "hu":
|
|
hash = removeStopWordsAndHash(content, hungarian)
|
|
case "it":
|
|
hash = removeStopWordsAndHash(content, italian)
|
|
case "ja":
|
|
hash = removeStopWordsAndHash(content, japanese)
|
|
case "km":
|
|
hash = removeStopWordsAndHash(content, khmer)
|
|
case "lv":
|
|
hash = removeStopWordsAndHash(content, latvian)
|
|
case "nl":
|
|
hash = removeStopWordsAndHash(content, dutch)
|
|
case "no":
|
|
hash = removeStopWordsAndHash(content, norwegian)
|
|
case "pl":
|
|
hash = removeStopWordsAndHash(content, polish)
|
|
case "pt":
|
|
hash = removeStopWordsAndHash(content, portuguese)
|
|
case "ro":
|
|
hash = removeStopWordsAndHash(content, romanian)
|
|
case "ru":
|
|
hash = removeStopWordsAndHash(content, russian)
|
|
case "sk":
|
|
hash = removeStopWordsAndHash(content, slovak)
|
|
case "sv":
|
|
hash = removeStopWordsAndHash(content, swedish)
|
|
case "th":
|
|
hash = removeStopWordsAndHash(content, thai)
|
|
case "tr":
|
|
hash = removeStopWordsAndHash(content, turkish)
|
|
}
|
|
|
|
return hash
|
|
}
|
|
|
|
// removeStopWords iterates through a list of words and removes stop words.
|
|
func removeStopWordsAndHash(content []byte, dict map[string]string) uint64 {
|
|
var v vector
|
|
var i int
|
|
|
|
content = norm.NFC.Bytes(content)
|
|
content = bytes.ToLower(content)
|
|
words := wordSegmenter.FindAll(content, -1)
|
|
|
|
for _, w := range words {
|
|
if _, ok := dict[string(w)]; !ok {
|
|
aFeature := newFeature(w)
|
|
sum := aFeature.Sum
|
|
weight := aFeature.Weight
|
|
for i := uint8(0); i < 64; i++ {
|
|
bit := ((sum >> i) & 1)
|
|
if bit == 1 {
|
|
v[i] += weight
|
|
} else {
|
|
v[i] -= weight
|
|
}
|
|
}
|
|
i++
|
|
}
|
|
}
|
|
|
|
// compute and return the fingerprint of the content
|
|
// The fingerprint f of a given 64-dimension vector v is defined as follows:
|
|
// f[j] = 1 if v[j] >= 0
|
|
// f[j] = 0 if v[j] < 0
|
|
var fingerprint uint64
|
|
for j := uint8(0); j < 64; j++ {
|
|
if v[j] >= 0 {
|
|
fingerprint |= (1 << j)
|
|
}
|
|
}
|
|
return fingerprint
|
|
}
|
|
|
|
// Returns a new feature representing the given byte slice, using a weight of 1
|
|
func newFeature(f []byte) feature {
|
|
h := fnv.New64()
|
|
h.Write(f)
|
|
return feature{h.Sum64(), 1}
|
|
}
|
|
|
|
// CompareSimhash returns the Hamming distance between two 64-bit simhashes,
// i.e. the number of bit positions in which a and b differ (0 to 64).
//
// The differing bits are counted with Kernighan's method: each iteration
// clears the lowest set bit of a XOR b, so the loop runs once per differing
// bit.
func CompareSimhash(a uint64, b uint64) uint8 {
	var distance uint8

	diff := a ^ b
	for diff != 0 {
		diff &= diff - 1 // drop the lowest set bit
		distance++
	}

	return distance
}
|