mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-04-28 23:09:46 -05:00
rename folder extensions -> services
Signed-off-by: Christian Richter <crichter@owncloud.com>
This commit is contained in:
@@ -0,0 +1,309 @@
|
||||
package preprocessor
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// DefaultScripts is the stock list of Unicode script names handed to
// NewTextAnalyzer. The slice order is the initial lookup order used when a
// rune is classified.
//
// The analyzer in this file labels a rune that matches none of the listed
// scripts as "_unknown". For example, an "Avestan" rune (not listed) that is
// preceded by "Arabic" text ends up in its own "_unknown" range.
// NOTE(review): an earlier revision of this comment said such runes are
// folded into the preceding known script; the analyzer does not do that
// itself - confirm whether a consumer does.
//
// Punctuation symbols are usually considered part of the "Common" script.
var DefaultScripts = []string{
	"Arabic", "Common", "Devanagari",
	"Han", "Hangul", "Hiragana",
	"Inherited", "Katakana", "Latin",
}
|
||||
|
||||
// MergeMap describes which pairs of adjacent scripts collapse into a single
// script. It exists mainly for Japanese text, which mixes the "Han",
// "Hiragana" and "Katakana" scripts.
//
// The outer key is the script of the previous range, the inner key is the
// script that follows it, and the value is the script assigned to the merged
// range. A pair that is absent from the map is not merged.
type MergeMap map[string]map[string]string
|
||||
|
||||
// The default mergeMap containing info for the japanese scripts
|
||||
var DefaultMergeMap = MergeMap{
|
||||
"Han": map[string]string{
|
||||
"Hiragana": "Hiragana",
|
||||
"Katakana": "Katakana",
|
||||
},
|
||||
"Hiragana": map[string]string{
|
||||
"Han": "Hiragana",
|
||||
"Katakana": "Hiragana",
|
||||
},
|
||||
"Katakana": map[string]string{
|
||||
"Han": "Katakana",
|
||||
"Hiragana": "Hiragana",
|
||||
},
|
||||
}
|
||||
|
||||
// Analysis options.
|
||||
type AnalysisOpts struct {
|
||||
UseMergeMap bool
|
||||
MergeMap MergeMap
|
||||
}
|
||||
|
||||
// ScriptRange is a slice of an analyzed string that is attributed to one
// script. It only makes sense together with the string it was computed from,
// which may contain several ranges.
type ScriptRange struct {
	// Low is the byte offset of the first byte of the range (inclusive).
	Low int
	// High is the byte offset of the last byte of the range (inclusive).
	High int
	// Spaces holds the byte offsets, inside the range, of the runes that
	// are considered white space.
	Spaces []int
	// TargetScript is the name of the script the range is attributed to.
	TargetScript string
	// RuneCount is the number of runes in the range (mostly useful for
	// debugging).
	RuneCount int
}
|
||||
|
||||
// The result of a text analysis. It contains the analyzed text, a list of
|
||||
// script ranges (see the ScriptRange type) and a map containing how many
|
||||
// runes have been detected for a particular script.
|
||||
type TextAnalysis struct {
|
||||
ScriptRanges []ScriptRange
|
||||
RuneCount map[string]int
|
||||
Text string
|
||||
}
|
||||
|
||||
// TextAnalyzer classifies the runes of a string by Unicode script. All of its
// state is private, so build it with NewTextAnalyzer.
//
// Classifying a rune may rewrite scriptListCache, and nothing in this type
// synchronizes that write, so one analyzer should not be shared between
// goroutines without external locking.
type TextAnalyzer struct {
	// scripts maps a script name to its Unicode range table.
	scripts map[string]*unicode.RangeTable
	// scriptListCache is the order in which scripts are tried; recently
	// matched scripts get moved to the front.
	scriptListCache []string
}
|
||||
|
||||
// Create a new TextAnalyzer. A list of scripts must be provided.
|
||||
// You can use the "DefaultScripts" variable for a default list,
|
||||
// although it doesn't contain all the available scripts.
|
||||
// See the unicode.Scripts variable (in the unicode package) for a
|
||||
// full list. Note that using invalid scripts will cause an undefined
|
||||
// behavior
|
||||
func NewTextAnalyzer(scriptList []string) TextAnalyzer {
|
||||
scriptRanges := make(map[string]*unicode.RangeTable, len(scriptList))
|
||||
for _, script := range scriptList {
|
||||
scriptRanges[script] = unicode.Scripts[script]
|
||||
}
|
||||
return TextAnalyzer{
|
||||
scripts: scriptRanges,
|
||||
scriptListCache: scriptList,
|
||||
}
|
||||
}
|
||||
|
||||
// Analyze the target string using the specified options.
|
||||
// A TextAnalysis will be returned with the result of the analysis.
|
||||
func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalysis {
|
||||
analysis := TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{},
|
||||
RuneCount: make(map[string]int),
|
||||
Text: word,
|
||||
}
|
||||
|
||||
if len(word) < 1 {
|
||||
return analysis
|
||||
}
|
||||
|
||||
firstRune, runeLen := utf8.DecodeRuneInString(word)
|
||||
|
||||
lastRange := &ScriptRange{
|
||||
Low: 0,
|
||||
Spaces: make([]int, 0),
|
||||
TargetScript: ta.chooseScriptFor(firstRune),
|
||||
}
|
||||
firstRuneIsWhiteSpace := unicode.Is(unicode.White_Space, firstRune)
|
||||
if firstRuneIsWhiteSpace {
|
||||
lastRange.Spaces = append(lastRange.Spaces, 0)
|
||||
}
|
||||
|
||||
runeCount := 1
|
||||
for wordIndex, char := range word[runeLen:] {
|
||||
wordIndex += runeLen // shifted from the original string
|
||||
script := ta.chooseScriptFor(char)
|
||||
|
||||
isWhiteSpace := unicode.Is(unicode.White_Space, char)
|
||||
if script != lastRange.TargetScript {
|
||||
if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk {
|
||||
lastRange.TargetScript = mapScript
|
||||
if isWhiteSpace {
|
||||
// TODO: Check if this is dead code.
|
||||
// whitespace should be part of the "Common" script, and the Common
|
||||
// script shouldn't be part of a mergeMap
|
||||
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
|
||||
}
|
||||
runeCount++
|
||||
continue
|
||||
}
|
||||
|
||||
lastRange.High = wordIndex - 1
|
||||
lastRange.RuneCount = runeCount
|
||||
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
|
||||
if _, exists := analysis.RuneCount[lastRange.TargetScript]; !exists {
|
||||
analysis.RuneCount[lastRange.TargetScript] = 0
|
||||
}
|
||||
analysis.RuneCount[lastRange.TargetScript] += runeCount
|
||||
lastRange = &ScriptRange{
|
||||
Low: wordIndex,
|
||||
Spaces: make([]int, 0),
|
||||
TargetScript: script,
|
||||
}
|
||||
runeCount = 0
|
||||
}
|
||||
runeCount++
|
||||
if isWhiteSpace {
|
||||
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
|
||||
}
|
||||
}
|
||||
|
||||
// close the last range
|
||||
lastRange.High = len(word) - 1
|
||||
lastRange.RuneCount = runeCount
|
||||
analysis.RuneCount[lastRange.TargetScript] += runeCount
|
||||
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
|
||||
|
||||
return analysis
|
||||
}
|
||||
|
||||
func (ta *TextAnalyzer) chooseScriptFor(char rune) string {
|
||||
script := "_unknown"
|
||||
for scriptIndex, scriptFound := range ta.scriptListCache {
|
||||
// if we can't match with a known script, do nothing and jump to the next char
|
||||
if unicode.Is(ta.scripts[scriptFound], char) {
|
||||
if scriptIndex > 3 {
|
||||
// we might expect more chars with the same script
|
||||
// so move the script first to match it faster next time
|
||||
ta.reorderScriptList(scriptFound)
|
||||
}
|
||||
return scriptFound
|
||||
}
|
||||
}
|
||||
return script
|
||||
}
|
||||
|
||||
// Reorder the scriptListCache in the TextAnalyzer in order to speed up
|
||||
// the next script searches. A "Latin" script is expected to be surrounded
|
||||
// by "Latin" chars, although "Common" script chars might be present too
|
||||
func (ta *TextAnalyzer) reorderScriptList(matchedScript string) {
|
||||
for index, script := range ta.scriptListCache {
|
||||
if script == matchedScript {
|
||||
if index != 0 {
|
||||
// move the script to the first position for a faster matching
|
||||
newList := append([]string{script}, ta.scriptListCache[:index]...)
|
||||
ta.scriptListCache = append(newList, ta.scriptListCache[index+1:]...)
|
||||
}
|
||||
// if index == 0 there is nothing to do: the element is already the first
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get the value from the merge map based on the previous and current scripts.
|
||||
// The information about using the merge map and the actual merge map will be
|
||||
// gotten from the AnalysisOpts passed as parameter
|
||||
func (ta *TextAnalyzer) getMergeMapValue(opts AnalysisOpts, previous, current string) (string, bool) {
|
||||
if opts.UseMergeMap {
|
||||
// This option mainly target japanese chars; multiple scripts can be used
|
||||
// in the same piece of text (Han, Hiragana and Katakana)
|
||||
// Instead of starting a new range, adjust the target script of the last range
|
||||
if expCurrent, currentOk := opts.MergeMap[previous]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[current]; finalOk {
|
||||
return expFinal, finalOk
|
||||
}
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// Change the "Common" script to the one used in the previous script range.
|
||||
// The ranges will be readjusted and merged if they're adjacent.
|
||||
// This naive approach should be good enough for normal use cases
|
||||
//
|
||||
// The MergeMap is needed in case of the japanese language: the ranges
|
||||
// "Han"-"Common"-"Katakana" might be replaced to "Han"-"Hiragana"-"Katakana"
|
||||
// However, the ranges should be merged together into a big "Hiragana" range.
|
||||
// If the MergeMap isn't needed, use an empty one
|
||||
func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) {
|
||||
var finalRanges []ScriptRange
|
||||
|
||||
if len(tr.ScriptRanges) < 1 {
|
||||
// no ranges -> nothing to do
|
||||
return
|
||||
}
|
||||
|
||||
previousRange := &ScriptRange{}
|
||||
*previousRange = tr.ScriptRanges[0]
|
||||
for _, sRange := range tr.ScriptRanges[1:] {
|
||||
if previousRange.TargetScript == sRange.TargetScript {
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
} else if sRange.TargetScript == "Common" || sRange.TargetScript == "Inherited" {
|
||||
// new range will be absorbed into the previous one
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
} else if previousRange.TargetScript == "Common" || previousRange.TargetScript == "Inherited" {
|
||||
// might happen if the text starts with a Common script
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
previousRange.TargetScript = sRange.TargetScript
|
||||
} else {
|
||||
if mapScript, isOk := tr.getMergeMapValue(mergeMap, previousRange.TargetScript, sRange.TargetScript); isOk {
|
||||
if sRange.TargetScript == mapScript {
|
||||
// the previous range has changed the target script
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
} else {
|
||||
// new range has been absorbed
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
}
|
||||
previousRange.TargetScript = mapScript
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
continue
|
||||
}
|
||||
finalRanges = append(finalRanges, *previousRange)
|
||||
*previousRange = sRange
|
||||
}
|
||||
}
|
||||
|
||||
finalRanges = append(finalRanges, *previousRange)
|
||||
tr.ScriptRanges = finalRanges
|
||||
delete(tr.RuneCount, "Common")
|
||||
delete(tr.RuneCount, "Inherited")
|
||||
for index, rCount := range tr.RuneCount {
|
||||
if rCount == 0 {
|
||||
delete(tr.RuneCount, index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (tr *TextAnalysis) getMergeMapValue(mMap MergeMap, previous, current string) (string, bool) {
|
||||
// This option mainly target japanese chars; multiple scripts can be used
|
||||
// in the same piece of text (Han, Hiragana and Katakana)
|
||||
// Instead of starting a new range, adjust the target script of the last range
|
||||
if expCurrent, currentOk := mMap[previous]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[current]; finalOk {
|
||||
return expFinal, finalOk
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
Reference in New Issue
Block a user