diff --git a/thumbnails/pkg/preprocessor/textanalyzer.go b/thumbnails/pkg/preprocessor/textanalyzer.go index eb139e9f2..8dddfa642 100644 --- a/thumbnails/pkg/preprocessor/textanalyzer.go +++ b/thumbnails/pkg/preprocessor/textanalyzer.go @@ -2,6 +2,7 @@ package preprocessor import ( "unicode" + "unicode/utf8" ) // Default list of scripts to be analyzed within the string. @@ -105,65 +106,72 @@ func NewTextAnalyzer(scriptList []string) TextAnalyzer { // A TextAnalysis will be returned with the result of the analysis. func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalysis { analysis := TextAnalysis{ - RuneCount: make(map[string]int), - Text: word, + ScriptRanges: []ScriptRange{}, + RuneCount: make(map[string]int), + Text: word, } - var lastRange *ScriptRange - runeCount := 0 - for wordIndex, char := range word { + if len(word) < 1 { + return analysis + } + + firstRune, runeLen := utf8.DecodeRuneInString(word) + + lastRange := &ScriptRange{ + Low: 0, + Spaces: make([]int, 0), + TargetScript: ta.chooseScriptFor(firstRune), + } + firstRuneIsWhiteSpace := unicode.Is(unicode.White_Space, firstRune) + if firstRuneIsWhiteSpace { + lastRange.Spaces = append(lastRange.Spaces, 0) + } + + runeCount := 1 + for wordIndex, char := range word[runeLen:] { + wordIndex += runeLen // shifted from the original string script := ta.chooseScriptFor(char) isWhiteSpace := unicode.Is(unicode.White_Space, char) - if lastRange == nil { - runeCount = 1 + if script != lastRange.TargetScript { + if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk { + lastRange.TargetScript = mapScript + if isWhiteSpace { + // TODO: Check if this is dead code. 
+ // whitespace should be part of the "Common" script, and the Common + // script shouldn't be part of a mergeMap + lastRange.Spaces = append(lastRange.Spaces, wordIndex) + } + runeCount++ + continue + } + + lastRange.High = wordIndex - 1 + lastRange.RuneCount = runeCount + analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange) + if _, exists := analysis.RuneCount[lastRange.TargetScript]; !exists { + analysis.RuneCount[lastRange.TargetScript] = 0 + } + analysis.RuneCount[lastRange.TargetScript] += runeCount lastRange = &ScriptRange{ Low: wordIndex, Spaces: make([]int, 0), TargetScript: script, } - } else { - if script != lastRange.TargetScript { - if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk { - lastRange.TargetScript = mapScript - if isWhiteSpace { - // TODO: Check if this is dead code. - // whitespace should be part of the "Common" script, and the Common - // script shouldn't be part of a mergeMap - lastRange.Spaces = append(lastRange.Spaces, wordIndex) - } - runeCount++ - continue - } - - lastRange.High = wordIndex - 1 - lastRange.RuneCount = runeCount - analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange) - if _, exists := analysis.RuneCount[lastRange.TargetScript]; !exists { - analysis.RuneCount[lastRange.TargetScript] = 0 - } - analysis.RuneCount[lastRange.TargetScript] += runeCount - lastRange = &ScriptRange{ - Low: wordIndex, - Spaces: make([]int, 0), - TargetScript: script, - } - runeCount = 0 - } - runeCount++ + runeCount = 0 } + runeCount++ if isWhiteSpace { lastRange.Spaces = append(lastRange.Spaces, wordIndex) } } - if lastRange != nil { - // close the last range - lastRange.High = len(word) - 1 - lastRange.RuneCount = runeCount - analysis.RuneCount[lastRange.TargetScript] += runeCount - analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange) - } + // close the last range + lastRange.High = len(word) - 1 + lastRange.RuneCount = runeCount + 
analysis.RuneCount[lastRange.TargetScript] += runeCount + analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange) + return analysis } @@ -227,13 +235,13 @@ func (ta *TextAnalyzer) getMergeMapValue(opts AnalysisOpts, previous, current st // If the MergeMap isn't needed, use an empty one func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) { var finalRanges []ScriptRange - var previousRange *ScriptRange = &ScriptRange{} if len(tr.ScriptRanges) < 1 { // no ranges -> nothing to do return } + previousRange := &ScriptRange{} *previousRange = tr.ScriptRanges[0] for _, sRange := range tr.ScriptRanges[1:] { if previousRange.TargetScript == sRange.TargetScript { diff --git a/thumbnails/pkg/preprocessor/textanalyzer_test.go b/thumbnails/pkg/preprocessor/textanalyzer_test.go index 02ccfd126..91f980247 100644 --- a/thumbnails/pkg/preprocessor/textanalyzer_test.go +++ b/thumbnails/pkg/preprocessor/textanalyzer_test.go @@ -8,7 +8,7 @@ import ( ) var ( - inputs = [16]string{ + inputs = [18]string{ "basic latin", "trailing tab ", "Small text. 
\"$\", \"£\" and \"¥\" are currencies.", @@ -26,6 +26,8 @@ var ( "ä ä", "базовый русский", // cyrillic script isn't part of our default "latin русский", // latin + cyrillic (cyrillic not supported) + " space justified ", + "", } ) @@ -254,6 +256,28 @@ func TestAnalyzeString(t *testing.T) { Text: inputs[15], }, }, + { + input: inputs[16], // leading, inner and trailing whitespace + opts: defaultOpts, + eOut: TextAnalysis{ + ScriptRanges: []ScriptRange{ + ScriptRange{Low: 0, High: 16, Spaces: []int{0, 6, 16}, TargetScript: "Latin", RuneCount: 17}, + }, + RuneCount: map[string]int{ + "Latin": 17, + }, + Text: inputs[16], + }, + }, + { + input: inputs[17], // empty string + opts: defaultOpts, + eOut: TextAnalysis{ + ScriptRanges: []ScriptRange{}, + RuneCount: map[string]int{}, + Text: inputs[17], + }, + }, } for _, table := range tables { @@ -535,6 +559,47 @@ func TestAnalyzeStringRaw(t *testing.T) { Text: inputs[14], }, }, + { + input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default) + eOut: TextAnalysis{ + ScriptRanges: []ScriptRange{ + ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5}, + ScriptRange{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1}, + ScriptRange{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7}, + }, + RuneCount: map[string]int{ + "Latin": 5, + "Common": 1, + "_unknown": 7, + }, + Text: inputs[15], + }, + }, + { + input: inputs[16], + eOut: TextAnalysis{ + ScriptRanges: []ScriptRange{ + ScriptRange{Low: 0, High: 0, Spaces: []int{0}, TargetScript: "Common", RuneCount: 1}, + ScriptRange{Low: 1, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5}, + ScriptRange{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1}, + ScriptRange{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Latin", RuneCount: 9}, + ScriptRange{Low: 16, High: 16, Spaces: 
[]int{16}, TargetScript: "Common", RuneCount: 1}, + }, + RuneCount: map[string]int{ + "Latin": 14, + "Common": 3, + }, + Text: inputs[16], + }, + }, + { + input: inputs[17], // empty string + eOut: TextAnalysis{ + ScriptRanges: []ScriptRange{}, + RuneCount: map[string]int{}, + Text: inputs[17], + }, + }, } for _, table := range tables {