Simplify code and add new tests

This commit is contained in:
Juan Pablo Villafáñez
2021-11-12 11:18:27 +01:00
parent 5dcdac6680
commit 8dbf0f27f4
2 changed files with 118 additions and 45 deletions

View File

@@ -2,6 +2,7 @@ package preprocessor
import (
"unicode"
"unicode/utf8"
)
// Default list of scripts to be analyzed within the string.
@@ -105,65 +106,72 @@ func NewTextAnalyzer(scriptList []string) TextAnalyzer {
// A TextAnalysis will be returned with the result of the analysis.
func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalysis {
analysis := TextAnalysis{
RuneCount: make(map[string]int),
Text: word,
ScriptRanges: []ScriptRange{},
RuneCount: make(map[string]int),
Text: word,
}
var lastRange *ScriptRange
runeCount := 0
for wordIndex, char := range word {
if len(word) < 1 {
return analysis
}
firstRune, runeLen := utf8.DecodeRuneInString(word)
lastRange := &ScriptRange{
Low: 0,
Spaces: make([]int, 0),
TargetScript: ta.chooseScriptFor(firstRune),
}
firstRuneIsWhiteSpace := unicode.Is(unicode.White_Space, firstRune)
if firstRuneIsWhiteSpace {
lastRange.Spaces = append(lastRange.Spaces, 0)
}
runeCount := 1
for wordIndex, char := range word[runeLen:] {
wordIndex += runeLen // shifted from the original string
script := ta.chooseScriptFor(char)
isWhiteSpace := unicode.Is(unicode.White_Space, char)
if lastRange == nil {
runeCount = 1
if script != lastRange.TargetScript {
if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk {
lastRange.TargetScript = mapScript
if isWhiteSpace {
// TODO: Check if this is dead code.
// whitespace should be part of the "Common" script, and the Common
// script shouldn't be part of a mergeMap
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
}
runeCount++
continue
}
lastRange.High = wordIndex - 1
lastRange.RuneCount = runeCount
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
if _, exists := analysis.RuneCount[lastRange.TargetScript]; !exists {
analysis.RuneCount[lastRange.TargetScript] = 0
}
analysis.RuneCount[lastRange.TargetScript] += runeCount
lastRange = &ScriptRange{
Low: wordIndex,
Spaces: make([]int, 0),
TargetScript: script,
}
} else {
if script != lastRange.TargetScript {
if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk {
lastRange.TargetScript = mapScript
if isWhiteSpace {
// TODO: Check if this is dead code.
// whitespace should be part of the "Common" script, and the Common
// script shouldn't be part of a mergeMap
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
}
runeCount++
continue
}
lastRange.High = wordIndex - 1
lastRange.RuneCount = runeCount
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
if _, exists := analysis.RuneCount[lastRange.TargetScript]; !exists {
analysis.RuneCount[lastRange.TargetScript] = 0
}
analysis.RuneCount[lastRange.TargetScript] += runeCount
lastRange = &ScriptRange{
Low: wordIndex,
Spaces: make([]int, 0),
TargetScript: script,
}
runeCount = 0
}
runeCount++
runeCount = 0
}
runeCount++
if isWhiteSpace {
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
}
}
if lastRange != nil {
// close the last range
lastRange.High = len(word) - 1
lastRange.RuneCount = runeCount
analysis.RuneCount[lastRange.TargetScript] += runeCount
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
}
// close the last range
lastRange.High = len(word) - 1
lastRange.RuneCount = runeCount
analysis.RuneCount[lastRange.TargetScript] += runeCount
analysis.ScriptRanges = append(analysis.ScriptRanges, *lastRange)
return analysis
}
@@ -227,13 +235,13 @@ func (ta *TextAnalyzer) getMergeMapValue(opts AnalysisOpts, previous, current st
// If the MergeMap isn't needed, use an empty one
func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) {
var finalRanges []ScriptRange
var previousRange *ScriptRange = &ScriptRange{}
if len(tr.ScriptRanges) < 1 {
// no ranges -> nothing to do
return
}
previousRange := &ScriptRange{}
*previousRange = tr.ScriptRanges[0]
for _, sRange := range tr.ScriptRanges[1:] {
if previousRange.TargetScript == sRange.TargetScript {

View File

@@ -8,7 +8,7 @@ import (
)
var (
inputs = [16]string{
inputs = [18]string{
"basic latin",
"trailing tab ",
"Small text. \"$\", \"£\" and \"¥\" are currencies.",
@@ -26,6 +26,8 @@ var (
"ä ä",
"базовый русский", // cyrillic script isn't part of our default
"latin русский", // latin + cyrillic (cyrillic not supported)
" space justified ",
"",
}
)
@@ -254,6 +256,28 @@ func TestAnalyzeString(t *testing.T) {
Text: inputs[15],
},
},
{
input: inputs[16], // latin text with leading and trailing spaces (all merged into a single Latin range)
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
ScriptRange{Low: 0, High: 16, Spaces: []int{0, 6, 16}, TargetScript: "Latin", RuneCount: 17},
},
RuneCount: map[string]int{
"Latin": 17,
},
Text: inputs[16],
},
},
{
input: inputs[17], // empty string
opts: defaultOpts,
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{},
RuneCount: map[string]int{},
Text: inputs[17],
},
},
}
for _, table := range tables {
@@ -535,6 +559,47 @@ func TestAnalyzeStringRaw(t *testing.T) {
Text: inputs[14],
},
},
{
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
ScriptRange{Low: 5, High: 5, Spaces: []int{5}, TargetScript: "Common", RuneCount: 1},
ScriptRange{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
},
RuneCount: map[string]int{
"Latin": 5,
"Common": 1,
"_unknown": 7,
},
Text: inputs[15],
},
},
{
input: inputs[16],
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{
ScriptRange{Low: 0, High: 0, Spaces: []int{0}, TargetScript: "Common", RuneCount: 1},
ScriptRange{Low: 1, High: 5, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
ScriptRange{Low: 6, High: 6, Spaces: []int{6}, TargetScript: "Common", RuneCount: 1},
ScriptRange{Low: 7, High: 15, Spaces: []int{}, TargetScript: "Latin", RuneCount: 9},
ScriptRange{Low: 16, High: 16, Spaces: []int{16}, TargetScript: "Common", RuneCount: 1},
},
RuneCount: map[string]int{
"Latin": 14,
"Common": 3,
},
Text: inputs[16],
},
},
{
input: inputs[17], // empty string
eOut: TextAnalysis{
ScriptRanges: []ScriptRange{},
RuneCount: map[string]int{},
Text: inputs[17],
},
},
}
for _, table := range tables {