mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-02-18 03:18:52 -06:00
Refactor to simplify the code
This commit is contained in:
@@ -112,18 +112,7 @@ func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalys
|
||||
|
||||
runeCount := 0
|
||||
for wordIndex, char := range word {
|
||||
script := "_unknown"
|
||||
for scriptIndex, scriptFound := range ta.scriptListCache {
|
||||
// if we can't match with a known script, do nothing and jump to the next char
|
||||
if unicode.Is(ta.scripts[scriptFound], char) {
|
||||
if scriptIndex > 3 {
|
||||
// we might expect more chars with the same script
|
||||
// so move the script first to match it faster next time
|
||||
ta.reorderScriptList(scriptFound)
|
||||
}
|
||||
script = scriptFound
|
||||
}
|
||||
}
|
||||
script := ta.chooseScriptFor(char)
|
||||
|
||||
isWhiteSpace := unicode.Is(unicode.White_Space, char)
|
||||
if lastRange == nil {
|
||||
@@ -135,23 +124,16 @@ func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalys
|
||||
}
|
||||
} else {
|
||||
if script != lastRange.TargetScript {
|
||||
if opts.UseMergeMap {
|
||||
// This option mainly target japanese chars; multiple scripts can be used
|
||||
// in the same piece of text (Han, Hiragana and Katakana)
|
||||
// Instead of starting a new range, adjust the target script of the last range
|
||||
if expCurrent, currentOk := opts.MergeMap[lastRange.TargetScript]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[script]; finalOk {
|
||||
lastRange.TargetScript = expFinal
|
||||
if isWhiteSpace {
|
||||
// TODO: Check if this is dead code.
|
||||
// whitespace should be part of the "Common" script, and the Common
|
||||
// script shouldn't be part of a mergeMap
|
||||
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
|
||||
}
|
||||
runeCount++
|
||||
continue
|
||||
}
|
||||
if mapScript, isOk := ta.getMergeMapValue(opts, lastRange.TargetScript, script); isOk {
|
||||
lastRange.TargetScript = mapScript
|
||||
if isWhiteSpace {
|
||||
// TODO: Check if this is dead code.
|
||||
// whitespace should be part of the "Common" script, and the Common
|
||||
// script shouldn't be part of a mergeMap
|
||||
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
|
||||
}
|
||||
runeCount++
|
||||
continue
|
||||
}
|
||||
|
||||
lastRange.High = wordIndex - 1
|
||||
@@ -166,10 +148,9 @@ func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalys
|
||||
Spaces: make([]int, 0),
|
||||
TargetScript: script,
|
||||
}
|
||||
runeCount = 1
|
||||
} else {
|
||||
runeCount++
|
||||
runeCount = 0
|
||||
}
|
||||
runeCount++
|
||||
}
|
||||
if isWhiteSpace {
|
||||
lastRange.Spaces = append(lastRange.Spaces, wordIndex)
|
||||
@@ -186,6 +167,22 @@ func (ta *TextAnalyzer) AnalyzeString(word string, opts AnalysisOpts) TextAnalys
|
||||
return analysis
|
||||
}
|
||||
|
||||
func (ta *TextAnalyzer) chooseScriptFor(char rune) string {
|
||||
script := "_unknown"
|
||||
for scriptIndex, scriptFound := range ta.scriptListCache {
|
||||
// if we can't match with a known script, do nothing and jump to the next char
|
||||
if unicode.Is(ta.scripts[scriptFound], char) {
|
||||
if scriptIndex > 3 {
|
||||
// we might expect more chars with the same script
|
||||
// so move the script first to match it faster next time
|
||||
ta.reorderScriptList(scriptFound)
|
||||
}
|
||||
return scriptFound
|
||||
}
|
||||
}
|
||||
return script
|
||||
}
|
||||
|
||||
// Reorder the scriptListCache in the TextAnalyzer in order to speed up
|
||||
// the next script searches. A "Latin" script is expected to be surrounded
|
||||
// by "Latin" chars, although "Common" script chars might be present too
|
||||
@@ -203,6 +200,23 @@ func (ta *TextAnalyzer) reorderScriptList(matchedScript string) {
|
||||
}
|
||||
}
|
||||
|
||||
// Get the value from the merge map based on the previous and current scripts.
|
||||
// The information about using the merge map and the actual merge map will be
|
||||
// gotten from the AnalysisOpts passed as parameter
|
||||
func (ta *TextAnalyzer) getMergeMapValue(opts AnalysisOpts, previous, current string) (string, bool) {
|
||||
if opts.UseMergeMap {
|
||||
// This option mainly target japanese chars; multiple scripts can be used
|
||||
// in the same piece of text (Han, Hiragana and Katakana)
|
||||
// Instead of starting a new range, adjust the target script of the last range
|
||||
if expCurrent, currentOk := opts.MergeMap[previous]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[current]; finalOk {
|
||||
return expFinal, finalOk
|
||||
}
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// Change the "Common" script to the one used in the previous script range.
|
||||
// The ranges will be readjusted and merged if they're adjacent.
|
||||
// This naive approach should be good enough for normal use cases
|
||||
@@ -213,52 +227,52 @@ func (ta *TextAnalyzer) reorderScriptList(matchedScript string) {
|
||||
// If the MergeMap isn't needed, use an empty one
|
||||
func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) {
|
||||
var finalRanges []ScriptRange
|
||||
var previousRange *ScriptRange
|
||||
for _, sRange := range tr.ScriptRanges {
|
||||
if previousRange != nil {
|
||||
if previousRange.TargetScript == sRange.TargetScript {
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
} else if sRange.TargetScript == "Common" || sRange.TargetScript == "Inherited" {
|
||||
// new range will be absorbed into the previous one
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
} else if previousRange.TargetScript == "Common" || previousRange.TargetScript == "Inherited" {
|
||||
// might happen if the text starts with a Common script
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
previousRange.TargetScript = sRange.TargetScript
|
||||
} else {
|
||||
if expCurrent, currentOk := mergeMap[previousRange.TargetScript]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[sRange.TargetScript]; finalOk {
|
||||
if sRange.TargetScript == expFinal {
|
||||
// the previous range has changed the target script
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
} else {
|
||||
// new range has been absorbed
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
}
|
||||
previousRange.TargetScript = expFinal
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
continue
|
||||
}
|
||||
}
|
||||
finalRanges = append(finalRanges, *previousRange)
|
||||
*previousRange = sRange
|
||||
}
|
||||
var previousRange *ScriptRange = &ScriptRange{}
|
||||
|
||||
if len(tr.ScriptRanges) < 1 {
|
||||
// no ranges -> nothing to do
|
||||
return
|
||||
}
|
||||
|
||||
*previousRange = tr.ScriptRanges[0]
|
||||
for _, sRange := range tr.ScriptRanges[1:] {
|
||||
if previousRange.TargetScript == sRange.TargetScript {
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
} else if sRange.TargetScript == "Common" || sRange.TargetScript == "Inherited" {
|
||||
// new range will be absorbed into the previous one
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
} else if previousRange.TargetScript == "Common" || previousRange.TargetScript == "Inherited" {
|
||||
// might happen if the text starts with a Common script
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
previousRange.TargetScript = sRange.TargetScript
|
||||
} else {
|
||||
previousRange = &ScriptRange{}
|
||||
if mapScript, isOk := tr.getMergeMapValue(mergeMap, previousRange.TargetScript, sRange.TargetScript); isOk {
|
||||
if sRange.TargetScript == mapScript {
|
||||
// the previous range has changed the target script
|
||||
tr.RuneCount[previousRange.TargetScript] -= previousRange.RuneCount
|
||||
tr.RuneCount[sRange.TargetScript] += previousRange.RuneCount
|
||||
} else {
|
||||
// new range has been absorbed
|
||||
tr.RuneCount[sRange.TargetScript] -= sRange.RuneCount
|
||||
tr.RuneCount[previousRange.TargetScript] += sRange.RuneCount
|
||||
}
|
||||
previousRange.TargetScript = mapScript
|
||||
previousRange.High = sRange.High
|
||||
previousRange.Spaces = append(previousRange.Spaces, sRange.Spaces...)
|
||||
previousRange.RuneCount += sRange.RuneCount
|
||||
continue
|
||||
}
|
||||
finalRanges = append(finalRanges, *previousRange)
|
||||
*previousRange = sRange
|
||||
}
|
||||
}
|
||||
@@ -273,3 +287,15 @@ func (tr *TextAnalysis) MergeCommon(mergeMap MergeMap) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (tr *TextAnalysis) getMergeMapValue(mMap MergeMap, previous, current string) (string, bool) {
|
||||
// This option mainly target japanese chars; multiple scripts can be used
|
||||
// in the same piece of text (Han, Hiragana and Katakana)
|
||||
// Instead of starting a new range, adjust the target script of the last range
|
||||
if expCurrent, currentOk := mMap[previous]; currentOk {
|
||||
if expFinal, finalOk := expCurrent[current]; finalOk {
|
||||
return expFinal, finalOk
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
@@ -7,6 +7,28 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
var (
|
||||
// inputs holds the sample strings shared by the test tables in this file.
// Test cases refer to the entries by index (inputs[0] .. inputs[15]), so
// the order of the entries must not change.
inputs = [16]string{
|
||||
"basic latin",
|
||||
"trailing tab ",
|
||||
"Small text. \"$\", \"£\" and \"¥\" are currencies.",
|
||||
"latin with 🖖",
|
||||
"기본 한국어",
|
||||
"基本的な日本語",
|
||||
"ウーロン茶",
|
||||
"私はエンジニアです",
|
||||
"ティー私はエンジニアです",
|
||||
"私はエンジニアです ティー",
|
||||
"आधारभूत देवनागरी",
|
||||
"mixed 언어 传入 🚀!",
|
||||
"/k͜p/",
|
||||
// a single "ä" char, a space, and then "a" followed by a separate "¨" mark
|
||||
"ä ä",
|
||||
"базовый русский", // Cyrillic script isn't part of our defaults; the tests expect "_unknown" for it
|
||||
"latin русский", // Latin + Cyrillic (Cyrillic isn't supported; the tests expect "_unknown" for it)
|
||||
}
|
||||
)
|
||||
|
||||
func TestAnalyzeString(t *testing.T) {
|
||||
defaultOpts := AnalysisOpts{
|
||||
UseMergeMap: true,
|
||||
@@ -19,7 +41,7 @@ func TestAnalyzeString(t *testing.T) {
|
||||
eOut TextAnalysis
|
||||
}{
|
||||
{
|
||||
input: "basic latin",
|
||||
input: inputs[0],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -28,12 +50,12 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 11,
|
||||
},
|
||||
Text: "basic latin",
|
||||
Text: inputs[0],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "trailing tab ",
|
||||
opts: defaultOpts,
|
||||
input: inputs[1],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 12, Spaces: []int{8, 12}, TargetScript: "Latin", RuneCount: 13},
|
||||
@@ -41,11 +63,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 13,
|
||||
},
|
||||
Text: "trailing tab ",
|
||||
Text: inputs[1],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "Small text. \"$\", \"£\" and \"¥\" are currencies.",
|
||||
input: inputs[2],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -54,11 +76,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 44,
|
||||
},
|
||||
Text: "Small text. \"$\", \"£\" and \"¥\" are currencies.",
|
||||
Text: inputs[2],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "latin with 🖖",
|
||||
input: inputs[3],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -67,11 +89,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 12,
|
||||
},
|
||||
Text: "latin with 🖖",
|
||||
Text: inputs[3],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "기본 한국어",
|
||||
input: inputs[4],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -80,11 +102,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Hangul": 6,
|
||||
},
|
||||
Text: "기본 한국어",
|
||||
Text: inputs[4],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "基本的な日本語",
|
||||
input: inputs[5],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -93,11 +115,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Hiragana": 7,
|
||||
},
|
||||
Text: "基本的な日本語",
|
||||
Text: inputs[5],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ウーロン茶",
|
||||
input: inputs[6],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -106,11 +128,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Katakana": 5,
|
||||
},
|
||||
Text: "ウーロン茶",
|
||||
Text: inputs[6],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "私はエンジニアです",
|
||||
input: inputs[7],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -119,11 +141,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Hiragana": 9,
|
||||
},
|
||||
Text: "私はエンジニアです",
|
||||
Text: inputs[7],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ティー私はエンジニアです",
|
||||
input: inputs[8],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -132,11 +154,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Hiragana": 12,
|
||||
},
|
||||
Text: "ティー私はエンジニアです",
|
||||
Text: inputs[8],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "私はエンジニアです ティー",
|
||||
input: inputs[9],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -145,11 +167,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Hiragana": 13,
|
||||
},
|
||||
Text: "私はエンジニアです ティー",
|
||||
Text: inputs[9],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "आधारभूत देवनागरी",
|
||||
input: inputs[10],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -158,11 +180,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Devanagari": 16,
|
||||
},
|
||||
Text: "आधारभूत देवनागरी",
|
||||
Text: inputs[10],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "mixed 언어 传入 🚀!",
|
||||
input: inputs[11],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -175,11 +197,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
"Hangul": 3,
|
||||
"Han": 5,
|
||||
},
|
||||
Text: "mixed 언어 传入 🚀!",
|
||||
Text: inputs[11],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "/k͜p/",
|
||||
input: inputs[12],
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -188,11 +210,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 5,
|
||||
},
|
||||
Text: "/k͜p/",
|
||||
Text: inputs[12],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ä ä", // ä and a + ¨
|
||||
input: inputs[13], // ä and a + ¨
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -201,11 +223,11 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 4,
|
||||
},
|
||||
Text: "ä ä",
|
||||
Text: inputs[13],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "базовый русский", // cyrillic script isn't part of our default
|
||||
input: inputs[14], // cyrillic script isn't part of our default
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
@@ -214,7 +236,22 @@ func TestAnalyzeString(t *testing.T) {
|
||||
RuneCount: map[string]int{
|
||||
"_unknown": 15,
|
||||
},
|
||||
Text: "базовый русский",
|
||||
Text: inputs[14],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: inputs[15], // latin + cyrillic (cyrillic script isn't part of our default)
|
||||
opts: defaultOpts,
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 5, Spaces: []int{5}, TargetScript: "Latin", RuneCount: 6},
|
||||
ScriptRange{Low: 6, High: 19, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
||||
},
|
||||
RuneCount: map[string]int{
|
||||
"Latin": 6,
|
||||
"_unknown": 7,
|
||||
},
|
||||
Text: inputs[15],
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -240,7 +277,7 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
eOut TextAnalysis
|
||||
}{
|
||||
{
|
||||
input: "basic latin",
|
||||
input: inputs[0],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
||||
@@ -251,11 +288,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Latin": 10,
|
||||
"Common": 1,
|
||||
},
|
||||
Text: "basic latin",
|
||||
Text: inputs[0],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "trailing tab ",
|
||||
input: inputs[1],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 7, Spaces: []int{}, TargetScript: "Latin", RuneCount: 8},
|
||||
@@ -267,11 +304,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Latin": 11,
|
||||
"Common": 2,
|
||||
},
|
||||
Text: "trailing tab ",
|
||||
Text: inputs[1],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "Small text. \"$\", \"£\" and \"¥\" are currencies.",
|
||||
input: inputs[2],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
||||
@@ -289,11 +326,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Latin": 25,
|
||||
"Common": 19,
|
||||
},
|
||||
Text: "Small text. \"$\", \"£\" and \"¥\" are currencies.",
|
||||
Text: inputs[2],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "latin with 🖖",
|
||||
input: inputs[3],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
||||
@@ -305,11 +342,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Latin": 9,
|
||||
"Common": 3,
|
||||
},
|
||||
Text: "latin with 🖖",
|
||||
Text: inputs[3],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "기본 한국어",
|
||||
input: inputs[4],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Hangul", RuneCount: 2},
|
||||
@@ -320,11 +357,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Hangul": 5,
|
||||
"Common": 1,
|
||||
},
|
||||
Text: "기본 한국어",
|
||||
Text: inputs[4],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "基本的な日本語",
|
||||
input: inputs[5],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 8, Spaces: []int{}, TargetScript: "Han", RuneCount: 3},
|
||||
@@ -335,11 +372,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Hiragana": 1,
|
||||
"Han": 6,
|
||||
},
|
||||
Text: "基本的な日本語",
|
||||
Text: inputs[5],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ウーロン茶",
|
||||
input: inputs[6],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 1},
|
||||
@@ -352,11 +389,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Common": 1,
|
||||
"Han": 1,
|
||||
},
|
||||
Text: "ウーロン茶",
|
||||
Text: inputs[6],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "私はエンジニアです",
|
||||
input: inputs[7],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
||||
@@ -369,11 +406,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Hiragana": 3,
|
||||
"Katakana": 5,
|
||||
},
|
||||
Text: "私はエンジニアです",
|
||||
Text: inputs[7],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ティー私はエンジニアです",
|
||||
input: inputs[8],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 5, Spaces: []int{}, TargetScript: "Katakana", RuneCount: 2},
|
||||
@@ -389,11 +426,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Katakana": 7,
|
||||
"Common": 1,
|
||||
},
|
||||
Text: "ティー私はエンジニアです",
|
||||
Text: inputs[8],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "私はエンジニアです ティー",
|
||||
input: inputs[9],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 2, Spaces: []int{}, TargetScript: "Han", RuneCount: 1},
|
||||
@@ -410,11 +447,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Katakana": 7,
|
||||
"Common": 2,
|
||||
},
|
||||
Text: "私はエンジニアです ティー",
|
||||
Text: inputs[9],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "आधारभूत देवनागरी",
|
||||
input: inputs[10],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 20, Spaces: []int{}, TargetScript: "Devanagari", RuneCount: 7},
|
||||
@@ -425,11 +462,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Devanagari": 15,
|
||||
"Common": 1,
|
||||
},
|
||||
Text: "आधारभूत देवनागरी",
|
||||
Text: inputs[10],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "mixed 언어 传入 🚀!",
|
||||
input: inputs[11],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 4, Spaces: []int{}, TargetScript: "Latin", RuneCount: 5},
|
||||
@@ -445,11 +482,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Han": 2,
|
||||
"Common": 5,
|
||||
},
|
||||
Text: "mixed 언어 传入 🚀!",
|
||||
Text: inputs[11],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "/k͜p/",
|
||||
input: inputs[12],
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 0, Spaces: []int{}, TargetScript: "Common", RuneCount: 1},
|
||||
@@ -463,11 +500,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Common": 2,
|
||||
"Inherited": 1,
|
||||
},
|
||||
Text: "/k͜p/",
|
||||
Text: inputs[12],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "ä ä", // ä and a + ¨
|
||||
input: inputs[13], // ä and a + ¨
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 1, Spaces: []int{}, TargetScript: "Latin", RuneCount: 1},
|
||||
@@ -480,11 +517,11 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"Common": 1,
|
||||
"Inherited": 1,
|
||||
},
|
||||
Text: "ä ä",
|
||||
Text: inputs[13],
|
||||
},
|
||||
},
|
||||
{
|
||||
input: "базовый русский", // cyrillic script isn't part of our default
|
||||
input: inputs[14], // cyrillic script isn't part of our default
|
||||
eOut: TextAnalysis{
|
||||
ScriptRanges: []ScriptRange{
|
||||
ScriptRange{Low: 0, High: 13, Spaces: []int{}, TargetScript: "_unknown", RuneCount: 7},
|
||||
@@ -495,7 +532,7 @@ func TestAnalyzeStringRaw(t *testing.T) {
|
||||
"_unknown": 14,
|
||||
"Common": 1,
|
||||
},
|
||||
Text: "базовый русский",
|
||||
Text: inputs[14],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user