In JSON chunker, allow chunking in the middle of large strings, and add a new location type representing the inside of a string.

This commit is contained in:
Nick Tobey
2025-01-07 18:39:10 -08:00
parent d1e7752ed6
commit 9d29928bc7
2 changed files with 58 additions and 11 deletions

View File

@@ -16,7 +16,6 @@ package tree
import (
"bytes"
"cmp"
"fmt"
"slices"
"strconv"
@@ -69,8 +68,25 @@ const (
objectInitialElement
arrayInitialElement
endOfValue
middleOfString
)
// compareJsonPathTypes orders two scanner states. startOfValue sorts before
// every other state and endOfValue sorts after every other state; any other
// pair of states (including two identical states) compares as equal.
//
// It returns -1 when left sorts first, 1 when right sorts first, and 0 when
// the two states are not ordered relative to each other.
func compareJsonPathTypes(left, right jsonPathType) int {
	// rank collapses every state into one of three ordered buckets.
	rank := func(t jsonPathType) int {
		switch t {
		case startOfValue:
			return -1
		case endOfValue:
			return 1
		default:
			return 0
		}
	}

	leftRank, rightRank := rank(left), rank(right)
	switch {
	case leftRank < rightRank:
		return -1
	case leftRank > rightRank:
		return 1
	default:
		return 0
	}
}
// isInitialElement reports whether t is one of the two initial-element
// states: objectInitialElement or arrayInitialElement.
func (t jsonPathType) isInitialElement() bool {
	switch t {
	case objectInitialElement, arrayInitialElement:
		return true
	default:
		return false
	}
}
@@ -170,7 +186,7 @@ func isUnsupportedJsonArrayIndex(index []byte) bool {
}
func errorIfNotSupportedLocation(key []byte) error {
if jsonPathType(key[0]) > endOfValue {
if jsonPathType(key[0]) > middleOfString {
return unknownLocationKeyError
}
return nil
@@ -336,6 +352,10 @@ func (p *jsonLocation) getScannerState() jsonPathType {
return jsonPathType(p.key[0])
}
// IsMiddleOfString reports whether the scanner state recorded in this
// location is middleOfString.
func (p jsonLocation) IsMiddleOfString() bool {
	state := p.getScannerState()
	return state == middleOfString
}
type jsonPathElement struct {
key []byte
isArrayIndex bool
@@ -429,7 +449,7 @@ func compareJsonLocations(left, right jsonLocation) int {
return -1
}
// left and right have the exact same key elements
return cmp.Compare(left.getScannerState(), right.getScannerState())
return compareJsonPathTypes(left.getScannerState(), right.getScannerState())
}

View File

@@ -118,6 +118,14 @@ func (s *JsonScanner) AdvanceToNextLocation() error {
} else {
return s.acceptNextKeyValue()
}
case middleOfString:
_, finishedString, err := s.acceptRestOfString()
if finishedString {
s.currentPath.setScannerState(endOfValue)
} else {
s.currentPath.setScannerState(middleOfString)
}
return err
default:
return jsonParseError
}
@@ -127,11 +135,16 @@ func (s *JsonScanner) acceptValue() error {
current := s.current()
switch current {
case '"':
_, err := s.acceptString()
_, finishedString, err := s.acceptString()
if err != nil {
return err
}
s.currentPath.setScannerState(endOfValue)
if finishedString {
s.currentPath.setScannerState(endOfValue)
} else {
s.currentPath.setScannerState(middleOfString)
}
return nil
case '[':
s.valueOffset++
@@ -177,22 +190,33 @@ func (s *JsonScanner) accept(b byte) error {
return nil
}
func (s *JsonScanner) acceptString() ([]byte, error) {
err := s.accept('"')
func (s *JsonScanner) acceptString() (stringBytes []byte, finishedString bool, err error) {
err = s.accept('"')
if err != nil {
return nil, err
return nil, false, err
}
return s.acceptRestOfString()
}
func (s *JsonScanner) acceptRestOfString() (stringBytes []byte, finishedString bool, err error) {
stringStart := s.valueOffset
for s.current() != '"' {
stringLength := 0
for s.current() != '"' && stringLength < 1000 {
switch s.current() {
case '\\':
s.valueOffset++
}
s.valueOffset++
stringLength++
}
result := s.jsonBuffer[stringStart:s.valueOffset]
if stringLength == 1000 {
// Split the segment here, so that the chunk doesn't get too large.
return result, false, nil
}
// Advance past the ending quotes
s.valueOffset++
return result, nil
return result, true, nil
}
func (s *JsonScanner) acceptKeyValue() error {
@@ -228,7 +252,10 @@ func (s *JsonScanner) acceptNextKeyValue() error {
}
func (s *JsonScanner) acceptObjectKey() error {
objectKey, err := s.acceptString()
objectKey, finishedString, err := s.acceptString()
if !finishedString {
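// TODO: the key is longer than the per-call limit in acceptRestOfString, so objectKey holds only a prefix and the scanner is still positioned inside the key. This case is not handled yet.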
}
if err != nil {
return err
}