Files
dolt/go/store/val/tuple.go
Maximilian Hoffman 6f61ac5a08 [store] row length guards for prolly serial messages (#7533)
* Add row length guards

* tidy

* bump

* more check to integrator

* default panic

* comments

* bump

* better names

* more edits

* merge main

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* smaller default noms string type

* bats tests valid row lengths

* more tests

* more row len tests

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

---------

Co-authored-by: max-hoffman <max-hoffman@users.noreply.github.com>
2024-02-26 17:46:59 -08:00

226 lines
5.8 KiB
Go

// Copyright 2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package val
import (
"math"
"github.com/dolthub/dolt/go/store/hash"
"github.com/dolthub/dolt/go/store/pool"
)
const (
// MaxTupleFields is the largest number of fields NewTuple accepts; NewTuple
// panics when given more values than this after trimming trailing NULLs.
MaxTupleFields = 4096
// countSize is the width in bytes of the field count stored in the last
// bytes of every Tuple.
countSize ByteSize = 2
// nodeCountSize and treeLevelSize are the widths of the node count and tree
// level metadata that are subtracted from the budget in MaxTupleDataSize.
nodeCountSize = uint64Size
treeLevelSize = uint8Size
// MaxTupleDataSize is the maximum KV length considering the extra
// flatbuffer metadata required to serialize the message. This number
// implicitly checks the "last row" size that will append chunk level
// metadata. Key and value offsets per field are excluded from this number.
// (uint16) - (field count) - (content hash) - (node count) - (tree level)
MaxTupleDataSize ByteSize = math.MaxUint16 - countSize - hash.ByteLen - nodeCountSize - treeLevelSize
)
// A Tuple is a vector of fields encoded as a byte slice. Key-Value Tuple pairs
// are used to store row data within clustered and secondary indexes in Dolt.
//
// The encoding format for Tuples starts with field values packed contiguously from
// the front of the Tuple, followed by field offsets, and finally a field count:
//
// +---------+---------+-----+---------+----------+-----+----------+-------+
// | Value 0 | Value 1 | ... | Value K | Offset 1 | ... | Offset K | Count |
// +---------+---------+-----+---------+----------+-----+----------+-------+
//
// Field offsets encode the byte-offset from the front of the Tuple to the beginning
// of the corresponding field in the Tuple. The offset for the first field is always
// zero and is therefore omitted. Offsets and the field count are little-endian
// encoded uint16 values.
//
// Tuples read and write field values as byte slices. Interpreting these encoded
// values is left up to TupleDesc which knows about a Tuple's schema and associated
// field encodings. Zero-length fields are interpreted as NULL values, all non-NULL
// values must be encoded with non-zero length. For this reason, variable-length
// strings are encoded with a NUL terminator (see codec.go).
//
// Accessing the ith field where i >= count will return a NULL value. This allows us
// to implicitly add nullable columns to the end of a schema without needing to
// rewrite index storage. However, because Dolt storage is content-addressed, we
// must have a single canonical encoding for any given Tuple. For this reason, the
// NULL suffix of a Tuple is explicitly truncated and the field count reduced.
type Tuple []byte
// EmptyTuple is the encoding of a Tuple with no fields: it consists solely of
// the two-byte field count, which is zero.
var EmptyTuple = Tuple([]byte{0, 0})
// NewTuple encodes |values| as a Tuple whose buffer is obtained from |pool|.
//
// Trailing nil values are trimmed before encoding and are therefore not
// reflected in the stored field count. A nil value in the interior is written
// as a NULL field: it contributes an offset but no bytes to the value region.
//
// NewTuple panics if more than MaxTupleFields values remain after trimming, or
// if the combined length of the values exceeds MaxTupleDataSize.
func NewTuple(pool pool.BuffPool, values ...[]byte) Tuple {
	values = trimNullSuffix(values)
	if len(values) > MaxTupleFields {
		panic("tuple field maxIdx exceeds maximum")
	}

	// Accumulate the payload size in a uint64 and compare before narrowing to
	// ByteSize. Summing directly in ByteSize (which appears to be 16 bits wide,
	// given MaxTupleDataSize is derived from math.MaxUint16) would let oversized
	// input wrap around and slip past this guard.
	var dataSz uint64
	for _, v := range values {
		dataSz += uint64(len(v)) // len(nil) == 0, so NULL fields add nothing
		if dataSz > uint64(MaxTupleDataSize) {
			panic("tuple data size exceeds maximum")
		}
	}

	tup, offs := allocateTuple(pool, ByteSize(dataSz), len(values))

	pos := ByteSize(0)
	for i, v := range values {
		// Every field after the first records where it starts, NULL or not.
		writeOffset(i, pos, offs)
		if isNull(v) {
			continue
		}
		copy(tup[pos:pos+sizeOf(v)], v)
		pos += sizeOf(v)
	}
	return tup
}
// trimNullSuffix returns the prefix of |values| that ends at its last non-nil
// element. The result shares the backing array of |values|; if every element
// is nil (or |values| is empty) the result has length zero.
func trimNullSuffix(values [][]byte) [][]byte {
	end := len(values)
	// Walk backwards over the run of nil entries at the tail.
	for end > 0 && values[end-1] == nil {
		end--
	}
	return values[:end]
}
// cloneTuple copies |tup| into a buffer of the same length obtained from
// |pool| and returns that buffer as a Tuple.
func cloneTuple(pool pool.BuffPool, tup Tuple) Tuple {
	dup := Tuple(pool.Get(uint64(len(tup))))
	copy(dup, tup)
	return dup
}
// allocateTuple obtains a buffer from |pool| sized for |bufSz| bytes of field
// data, the offsets needed for |fields| fields, and the trailing field count.
// It writes the field count and returns the buffer as |tup| together with
// |offs|, the sub-slice of |tup| that holds the offsets.
func allocateTuple(pool pool.BuffPool, bufSz ByteSize, fields int) (tup Tuple, offs offsets) {
	// The offsets region begins immediately after the field data.
	offEnd := bufSz + offsetsSize(fields)
	tup = pool.Get(uint64(offEnd + countSize))
	writeFieldCount(tup, fields)
	offs = offsets(tup[bufSz:offEnd])
	return tup, offs
}
// GetOffset returns the byte offset at which field |i| begins and whether the
// field is non-NULL. It returns (0, false) when |i| is at or beyond the
// Tuple's field count; otherwise the boolean is false when the field's start
// and end offsets are equal.
func (tup Tuple) GetOffset(i int) (int, bool) {
	fields := tup.Count()
	if i >= fields {
		return 0, false
	}

	// The value region ends uint16Size bytes per field before the end of the
	// Tuple; the offsets sit between there and the trailing count.
	total := ByteSize(len(tup))
	dataEnd := total - uint16Size*ByteSize(fields)
	table := tup[dataEnd : total-countSize]

	// Field i ends where the next field begins; the last field ends at dataEnd.
	end := uint16(dataEnd)
	if next := i * 2; next < len(table) {
		end = ReadUint16(table[next : next+2])
	}
	// Field 0 has no stored offset and begins at 0.
	begin := uint16(0)
	if i > 0 {
		prev := (i - 1) * 2
		begin = ReadUint16(table[prev : prev+2])
	}
	return int(begin), begin != end
}
// GetField returns the encoded bytes of field |i| as a sub-slice of the Tuple,
// or nil when the field is NULL. A field is NULL when |i| is at or beyond the
// Tuple's field count, or when its start and end offsets are equal.
func (tup Tuple) GetField(i int) []byte {
	fields := tup.Count()
	if i >= fields {
		return nil
	}

	// The value region ends uint16Size bytes per field before the end of the
	// Tuple; the offsets sit between there and the trailing count.
	total := ByteSize(len(tup))
	dataEnd := total - uint16Size*ByteSize(fields)
	table := tup[dataEnd : total-countSize]

	// Field i ends where the next field begins; the last field ends at dataEnd.
	hi := uint16(dataEnd)
	if next := i * 2; next < len(table) {
		hi = ReadUint16(table[next : next+2])
	}
	// Field 0 has no stored offset and begins at 0.
	lo := uint16(0)
	if i > 0 {
		prev := (i - 1) * 2
		lo = ReadUint16(table[prev : prev+2])
	}

	if lo == hi {
		return nil // NULL
	}
	return tup[lo:hi]
}
// FieldIsNull reports whether field |i| is NULL, i.e. whether GetField
// returns nil for it.
func (tup Tuple) FieldIsNull(i int) bool {
	field := tup.GetField(i)
	return field == nil
}
// Count returns the number of fields recorded in the Tuple, read from the
// final countSize bytes of the encoding.
func (tup Tuple) Count() int {
	tail := len(tup) - int(countSize)
	return int(ReadUint16(tup[tail:]))
}
// isNull reports whether |field| is a NULL value, which is represented by a
// nil slice. An empty but non-nil slice is not considered NULL here.
func isNull(field []byte) bool {
	return field == nil
}
// sizeOf returns the length of |val| in bytes, converted to ByteSize.
func sizeOf(val []byte) ByteSize {
	n := len(val)
	return ByteSize(n)
}
// writeFieldCount stores |count| as a uint16 in the final countSize bytes of
// |tup|.
func writeFieldCount(tup Tuple, count int) {
	tail := len(tup) - int(countSize)
	WriteUint16(tup[tail:], uint16(count))
}
// offsets is the region of a Tuple's buffer that holds its field offsets,
// two bytes per stored offset.
type offsets []byte
// offsetsSize returns the number of bytes needed to store the offsets for
// |count| fields. The first field has no stored offset, so a Tuple with
// |count| fields needs count-1 two-byte offsets; zero fields need none.
func offsetsSize(count int) ByteSize {
	if count == 0 {
		return 0
	}
	stored := count - 1
	return ByteSize(stored * 2)
}
// writeOffset records |off| as the starting offset of field |i| in |arr|.
// Field 0 has no slot, so the call is a no-op when |i| is zero; field i is
// stored in the two bytes at index (i-1)*2.
func writeOffset(i int, off ByteSize, arr offsets) {
	if i == 0 {
		return
	}
	slot := arr[(i-1)*2 : i*2]
	WriteUint16(slot, uint16(off))
}