// dolt/go/store/nbs/table_index.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
"bytes"
"encoding/binary"
"errors"
"io"
"os"
"sync"
"sync/atomic"

"github.com/dolthub/mmap-go"

"github.com/dolthub/dolt/go/libraries/utils/iohelp"
"github.com/dolthub/dolt/go/store/hash"
)
var (
ErrWrongBufferSize = errors.New("buffer length and/or capacity incorrect for chunkCount specified in footer")
ErrWrongCopySize = errors.New("could not copy enough bytes")
)
type tableIndex interface {
// ChunkCount returns the total number of chunks in the indexed file.
ChunkCount() uint32
// EntrySuffixMatches returns true if the entry at index |idx| matches
// the suffix of the address |h|. Used by |Lookup| after finding
// matching indexes based on |Prefixes|.
EntrySuffixMatches(idx uint32, h *addr) (bool, error)
// IndexEntry returns the |indexEntry| at |idx|. Optionally puts the
// full address of that entry in |a| if |a| is not |nil|.
IndexEntry(idx uint32, a *addr) (indexEntry, error)
// Lookup returns an |indexEntry| for the chunk corresponding to the
// provided address |h|. The second return value is |true| if an entry
// exists and |false| otherwise.
Lookup(h *addr) (indexEntry, bool, error)
// Ordinals returns a slice mapping each index entry to the position of
// its chunk in the indexed file: the |i|th value in the result is the
// file ordinal of the chunk described by the |i|th (prefix-sorted)
// index entry.
Ordinals() ([]uint32, error)
// Prefixes returns the sorted slice of |uint64| |addr| prefixes; each
// entry corresponds to an indexed chunk address.
Prefixes() ([]uint64, error)
// PrefixAt returns the prefix at the specified index.
PrefixAt(idx uint32) uint64
// TableFileSize returns the total size of the indexed table file, in bytes.
TableFileSize() uint64
// TotalUncompressedData returns the total uncompressed data size of
// the table file. Used for informational statistics only.
TotalUncompressedData() uint64
// Close releases any resources used by this tableIndex.
Close() error
// Clone returns a |tableIndex| with the same contents which can be
// |Close|d independently.
Clone() (tableIndex, error)
}
func ReadTableFooter(rd io.ReadSeeker) (chunkCount uint32, totalUncompressedData uint64, err error) {
footerSize := int64(magicNumberSize + uint64Size + uint32Size)
_, err = rd.Seek(-footerSize, io.SeekEnd)
if err != nil {
return 0, 0, err
}
footer, err := iohelp.ReadNBytes(rd, int(footerSize))
if err != nil {
return 0, 0, err
}
if string(footer[uint32Size+uint64Size:]) != magicNumber {
return 0, 0, ErrInvalidTableFile
}
chunkCount = binary.BigEndian.Uint32(footer)
totalUncompressedData = binary.BigEndian.Uint64(footer[uint32Size:])
return
}
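// For illustration, a footer that ReadTableFooter accepts mirrors the reads
// above: a big-endian uint32 chunk count, a big-endian uint64 total of
// uncompressed data, then the magic number. A minimal sketch using the same
// package constants (a hypothetical helper, not part of this file's API):
//
//	func writeTableFooter(w io.Writer, chunkCount uint32, totalUncompressed uint64) error {
//		footer := make([]byte, uint32Size+uint64Size+magicNumberSize)
//		binary.BigEndian.PutUint32(footer, chunkCount)
//		binary.BigEndian.PutUint64(footer[uint32Size:], totalUncompressed)
//		copy(footer[uint32Size+uint64Size:], magicNumber)
//		_, err := w.Write(footer)
//		return err
//	}
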
func indexMemSize(chunkCount uint32) uint64 {
is := indexSize(chunkCount) + footerSize
// Extra space is required for the offsets that do not fit into the region
// where the lengths were previously stored; see newOnHeapTableIndex.
is += uint64(offsetSize * (chunkCount - chunkCount/2))
return is
}
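// As a worked example of the accounting above: for chunkCount = 10, quota is
// indexSize(10) bytes for the stored index, footerSize bytes for the footer,
// plus offsetSize*(10-10/2) = offsetSize*5 extra bytes, because only the
// second half of the chunk offsets can be rewritten in place over the old
// lengths region (see newOnHeapTableIndex).
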
// parseTableIndex parses a valid nbs tableIndex from a byte stream. |buff|
// must end with an NBS index and footer, and its length must match the
// expected indexSize for the chunkCount specified in the footer. The returned
// index retains |buff| and computes on it in place; the only new allocation
// is for the first offsets buffer.
func parseTableIndex(buff []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
chunkCount, totalUncompressedData, err := ReadTableFooter(bytes.NewReader(buff))
if err != nil {
return onHeapTableIndex{}, err
}
buff, err = removeFooter(buff, chunkCount)
if err != nil {
return onHeapTableIndex{}, err
}
chunks2 := chunkCount / 2
chunks1 := chunkCount - chunks2
offsetsBuff1 := make([]byte, chunks1*offsetSize)
return newOnHeapTableIndex(buff, offsetsBuff1, chunkCount, totalUncompressedData, q)
}
// parseTableIndexWithOffsetBuff is similar to parseTableIndex, except that it
// uses the given |offsetsBuff1| instead of allocating the additional space.
func parseTableIndexWithOffsetBuff(buff []byte, offsetsBuff1 []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
chunkCount, totalUncompressedData, err := ReadTableFooter(bytes.NewReader(buff))
if err != nil {
return onHeapTableIndex{}, err
}
buff, err = removeFooter(buff, chunkCount)
if err != nil {
return onHeapTableIndex{}, err
}
return newOnHeapTableIndex(buff, offsetsBuff1, chunkCount, totalUncompressedData, q)
}
func removeFooter(p []byte, chunkCount uint32) (out []byte, err error) {
iS := indexSize(chunkCount) + footerSize
if uint64(len(p)) != iS {
return nil, ErrWrongBufferSize
}
out = p[:len(p)-footerSize]
return
}
// parseTableIndexByCopy reads the footer, copies indexSize(chunkCount) bytes,
// and parses an on-heap table index. Useful to create an onHeapTableIndex
// without retaining the entire underlying array of data.
func parseTableIndexByCopy(buff []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
r := bytes.NewReader(buff)
return ReadTableIndexByCopy(r, q)
}
// ReadTableIndexByCopy loads an index into memory from an io.ReadSeeker.
// Caution: it allocates new memory for the entire index.
func ReadTableIndexByCopy(rd io.ReadSeeker, q MemoryQuotaProvider) (onHeapTableIndex, error) {
chunkCount, totalUncompressedData, err := ReadTableFooter(rd)
if err != nil {
return onHeapTableIndex{}, err
}
iS := int64(indexSize(chunkCount))
_, err = rd.Seek(-(iS + footerSize), io.SeekEnd)
if err != nil {
return onHeapTableIndex{}, err
}
buff := make([]byte, iS)
_, err = io.ReadFull(rd, buff)
if err != nil {
return onHeapTableIndex{}, err
}
chunks2 := chunkCount / 2
chunks1 := chunkCount - chunks2
offsets1Buff := make([]byte, chunks1*offsetSize)
return newOnHeapTableIndex(buff, offsets1Buff, chunkCount, totalUncompressedData, q)
}
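// A minimal usage sketch (|path| is hypothetical; the no-op quota provider is
// the one this package uses internally elsewhere in this file):
//
//	f, err := os.Open(path)
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	idx, err := ReadTableIndexByCopy(f, &noopQuotaProvider{})
//	if err != nil {
//		return err
//	}
//	defer idx.Close()
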
type onHeapTableIndex struct {
q MemoryQuotaProvider
refCnt *int32
tableFileSize uint64
// Tuple bytes
tupleB []byte
// Offset bytes
offsetB1 []byte
offsetB2 []byte
// Suffix bytes
suffixB []byte
chunkCount uint32
totalUncompressedData uint64
}
var _ tableIndex = &onHeapTableIndex{}
// newOnHeapTableIndex converts a table file index with stored lengths on
// |indexBuff| into an index with stored offsets. Since offsets are twice the
// size of a length, we need to allocate additional space to store all the
// offsets. It stores the first n - n/2 offsets in |offsetsBuff1| (the
// additional space) and the rest into the region of |indexBuff| previously
// occupied by lengths. |onHeapTableIndex| computes directly on the given
// |indexBuff| and |offsetsBuff1| buffers.
func newOnHeapTableIndex(indexBuff []byte, offsetsBuff1 []byte, chunkCount uint32, totalUncompressedData uint64, q MemoryQuotaProvider) (onHeapTableIndex, error) {
tuples := indexBuff[:prefixTupleSize*chunkCount]
lengths := indexBuff[prefixTupleSize*chunkCount : prefixTupleSize*chunkCount+lengthSize*chunkCount]
suffixes := indexBuff[prefixTupleSize*chunkCount+lengthSize*chunkCount:]
chunks2 := chunkCount / 2
lR := bytes.NewReader(lengths)
r := NewOffsetsReader(lR)
_, err := io.ReadFull(r, offsetsBuff1)
if err != nil {
return onHeapTableIndex{}, err
}
var offsetsBuff2 []byte
if chunks2 > 0 {
offsetsBuff2 = lengths[:chunks2*offsetSize]
_, err = io.ReadFull(r, offsetsBuff2)
if err != nil {
return onHeapTableIndex{}, err
}
}
refCnt := new(int32)
*refCnt = 1
return onHeapTableIndex{
refCnt: refCnt,
q: q,
tupleB: tuples,
offsetB1: offsetsBuff1,
offsetB2: offsetsBuff2,
suffixB: suffixes,
chunkCount: chunkCount,
totalUncompressedData: totalUncompressedData,
}, nil
}
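// To make the split concrete, assuming the usual 4-byte lengths and 8-byte
// offsets: with chunkCount = 5, the lengths region holds five uint32 lengths
// (20 bytes), while the five uint64 offsets need 40 bytes. The first
// 5 - 5/2 = 3 offsets (24 bytes) land in the separately allocated
// |offsetsBuff1|, and the remaining 5/2 = 2 offsets (16 bytes) fit back
// inside the 20-byte lengths region. NewOffsetsReader is what converts the
// stream of lengths into the stream of cumulative end offsets consumed above.
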
func (ti onHeapTableIndex) ChunkCount() uint32 {
return ti.chunkCount
}
func (ti onHeapTableIndex) PrefixAt(idx uint32) uint64 {
return ti.prefixAt(idx)
}
func (ti onHeapTableIndex) EntrySuffixMatches(idx uint32, h *addr) (bool, error) {
ord := ti.ordinalAt(idx)
o := ord * addrSuffixSize
b := ti.suffixB[o : o+addrSuffixSize]
return bytes.Equal(h[addrPrefixSize:], b), nil
}
func (ti onHeapTableIndex) IndexEntry(idx uint32, a *addr) (entry indexEntry, err error) {
prefix, ord := ti.tupleAt(idx)
if a != nil {
binary.BigEndian.PutUint64(a[:], prefix)
o := int64(addrSuffixSize * ord)
b := ti.suffixB[o : o+addrSuffixSize]
copy(a[addrPrefixSize:], b)
}
return ti.getIndexEntry(ord), nil
}
func (ti onHeapTableIndex) getIndexEntry(ord uint32) indexEntry {
var prevOff uint64
if ord == 0 {
prevOff = 0
} else {
prevOff = ti.offsetAt(ord - 1)
}
ordOff := ti.offsetAt(ord)
length := uint32(ordOff - prevOff)
return indexResult{
o: prevOff,
l: length,
}
}
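// For example, if a table holds three chunks of 10, 20 and 30 bytes, the
// stored offsets are the cumulative end positions 10, 30 and 60, and
// getIndexEntry(1) returns {o: 10, l: 20}: the previous chunk's end offset
// and the difference to this chunk's end offset.
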
func (ti onHeapTableIndex) Lookup(h *addr) (indexEntry, bool, error) {
ord, err := ti.lookupOrdinal(h)
if err != nil {
return indexResult{}, false, err
}
if ord == ti.chunkCount {
return indexResult{}, false, nil
}
return ti.getIndexEntry(ord), true, nil
}
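// A consumption sketch for Lookup (hypothetical address |a| and io.ReaderAt
// |rd| over the table file):
//
//	entry, ok, err := ti.Lookup(&a)
//	if err != nil {
//		return err
//	}
//	if ok {
//		buf := make([]byte, entry.Length())
//		_, err = rd.ReadAt(buf, int64(entry.Offset()))
//	}
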
// lookupOrdinal returns the ordinal of |h| if present. Returns |ti.chunkCount|
// if absent.
func (ti onHeapTableIndex) lookupOrdinal(h *addr) (uint32, error) {
prefix := h.Prefix()
for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixAt(idx) == prefix; idx++ {
m, err := ti.EntrySuffixMatches(idx, h)
if err != nil {
return ti.chunkCount, err
}
if m {
return ti.ordinalAt(idx), nil
}
}
return ti.chunkCount, nil
}
// prefixIdx returns the first position in the prefix tuples whose value is
// >= |prefix|, i.e. the position at which |prefix| would be inserted; it
// returns |ti.chunkCount| only when |prefix| is greater than every stored
// prefix. Callers check the prefix at the returned position for equality.
func (ti onHeapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
// an extremely tight loop and inlining the code was a significant perf improvement.
idx, j := 0, ti.chunkCount
for idx < j {
h := idx + (j-idx)/2 // avoid overflow when computing h
// i ≤ h < j
if ti.prefixAt(h) < prefix {
idx = h + 1 // preserves f(i-1) == false
} else {
j = h // preserves f(j) == true
}
}
return
}
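// For example, over the sorted prefixes [3, 5, 5, 9], prefixIdx(5) returns 1,
// the first of the two equal entries, and prefixIdx(6) returns 3; the caller
// in lookupOrdinal then stops immediately because the prefix stored there
// differs from 6.
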
func (ti onHeapTableIndex) tupleAt(idx uint32) (prefix uint64, ord uint32) {
off := int64(prefixTupleSize * idx)
b := ti.tupleB[off : off+prefixTupleSize]
prefix = binary.BigEndian.Uint64(b[:])
ord = binary.BigEndian.Uint32(b[addrPrefixSize:])
return prefix, ord
}
func (ti onHeapTableIndex) prefixAt(idx uint32) uint64 {
off := int64(prefixTupleSize * idx)
b := ti.tupleB[off : off+addrPrefixSize]
return binary.BigEndian.Uint64(b)
}
func (ti onHeapTableIndex) ordinalAt(idx uint32) uint32 {
off := int64(prefixTupleSize*idx) + addrPrefixSize
b := ti.tupleB[off : off+ordinalSize]
return binary.BigEndian.Uint32(b)
}
// The first n - n/2 offsets are stored in |offsetB1| and the rest in |offsetB2|.
func (ti onHeapTableIndex) offsetAt(ord uint32) uint64 {
chunks1 := ti.chunkCount - ti.chunkCount/2
var b []byte
if ord < chunks1 {
off := int64(offsetSize * ord)
b = ti.offsetB1[off : off+offsetSize]
} else {
off := int64(offsetSize * (ord - chunks1))
b = ti.offsetB2[off : off+offsetSize]
}
return binary.BigEndian.Uint64(b)
}
func (ti onHeapTableIndex) Ordinals() ([]uint32, error) {
o := make([]uint32, ti.chunkCount)
for i, off := uint32(0), 0; i < ti.chunkCount; i, off = i+1, off+prefixTupleSize {
b := ti.tupleB[off+addrPrefixSize : off+prefixTupleSize]
o[i] = binary.BigEndian.Uint32(b)
}
return o, nil
}
func (ti onHeapTableIndex) Prefixes() ([]uint64, error) {
p := make([]uint64, ti.chunkCount)
for i, off := uint32(0), 0; i < ti.chunkCount; i, off = i+1, off+prefixTupleSize {
b := ti.tupleB[off : off+addrPrefixSize]
p[i] = binary.BigEndian.Uint64(b)
}
return p, nil
}
func (ti onHeapTableIndex) hashAt(idx uint32) hash.Hash {
// Get tuple
off := int64(prefixTupleSize * idx)
tuple := ti.tupleB[off : off+prefixTupleSize]
// Get prefix, ordinal, and suffix
prefix := tuple[:addrPrefixSize]
ord := binary.BigEndian.Uint32(tuple[addrPrefixSize:]) * addrSuffixSize
suffix := ti.suffixB[ord : ord+addrSuffixSize] // suffix is 12 bytes
// Combine prefix and suffix to get hash
buf := [hash.ByteLen]byte{}
copy(buf[:addrPrefixSize], prefix)
copy(buf[addrPrefixSize:], suffix)
return buf
}
// prefixIdxLBound returns the first position in the prefix tuples whose value
// is >= |prefix|; if |prefix| is absent, that is the position where it would
// be inserted.
func (ti onHeapTableIndex) prefixIdxLBound(prefix uint64) uint32 {
l, r := uint32(0), ti.chunkCount
for l < r {
m := l + (r-l)/2 // find middle, rounding down
if ti.prefixAt(m) < prefix {
l = m + 1
} else {
r = m
}
}
return l
}
// prefixIdxUBound returns the last position in the prefix tuples whose value
// is <= |prefix|, or |ti.chunkCount| when every stored prefix qualifies.
// ResolveShortHash uses the result as an exclusive bound for its scan.
func (ti onHeapTableIndex) prefixIdxUBound(prefix uint64) (idx uint32) {
l, r := uint32(0), ti.chunkCount
for l < r {
m := l + (r-l+1)/2 // find middle, rounding up
if m >= ti.chunkCount { // prevent index out of bounds
return r
}
pre := ti.prefixAt(m)
if pre <= prefix {
l = m
} else {
r = m - 1
}
}
return l
}
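// padStringAndDecode pads the base32 short hash |s| out to 16 characters with
// the pad character |p| and decodes it via the package-level nbs base32
// |encoding|, returning the first eight decoded bytes as a uint64 prefix.
// Padding with "0" (the smallest character in the alphabet) appends, yielding
// the smallest prefix with that leading string; any other pad character is
// prepended.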
func (ti onHeapTableIndex) padStringAndDecode(s string, p string) uint64 {
// Pad string
if p == "0" {
for i := len(s); i < 16; i++ {
s = s + p
}
} else {
for i := len(s); i < 16; i++ {
s = p + s
}
}
// Decode using the package-level nbs base32 |encoding|; 16 characters decode
// to 10 bytes, of which the first 8 form the prefix.
h, _ := encoding.DecodeString(s)
return binary.BigEndian.Uint64(h)
}
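// For example, assuming the nbs base32 alphabet "0123456789abcdefghijklmnopqrstuv",
// the short hash "g8" padded with "0" becomes "g800000000000000", the smallest
// 16-character hash string starting with "g8", so its decoded prefix is a
// tight lower bound for the range ResolveShortHash scans.
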
func (ti onHeapTableIndex) ResolveShortHash(short []byte) ([]string, error) {
// Convert to string
shortHash := string(short)
// Calculate length
sLen := len(shortHash)
// Find lower and upper bounds of prefix indexes to check
var pIdxL, pIdxU uint32
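// 13 base32 characters carry 65 bits, enough to determine the full 8-byte
// prefix exactly, so an exact prefix search suffices.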
if sLen >= 13 {
// Convert short string to prefix
sPrefix := ti.padStringAndDecode(shortHash, "0")
// Binary Search for prefix
pIdxL = ti.prefixIdx(sPrefix)
// Prefix doesn't exist
if pIdxL == ti.chunkCount {
return []string{}, errors.New("can't find prefix")
}
// Find last equal, guarding against running off the end of the index
pIdxU = pIdxL + 1
for pIdxU < ti.chunkCount && sPrefix == ti.prefixAt(pIdxU) {
pIdxU++
}
} else {
// Convert short string to lower and upper bounds
sPrefixL := ti.padStringAndDecode(shortHash, "0")
sPrefixU := ti.padStringAndDecode(shortHash, "v")
// Binary search for lower and upper bounds
pIdxL = ti.prefixIdxLBound(sPrefixL)
pIdxU = ti.prefixIdxUBound(sPrefixU)
}
// Go through all equal prefixes
var res []string
for i := pIdxL; i < pIdxU; i++ {
// Get full hash at index
h := ti.hashAt(i)
// Convert to string representation
hashStr := h.String()
// If it matches append to result
if hashStr[:sLen] == shortHash {
res = append(res, hashStr)
}
}
return res, nil
}
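// A usage sketch (hypothetical short hash; a 20-byte address renders as 32
// base32 characters):
//
//	matches, err := ti.ResolveShortHash([]byte("g8h1"))
//	if err != nil {
//		return err // no stored prefix matches
//	}
//	for _, full := range matches {
//		fmt.Println(full)
//	}
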
// TableFileSize returns the size of the table file that this index references.
// This assumes that the index follows immediately after the last chunk in the
// file and that the last chunk in the file is in the index.
func (ti onHeapTableIndex) TableFileSize() uint64 {
if ti.chunkCount == 0 {
return footerSize
}
entry := ti.getIndexEntry(ti.chunkCount - 1)
offset, len := entry.Offset(), uint64(entry.Length())
return offset + len + indexSize(ti.chunkCount) + footerSize
}
func (ti onHeapTableIndex) TotalUncompressedData() uint64 {
return ti.totalUncompressedData
}
func (ti onHeapTableIndex) Close() error {
cnt := atomic.AddInt32(ti.refCnt, -1)
if cnt == 0 {
ti.tupleB = nil
ti.offsetB1 = nil
ti.offsetB2 = nil
ti.suffixB = nil
return ti.q.ReleaseQuota(indexMemSize(ti.chunkCount))
}
if cnt < 0 {
panic("Close() called and reduced ref count to < 0.")
}
return nil
}
func (ti onHeapTableIndex) Clone() (tableIndex, error) {
cnt := atomic.AddInt32(ti.refCnt, 1)
if cnt == 1 {
panic("Clone() called after last Close(). This index is no longer valid.")
}
return ti, nil
}
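// Clone and Close pair through the shared reference count: every Clone must
// be balanced by one Close, and quota is released only when the final
// reference closes. For example:
//
//	idx2, err := idx.Clone() // refCnt: 1 -> 2
//	if err != nil {
//		return err
//	}
//	_ = idx.Close()  // refCnt: 2 -> 1; buffers remain valid via idx2
//	_ = idx2.Close() // refCnt: 1 -> 0; quota released
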
// mmapTableIndex is an onHeapTableIndex whose slice buffers are allocated
// from mmap. It overrides Clone and Close of onHeapTableIndex so that it can
// count references and release the mmapped region appropriately.
type mmapTableIndex struct {
onHeapTableIndex
refCnt *int32
q MemoryQuotaProvider
mmapped mmapWStat
indexDataBuff []byte
offset1DataBuff []byte
}
// newMmapTableIndex mmaps a region of memory large enough to store a fully
// parsed onHeapTableIndex. After creating the mmapTableIndex, index data should
// be loaded into |indexDataBuff| and then parsed with parseIndexBuffer.
func newMmapTableIndex(chunkCount uint32) (*mmapTableIndex, error) {
idxSize := int(indexSize(chunkCount) + footerSize)
chunks2 := chunkCount / 2
chunks1 := chunkCount - chunks2
offsets1Size := int(chunks1 * offsetSize)
mmapped, err := mmapWithStats(nil, idxSize+offsets1Size, mmap.RDWR, mmap.ANON, 0)
if err != nil {
return nil, err
}
indexBytesBuff := mmapped.m[:idxSize]
offsets1Buff := mmapped.m[idxSize : idxSize+offsets1Size]
refCnt := new(int32)
*refCnt = 1
return &mmapTableIndex{
refCnt: refCnt,
mmapped: mmapped,
indexDataBuff: indexBytesBuff,
offset1DataBuff: offsets1Buff}, nil
}
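// The intended flow, per the comment above (sketch; |rd| is a hypothetical
// reader positioned at the start of the index-plus-footer region):
//
//	ti, err := newMmapTableIndex(chunkCount)
//	if err != nil {
//		return err
//	}
//	if _, err = io.ReadFull(rd, ti.indexDataBuff); err != nil {
//		return err
//	}
//	if err = ti.parseIndexBuffer(q); err != nil {
//		return err
//	}
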
func (ti *mmapTableIndex) Clone() (tableIndex, error) {
cnt := atomic.AddInt32(ti.refCnt, 1)
if cnt == 1 {
panic("Clone() called after last Close(). This index is no longer valid.")
}
return ti, nil
}
// Close closes the underlying onHeapTableIndex and then unmaps the memory
// region.
func (ti *mmapTableIndex) Close() error {
cnt := atomic.AddInt32(ti.refCnt, -1)
if cnt == 0 {
chunkCount := ti.chunkCount
// mmapTableIndex sets the quota provider for onHeapTableIndex to a
// noopQuotaProvider, so that we can release quota after the memory region
// is unmapped.
err := ti.onHeapTableIndex.Close()
if err != nil {
return err
}
ti.indexDataBuff = nil
ti.offset1DataBuff = nil
err = ti.mmapped.Unmap()
if err != nil {
return err
}
err = ti.q.ReleaseQuota(indexMemSize(chunkCount))
if err != nil {
return err
}
}
if cnt < 0 {
panic("Close() called and reduced ref count to < 0.")
}
return nil
}
func (ti *mmapTableIndex) parseIndexBuffer(q MemoryQuotaProvider) (err error) {
ti.onHeapTableIndex, err = parseTableIndexWithOffsetBuff(ti.indexDataBuff, ti.offset1DataBuff, &noopQuotaProvider{})
ti.q = q
return err
}
type notifyFunc func(n uint64, total uint64)
var noOpNotify = func(uint64, uint64) {}
type mmapStats struct {
mu sync.Mutex
totalUsed uint64
WillMmap notifyFunc
Mmapped notifyFunc
UnMapped notifyFunc
}
var GlobalMmapStats = &mmapStats{
sync.Mutex{},
0,
noOpNotify,
noOpNotify,
noOpNotify,
}
type mmapWStat struct {
m mmap.MMap
used uint64
}
func mmapWithStats(f *os.File, length int, prot, flags int, offset int64) (mmapWStat, error) {
GlobalMmapStats.mu.Lock()
defer GlobalMmapStats.mu.Unlock()
GlobalMmapStats.WillMmap(uint64(length), GlobalMmapStats.totalUsed)
m, err := mmap.MapRegion(f, length, prot, flags, offset)
if err != nil {
return mmapWStat{}, err
}
GlobalMmapStats.totalUsed += uint64(length)
GlobalMmapStats.Mmapped(uint64(length), GlobalMmapStats.totalUsed)
return mmapWStat{m, uint64(length)}, nil
}
func (m mmapWStat) Unmap() error {
GlobalMmapStats.mu.Lock()
defer GlobalMmapStats.mu.Unlock()
err := m.m.Unmap()
if err != nil {
return err
}
GlobalMmapStats.totalUsed -= m.used
GlobalMmapStats.UnMapped(m.used, GlobalMmapStats.totalUsed)
return nil
}
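// GlobalMmapStats is the hook point for observing mmap usage. A sketch of
// wiring in a logger (hypothetical; assign before any tables are opened,
// since the callbacks are invoked under GlobalMmapStats.mu):
//
//	GlobalMmapStats.Mmapped = func(n, total uint64) {
//		log.Printf("mmapped %d bytes; %d bytes in use", n, total)
//	}
//	GlobalMmapStats.UnMapped = func(n, total uint64) {
//		log.Printf("unmapped %d bytes; %d bytes in use", n, total)
//	}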