// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"encoding/binary"
	"io"
	"sort"
	"sync"

	"github.com/attic-labs/noms/go/chunks"
	"github.com/attic-labs/noms/go/d"
	"github.com/attic-labs/noms/go/hash"
	"github.com/golang/snappy"
)

// tableIndex is the in-memory form of a table's index: |prefixes| is sorted ascending, and
// |ordinals[i]| maps the i'th sorted prefix back to that chunk's insertion-order ordinal.
// |offsets|, |lengths| and |suffixes| are indexed by ordinal (addrSuffixSize bytes of suffix
// per chunk).
type tableIndex struct {
	chunkCount        uint32
	prefixes, offsets []uint64
	lengths, ordinals []uint32
	suffixes          []byte
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
type tableReader struct {
	tableIndex
	r         io.ReaderAt
	blockSize uint64
}

// parses a valid nbs tableIndex from a byte stream. |buff| must end with an NBS index and
// footer, though it may contain an unspecified number of bytes before that data. |tableIndex|
// doesn't keep alive any references to |buff|.
func parseTableIndex(buff []byte) tableIndex {
	pos := uint64(len(buff))

	// footer
	pos -= magicNumberSize
	d.Chk.True(string(buff[pos:]) == magicNumber)

	// skip total chunk data
	pos -= uint64Size

	pos -= uint32Size
	chunkCount := binary.BigEndian.Uint32(buff[pos:])

	// index
	suffixesSize := uint64(chunkCount) * addrSuffixSize
	pos -= suffixesSize
	suffixes := make([]byte, suffixesSize)
	copy(suffixes, buff[pos:])

	lengthsSize := uint64(chunkCount) * lengthSize
	pos -= lengthsSize
	lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize])

	tuplesSize := uint64(chunkCount) * prefixTupleSize
	pos -= tuplesSize
	prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize])

	return tableIndex{
		chunkCount,
		prefixes, offsets,
		lengths, ordinals,
		suffixes,
	}
}
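
// For reference: parseTableIndex walks the tail of |buff| backwards, which implies the
// following forward layout for the region it consumes (an illustrative summary of the parsing
// above, with sizes taken from the constants it uses; not new format guarantees):
//
//   prefix tuples: chunkCount * prefixTupleSize bytes, each a big-endian uint64 address prefix
//                  followed by a big-endian uint32 ordinal (insertion-order position)
//   lengths:       chunkCount * lengthSize bytes, one big-endian uint32 chunk length each
//   suffixes:      chunkCount * addrSuffixSize bytes of address suffixes
//   footer:        a uint32 chunk count, a uint64 total-chunk-data field (skipped above),
//                  and the trailing magicNumber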

func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) {
	lengths = make([]uint32, count)
	offsets = make([]uint64, count)

	lengths[0] = binary.BigEndian.Uint32(buff)

	for i := uint64(1); i < uint64(count); i++ {
		lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:])
		offsets[i] = offsets[i-1] + uint64(lengths[i-1])
	}
	return
}

func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) {
	prefixes = make([]uint64, count)
	ordinals = make([]uint32, count)

	for i := uint64(0); i < uint64(count); i++ {
		idx := i * prefixTupleSize
		prefixes[i] = binary.BigEndian.Uint64(buff[idx:])
		ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:])
	}
	return
}

func (ti tableIndex) prefixIdxToOrdinal(idx uint32) uint32 {
	return ti.ordinals[idx]
}

// returns the first position in |ti.prefixes| whose value is >= |prefix| (a lower bound).
// If |prefix| is greater than every stored prefix, returns |ti.chunkCount|; callers must still
// verify that the prefix at the returned position actually matches.
func (ti tableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if ti.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}

	return
}
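
// For example (illustrative values only): with ti.prefixes == {2, 5, 5, 9} and
// ti.chunkCount == 4, prefixIdx(5) == 1, prefixIdx(6) == 3, and prefixIdx(10) == 4
// (i.e. ti.chunkCount).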

// Return true IFF the suffix at insertion order |ordinal| matches the address |h|.
func (ti tableIndex) ordinalSuffixMatches(ordinal uint32, h addr) bool {
	li := uint64(ordinal) * addrSuffixSize
	return bytes.Compare(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize]) == 0
}

// returns the ordinal of |h| if present. returns |ti.chunkCount| if absent
func (ti tableIndex) lookupOrdinal(h addr) uint32 {
	prefix := h.Prefix()

	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ {
		ordinal := ti.prefixIdxToOrdinal(idx)
		if ti.ordinalSuffixMatches(ordinal, h) {
			return ordinal
		}
	}

	return ti.chunkCount
}

// newTableReader wraps a parsed tableIndex and a reader over the table's bytes. |r| should
// allow retrieving any desired range of bytes from the table.
func newTableReader(index tableIndex, r io.ReaderAt, blockSize uint64) tableReader {
	return tableReader{index, r, blockSize}
}

// Scan across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (remaining bool) {
	// TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs))

	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))

	for i, addr := range addrs {
		if addr.has {
			continue
		}

		for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true
			return
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *addr.a) {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return
}
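
// Note: because both |addrs| and |tr.prefixes| are ordered by prefix, the scan above is a single
// forward merge — |filterIdx| only ever advances — so each call does roughly linear work in
// len(addrs) + chunkCount rather than a binary search per address.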

func (tr tableReader) count() uint32 {
	return tr.chunkCount
}

// returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) bool {
	ordinal := tr.lookupOrdinal(h)
	return ordinal < tr.count()
}

// returns the chunk data associated with |h|, iff present. Returns nil if absent. On success,
// the returned byte slice contains the decompressed chunk data.
func (tr tableReader) get(h addr) (data []byte) {
	ordinal := tr.lookupOrdinal(h)
	if ordinal == tr.count() {
		return
	}

	offset := tr.offsets[ordinal]
	length := uint64(tr.lengths[ordinal])
	buff := make([]byte, length) // TODO: Avoid this allocation for every get
	n, err := tr.r.ReadAt(buff, int64(offset))
	d.Chk.NoError(err)
	d.Chk.True(n == int(length))
	data = tr.parseChunk(buff)
	d.Chk.True(data != nil)

	return
}

type offsetRec struct {
	a       *addr
	ordinal uint32
	offset  uint64
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

// readAtOffsets performs one physical read covering [readStart, readEnd), parses each chunk
// described by |offsets| out of the resulting buffer, and sends it to |foundChunks|.
func (tr tableReader) readAtOffsets(readStart, readEnd uint64, reqs []getRecord, offsets offsetRecSlice, foundChunks chan *chunks.Chunk, wg *sync.WaitGroup) {
	readLength := readEnd - readStart
	buff := make([]byte, readLength)
	n, err := tr.r.ReadAt(buff, int64(readStart))
	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == readLength)

	for _, rec := range offsets {
		d.Chk.True(rec.offset >= readStart)
		localStart := rec.offset - readStart
		localEnd := localStart + uint64(tr.lengths[rec.ordinal])
		d.Chk.True(localEnd <= readLength)
		data := tr.parseChunk(buff[localStart:localEnd])
		c := chunks.NewChunkWithHash(hash.Hash(*rec.a), data)
		foundChunks <- &c
	}

	wg.Done()
}

// getMany retrieves multiple stored blocks and optimizes by attempting to read in larger physical
// blocks which contain multiple stored blocks. Found chunks are streamed back to the caller on
// |foundChunks|. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(reqs []getRecord, foundChunks chan *chunks.Chunk, wg *sync.WaitGroup) (remaining bool) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	var offsetRecords offsetRecSlice
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must be searched (note
	// that there may be duplicates of a particular location). Sort by offset and scan forward,
	// grouping sequences of reads into large physical reads.
	sort.Sort(offsetRecords)

	var batch offsetRecSlice
	var readStart, readEnd uint64

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if batch == nil {
			batch = make(offsetRecSlice, 1)
			batch[0] = offsetRecords[i]
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, tr.blockSize); canRead {
			batch = append(batch, rec)
			readEnd = newReadEnd
			i++
			continue
		}

		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg)
		batch = nil
	}

	if batch != nil {
		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg)
		batch = nil
	}

	return
}
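
// exampleGetMany is an illustrative sketch (not part of the original file) of how a caller might
// drain the chunks that getMany streams back. It assumes |reqs| is already sorted by address
// prefix, as getMany requires.
func exampleGetMany(tr tableReader, reqs []getRecord) (found []*chunks.Chunk, remaining bool) {
	foundChunks := make(chan *chunks.Chunk, len(reqs))
	wg := &sync.WaitGroup{}

	remaining = tr.getMany(reqs, foundChunks, wg)

	// getMany calls wg.Add for each read goroutine it spawns; close the channel once they have
	// all finished sending so the range below terminates.
	go func() {
		wg.Wait()
		close(foundChunks)
	}()

	for c := range foundChunks {
		found = append(found, c)
	}
	return found, remaining
}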

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by address) to build the set
// of table locations which must be read in order to find each chunk specified by |reqs|. If this
// table contains all requested chunks, |remaining| is false upon return; otherwise it is true.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// advance within the prefixes until we reach one which is >= req.prefix
		for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *req.a) {
				reqs[i].found = true
				ors = append(ors, offsetRec{req.a, tr.ordinals[j], tr.offsets[tr.ordinals[j]]})
			}
		}
	}
	return ors, remaining
}

func canReadAhead(fRec offsetRec, fLength uint32, readStart, readEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < readEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return readEnd, true
	}

	if fRec.offset-readEnd > blockSize {
		return readEnd, false
	}

	return fRec.offset + uint64(fLength), true
}
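
// For example (illustrative numbers): with blockSize == 4096 and chunks at offsets 0 (length 100),
// 100 (length 200), and 5000 (length 100), the first two coalesce into a single physical read of
// [0, 300) because the gap between them is <= blockSize, while the third starts a new read since
// 5000 - 300 > 4096.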

// parseChunk decodes the snappy-compressed chunk record in |buff| and verifies its trailing
// checksum against the decompressed data.
func (tr tableReader) parseChunk(buff []byte) []byte {
	dataLen := uint64(len(buff)) - checksumSize
	data, err := snappy.Decode(nil, buff[:dataLen])
	d.Chk.NoError(err)
	buff = buff[dataLen:]

	chksum := binary.BigEndian.Uint32(buff)
	d.Chk.True(chksum == crc(data))

	return data
}
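
// As implied by the decoding above, each physical chunk record is the snappy-compressed chunk
// payload followed by a big-endian uint32 checksum (checksumSize bytes) computed over the
// decompressed payload.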

// calcReads computes the number of physical reads required to retrieve the elements of |reqs|
// present in this table, batching reads with the same coalescing rules as getMany.
func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find all the
	// elements of |reqs| which are present in this table.
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Sort by offset and scan forward, grouping sequences of reads
	// into large physical reads.
	sort.Sort(offsetRecords)

	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

// extract sends every chunk in the table to |chunks|, in either forward or reverse insertion
// order per |order|.
func (tr tableReader) extract(order EnumerationOrder, chunks chan<- extractRecord) {
	// Build reverse lookup table from ordinal -> chunk hash
	hashes := make(addrSlice, len(tr.prefixes))
	for idx, prefix := range tr.prefixes {
		ordinal := tr.prefixIdxToOrdinal(uint32(idx))
		binary.BigEndian.PutUint64(hashes[ordinal][:], prefix)
		li := uint64(ordinal) * addrSuffixSize
		copy(hashes[ordinal][addrPrefixSize:], tr.suffixes[li:li+addrSuffixSize])
	}
	chunkLen := tr.offsets[tr.chunkCount-1] + uint64(tr.lengths[tr.chunkCount-1])
	buff := make([]byte, chunkLen)
	n, err := tr.r.ReadAt(buff, int64(tr.offsets[0]))
	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == chunkLen)

	sendChunk := func(i uint32) {
		localOffset := tr.offsets[i] - tr.offsets[0]
		chunks <- extractRecord{hashes[i], tr.parseChunk(buff[localOffset : localOffset+uint64(tr.lengths[i])])}
	}

	if order == ReverseOrder {
		for i := uint32(1); i <= tr.chunkCount; i++ {
			sendChunk(tr.chunkCount - i)
		}
		return
	}
	for i := uint32(0); i < tr.chunkCount; i++ {
		sendChunk(i)
	}
}