dolt/go/nbs/table_reader.go
cmasone-attic 14e95379af NBS: Fragmentation tool using new estimate of locality (#3658)
The new version of this tool estimates the locality of a DB
written using the "grandchild" strategy implemented by
types.ValueStore. It does so by dividing each level of the graph
into groups that are roughly the size of that level's branching
factor, and then calculating how many physical reads are needed
to read each group.

In the case of perfect locality, each group could be read in a
single physical read, so that's what the tool uses as its estimate
of the optimal case.

Toward #2968
2017-09-11 15:34:17 -07:00
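
As a rough illustration of the estimate described above, here is a minimal,
hypothetical sketch (chunkLoc, physicalReads, and the 4096-byte block size are
stand-ins for illustration, not the tool's actual code): the optimal case
charges one physical read per group, while the estimate counts coalesced reads
within each group using the same read-ahead rule as canReadAhead in
table_reader.go below.

package main

import (
	"fmt"
	"sort"
)

// chunkLoc is a hypothetical chunk location within a table file, mirroring
// the shape of nbs's offsetRec for illustration only.
type chunkLoc struct {
	offset, length uint64
}

// physicalReads counts how many physical reads are needed to fetch every
// chunk in |group|, coalescing chunks separated by gaps of at most |blockSize|.
func physicalReads(group []chunkLoc, blockSize uint64) (reads int) {
	sort.Slice(group, func(i, j int) bool { return group[i].offset < group[j].offset })
	var readEnd uint64
	started := false
	for _, c := range group {
		if started && c.offset <= readEnd+blockSize {
			// Close enough to the current read: extend it.
			if end := c.offset + c.length; end > readEnd {
				readEnd = end
			}
			continue
		}
		reads++
		readEnd = c.offset + c.length
		started = true
	}
	return reads
}

func main() {
	// One group, roughly branching-factor sized. With perfect locality all of
	// these chunks would coalesce into the single "optimal" read.
	group := []chunkLoc{{0, 4096}, {4096, 4096}, {1 << 20, 4096}}
	fmt.Printf("optimal: 1 read, estimated: %d reads\n", physicalReads(group, 4096))
}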

// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"encoding/binary"
	"io"
	"sort"
	"sync"

	"github.com/attic-labs/noms/go/chunks"
	"github.com/attic-labs/noms/go/d"
	"github.com/attic-labs/noms/go/hash"
	"github.com/golang/snappy"
)
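
// tableIndex is the parsed, in-memory form of an nbs table index. |prefixes|
// is sorted ascending; |ordinals| maps each sorted prefix back to its chunk's
// insertion order, which in turn indexes |offsets|, |lengths|, and (at a fixed
// stride of addrSuffixSize) |suffixes|.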
type tableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	prefixes, offsets     []uint64
	lengths, ordinals     []uint32
	suffixes              []byte
}
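
// tableReaderAt abstracts the physical storage backing a table: implementations
// read len(p) bytes at offset |off| and record metrics in |stats|.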
type tableReaderAt interface {
	ReadAtWithStats(p []byte, off int64, stats *Stats) (n int, err error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	tableIndex
	r         tableReaderAt
	blockSize uint64
}

// parseTableIndex parses a valid nbs tableIndex from a byte stream. |buff| must end
// with an NBS index and footer, though it may contain an unspecified number of bytes
// before that data. The returned tableIndex doesn't keep alive any references to |buff|.
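//
// The on-disk layout, as parsed back-to-front below, is (listed front to back):
//
//	prefix tuples: chunkCount records of (8-byte address prefix, 4-byte ordinal)
//	lengths:       chunkCount 4-byte chunk lengths, in offset order
//	suffixes:      chunkCount address suffixes, in insertion (ordinal) order
//	footer:        4-byte chunk count, 8-byte total uncompressed data size, magic number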
func parseTableIndex(buff []byte) tableIndex {
	pos := uint64(len(buff))

	// footer
	pos -= magicNumberSize
	d.Chk.True(string(buff[pos:]) == magicNumber)

	// total uncompressed chunk data
	pos -= uint64Size
	totalUncompressedData := binary.BigEndian.Uint64(buff[pos:])

	pos -= uint32Size
	chunkCount := binary.BigEndian.Uint32(buff[pos:])

	// index
	suffixesSize := uint64(chunkCount) * addrSuffixSize
	pos -= suffixesSize
	suffixes := make([]byte, suffixesSize)
	copy(suffixes, buff[pos:])

	lengthsSize := uint64(chunkCount) * lengthSize
	pos -= lengthsSize
	lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize])

	tuplesSize := uint64(chunkCount) * prefixTupleSize
	pos -= tuplesSize
	prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize])

	return tableIndex{
		chunkCount, totalUncompressedData,
		prefixes, offsets,
		lengths, ordinals,
		suffixes,
	}
}

func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) {
	lengths = make([]uint32, count)
	offsets = make([]uint64, count)

	lengths[0] = binary.BigEndian.Uint32(buff)
	for i := uint64(1); i < uint64(count); i++ {
		lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:])
		offsets[i] = offsets[i-1] + uint64(lengths[i-1])
	}
	return
}

func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) {
	prefixes = make([]uint64, count)
	ordinals = make([]uint32, count)

	for i := uint64(0); i < uint64(count); i++ {
		idx := i * prefixTupleSize
		prefixes[i] = binary.BigEndian.Uint64(buff[idx:])
		ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:])
	}
	return
}

func (ti tableIndex) prefixIdxToOrdinal(idx uint32) uint32 {
	return ti.ordinals[idx]
}

// prefixIdx returns the first position in |ti.prefixes| whose value is >= |prefix|
// (a lower-bound binary search); it returns |ti.chunkCount| when every prefix in the
// table is smaller than |prefix|. Callers must still compare the prefix at the
// returned position against |prefix|.
func (ti tableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// idx ≤ h < j
		if ti.prefixes[h] < prefix {
			idx = h + 1 // preserves f(idx-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}
	return
}

// ordinalSuffixMatches returns true iff the suffix at insertion order |ordinal|
// matches the suffix of address |h|.
func (ti tableIndex) ordinalSuffixMatches(ordinal uint32, h addr) bool {
	li := uint64(ordinal) * addrSuffixSize
	return bytes.Equal(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
}

// lookupOrdinal returns the ordinal of |h| if present, or |ti.chunkCount| if absent.
func (ti tableIndex) lookupOrdinal(h addr) uint32 {
	prefix := h.Prefix()

	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ {
		ordinal := ti.prefixIdxToOrdinal(idx)
		if ti.ordinalSuffixMatches(ordinal, h) {
			return ordinal
		}
	}

	return ti.chunkCount
}

// newTableReader wraps a parsed table index and a reader into a tableReader. |r| should
// allow retrieving any desired range of bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) tableReader {
	return tableReader{index, r, blockSize}
}

// hasMany scans across |addrs| and |tr.prefixes| (logically, two ordered slices of
// address prefixes), marking each record whose chunk is present in this table.
// |remaining| is true if any requested address was not found here.
func (tr tableReader) hasMany(addrs []hasRecord) (remaining bool) {
	// TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs))
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))

	for i, addr := range addrs {
		if addr.has {
			continue
		}

		for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true
			return
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *addr.a) {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return
}

func (tr tableReader) count() uint32 {
	return tr.chunkCount
}

func (tr tableReader) uncompressedLen() uint64 {
	return tr.totalUncompressedData
}

func (tr tableReader) index() tableIndex {
	return tr.tableIndex
}

// has returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) bool {
	ordinal := tr.lookupOrdinal(h)
	return ordinal < tr.count()
}

// get returns the chunk data associated with |h| if present, and nil if absent.
func (tr tableReader) get(h addr, stats *Stats) (data []byte) {
	ordinal := tr.lookupOrdinal(h)
	if ordinal == tr.count() {
		return
	}

	offset := tr.offsets[ordinal]
	length := uint64(tr.lengths[ordinal])
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(buff, int64(offset), stats)
	d.Chk.NoError(err)
	d.Chk.True(n == int(length))

	data = tr.parseChunk(buff)
	d.Chk.True(data != nil)
	return
}
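
// offsetRec ties a requested address to the physical location of its chunk record
// within the table. offsetRecSlice sorts by offset so that nearby records can be
// coalesced into single physical reads.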
type offsetRec struct {
	a       *addr
	ordinal uint32
	offset  uint64
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }
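
// readAtOffsets performs one physical read covering [readStart, readEnd), then carves
// each record in |offsets| out of the buffer, decodes it, and sends the resulting
// chunk to |foundChunks|.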
func (tr tableReader) readAtOffsets(
	readStart, readEnd uint64,
	reqs []getRecord,
	offsets offsetRecSlice,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) {
	readLength := readEnd - readStart
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(buff, int64(readStart), stats)
	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == readLength)

	for _, rec := range offsets {
		d.Chk.True(rec.offset >= readStart)
		localStart := rec.offset - readStart
		localEnd := localStart + uint64(tr.lengths[rec.ordinal])
		d.Chk.True(localEnd <= readLength)
		data := tr.parseChunk(buff[localStart:localEnd])
		c := chunks.NewChunkWithHash(hash.Hash(*rec.a), data)
		foundChunks <- &c
	}

	wg.Done()
}

// getMany retrieves multiple stored blocks and optimizes by attempting to read in
// larger physical blocks which contain multiple stored blocks. |reqs| must be sorted
// by address prefix.
func (tr tableReader) getMany(
	reqs []getRecord,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) (remaining bool) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	tr.getManyAtOffsets(reqs, offsetRecords, foundChunks, wg, stats)
	return remaining
}

func (tr tableReader) getManyAtOffsets(
	reqs []getRecord,
	offsetRecords offsetRecSlice,
	foundChunks chan *chunks.Chunk,
	wg *sync.WaitGroup,
	stats *Stats,
) {
	// Now |offsetRecords| contains all locations within the table which must be searched
	// (note that there may be duplicates of a particular location). The slice is already
	// sorted by offset (findOffsets does this), so scan forward, grouping sequences of
	// reads into large physical reads.
	var batch offsetRecSlice
	var readStart, readEnd uint64

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if batch == nil {
			batch = make(offsetRecSlice, 1)
			batch[0] = offsetRecords[i]
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, tr.blockSize); canRead {
			batch = append(batch, rec)
			readEnd = newReadEnd
			i++
			continue
		}

		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats)
		batch = nil
	}

	if batch != nil {
		wg.Add(1)
		go tr.readAtOffsets(readStart, readEnd, reqs, batch, foundChunks, wg, stats)
		batch = nil
	}
}

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by address) to build
// the set of table locations which must be read in order to find each chunk specified
// by |reqs|. If this table contains all requested chunks, |remaining| will be false
// upon return; otherwise it will be true. The returned offsetRecSlice is sorted by
// offset.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// advance within the prefixes until we reach one which is >= req.prefix
		for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			if tr.ordinalSuffixMatches(tr.prefixIdxToOrdinal(j), *req.a) {
				reqs[i].found = true
				ors = append(ors, offsetRec{req.a, tr.ordinals[j], tr.offsets[tr.ordinals[j]]})
			}
		}
	}

	sort.Sort(ors)
	return ors, remaining
}

func canReadAhead(fRec offsetRec, fLength uint32, readStart, readEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < readEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return readEnd, true
	}

	if fRec.offset-readEnd > blockSize {
		return readEnd, false
	}

	return fRec.offset + uint64(fLength), true
}

// parseChunk verifies the checksum at the end of a raw chunk record and returns the
// snappy-decoded chunk data.
func (tr tableReader) parseChunk(buff []byte) []byte {
	dataLen := uint64(len(buff)) - checksumSize
	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	d.Chk.True(chksum == crc(buff[:dataLen]))

	data, err := snappy.Decode(nil, buff[:dataLen])
	d.Chk.NoError(err)
	return data
}
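
// calcReads computes the number of physical reads required to satisfy |reqs| against
// this table, coalescing adjacent chunk records via canReadAhead with the supplied
// |blockSize|, but without performing any I/O.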
func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find
	// all the elements of |reqs| which are present in this table.
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.
	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := tr.lengths[rec.ordinal]

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, tr.lengths[rec.ordinal], readStart, readEnd, blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}
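
// extract decodes every chunk record in the table, in on-disk order, and sends each
// chunk to |chunks| along with its reconstructed address.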
func (tr tableReader) extract(chunks chan<- extractRecord) {
	// Build reverse lookup table from ordinal -> chunk hash
	hashes := make(addrSlice, len(tr.prefixes))
	for idx, prefix := range tr.prefixes {
		ordinal := tr.prefixIdxToOrdinal(uint32(idx))
		binary.BigEndian.PutUint64(hashes[ordinal][:], prefix)
		li := uint64(ordinal) * addrSuffixSize
		copy(hashes[ordinal][addrPrefixSize:], tr.suffixes[li:li+addrSuffixSize])
	}

	chunkLen := tr.offsets[tr.chunkCount-1] + uint64(tr.lengths[tr.chunkCount-1])
	buff := make([]byte, chunkLen)
	n, err := tr.r.ReadAtWithStats(buff, int64(tr.offsets[0]), &Stats{})
	d.Chk.NoError(err)
	d.Chk.True(uint64(n) == chunkLen)

	sendChunk := func(i uint32) {
		localOffset := tr.offsets[i] - tr.offsets[0]
		chunks <- extractRecord{a: hashes[i], data: tr.parseChunk(buff[localOffset : localOffset+uint64(tr.lengths[i])])}
	}

	for i := uint32(0); i < tr.chunkCount; i++ {
		sendChunk(i)
	}
}

func (tr tableReader) reader() io.Reader {
	return &readerAdapter{tr.r, 0}
}
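
// readerAdapter adapts a tableReaderAt into an io.Reader by tracking a read cursor.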
type readerAdapter struct {
	rat tableReaderAt
	off int64
}

func (ra *readerAdapter) Read(p []byte) (n int, err error) {
	n, err = ra.rat.ReadAtWithStats(p, ra.off, &Stats{})
	ra.off += int64(n)
	return
}