// dolt/go/store/nbs/table_reader.go
// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"encoding/binary"
	"errors"
	"io"
	"sort"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"
)

// Do not read more than 128MB at a time.
const maxReadSize = 128 * 1024 * 1024

// CompressedChunk represents a chunk of data in a table file which is still compressed via snappy.
type CompressedChunk struct {
	// H is the hash of the chunk.
	H hash.Hash
	// FullCompressedChunk is the entirety of the compressed chunk data, including the crc.
	FullCompressedChunk []byte
	// CompressedData is just the snappy-encoded byte buffer that stores the chunk data.
	CompressedData []byte
}
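
// A sketch of the byte layout implied by NewCompressedChunk and
// ChunkToCompressedChunk below (reconstructed from this file, not normative):
//
//	+---------------------------+--------------------+
//	| snappy(chunk data)        | crc (4 bytes, BE)  |
//	+---------------------------+--------------------+
//	|<----- CompressedData ---->|
//	|<------------ FullCompressedChunk ------------->|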

// NewCompressedChunk creates a CompressedChunk.
func NewCompressedChunk(h hash.Hash, buff []byte) (CompressedChunk, error) {
	dataLen := uint64(len(buff)) - checksumSize
	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	compressedData := buff[:dataLen]
	if chksum != crc(compressedData) {
		return CompressedChunk{}, errors.New("checksum error")
	}
	return CompressedChunk{H: h, FullCompressedChunk: buff, CompressedData: compressedData}, nil
}

// ToChunk snappy-decodes the compressed data and returns a chunks.Chunk.
func (cmp CompressedChunk) ToChunk() (chunks.Chunk, error) {
	data, err := snappy.Decode(nil, cmp.CompressedData)
	if err != nil {
		return chunks.Chunk{}, err
	}
	return chunks.NewChunkWithHash(cmp.H, data), nil
}

// ChunkToCompressedChunk snappy-encodes |chunk| and appends a big-endian crc,
// producing the byte layout that NewCompressedChunk expects.
func ChunkToCompressedChunk(chunk chunks.Chunk) CompressedChunk {
	compressed := snappy.Encode(nil, chunk.Data())
	length := len(compressed)
	// todo: this append allocates a new buffer and copies |compressed|.
	// This is costly, but maybe better, as it allows us to reclaim the
	// extra space allocated in snappy.Encode (see snappy.MaxEncodedLen).
	compressed = append(compressed, []byte{0, 0, 0, 0}...)
	binary.BigEndian.PutUint32(compressed[length:], crc(compressed[:length]))
	return CompressedChunk{H: chunk.Hash(), FullCompressedChunk: compressed, CompressedData: compressed[:length]}
}
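
// Illustrative sketch (not part of the original file; the function name is
// hypothetical): round-tripping bytes through ChunkToCompressedChunk and back,
// assuming chunks.NewChunk hashes raw data into a Chunk.
func exampleRoundTrip(data []byte) ([]byte, error) {
	cmp := ChunkToCompressedChunk(chunks.NewChunk(data))
	// NewCompressedChunk re-verifies the crc that ChunkToCompressedChunk appended.
	verified, err := NewCompressedChunk(cmp.H, cmp.FullCompressedChunk)
	if err != nil {
		return nil, err
	}
	chk, err := verified.ToChunk()
	if err != nil {
		return nil, err
	}
	return chk.Data(), nil
}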

// Hash returns the hash of the data.
func (cmp CompressedChunk) Hash() hash.Hash {
	return cmp.H
}

// IsEmpty returns true if the chunk contains no data. (Snappy encodes zero
// bytes of input as the single byte 0x00, hence the second case.)
func (cmp CompressedChunk) IsEmpty() bool {
	return len(cmp.CompressedData) == 0 || (len(cmp.CompressedData) == 1 && cmp.CompressedData[0] == 0)
}

// CompressedSize returns the size of this CompressedChunk.
func (cmp CompressedChunk) CompressedSize() int {
	return len(cmp.CompressedData)
}

var EmptyCompressedChunk CompressedChunk

func init() {
	EmptyCompressedChunk = ChunkToCompressedChunk(chunks.EmptyChunk)
}
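
// Note that EmptyCompressedChunk is the compressed form of chunks.EmptyChunk,
// so its CompressedData is exactly the single 0x00 byte that IsEmpty above
// tests for.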

// ErrInvalidTableFile is an error returned when a table file is corrupt or invalid.
var ErrInvalidTableFile = errors.New("invalid or corrupt table file")

type indexEntry interface {
	Offset() uint64
	Length() uint32
}

type indexResult struct {
	o uint64
	l uint32
}

func (ir indexResult) Offset() uint64 {
	return ir.o
}

func (ir indexResult) Length() uint32 {
	return ir.l
}

type tableReaderAt interface {
	ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error)
	Reader(ctx context.Context) (io.ReadCloser, error)
	Close() error
	clone() (tableReaderAt, error)
}

// tableReader implements get & has queries against a single nbs table. Goroutine-safe.
// |blockSize| refers to the block size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	prefixes  []uint64
	idx       tableIndex
	r         tableReaderAt
	blockSize uint64
}

// newTableReader returns a tableReader that serves reads from |r| using the
// parsed table |index|. |r| should allow retrieving any desired range of bytes
// from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) (tableReader, error) {
	p, err := index.prefixes()
	if err != nil {
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  p,
		idx:       index,
		r:         r,
		blockSize: blockSize,
	}, nil
}

// hasMany scans across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (bool, error) {
	filterIdx := uint32(0)
	filterLen := uint32(tr.idx.chunkCount())

	var remaining bool
	for i, addr := range addrs {
		if addr.has {
			continue
		}

		// Use binary search to find the location of the addr.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= addr.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2 // filterIdx <= h < j
			if tr.prefixes[h] < addr.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < addr.prefix
			} else {
				j = h // tr.prefixes[j] >= addr.prefix
			}
		}

		if filterIdx >= filterLen {
			return true, nil
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, addr.a)
			if err != nil {
				return false, err
			}
			if m {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return remaining, nil
}
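
// For reference, the manual lower-bound loop above (also used in findOffsets)
// is equivalent to this sort.Search formulation; an illustrative sketch with a
// hypothetical helper name, not used by the code in this file:
//
//	func lowerBound(prefixes []uint64, lo uint32, p uint64) uint32 {
//		n := uint32(len(prefixes))
//		return lo + uint32(sort.Search(int(n-lo), func(i int) bool {
//			return prefixes[int(lo)+i] >= p
//		}))
//	}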

func (tr tableReader) count() (uint32, error) {
	return tr.idx.chunkCount(), nil
}

func (tr tableReader) uncompressedLen() (uint64, error) {
	return tr.idx.totalUncompressedData(), nil
}

func (tr tableReader) index() (tableIndex, error) {
	return tr.idx, nil
}

// has returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) (bool, error) {
	_, ok, err := tr.idx.lookup(&h)
	return ok, err
}

// get returns the chunk data associated with |h|, if present, and nil if
// absent. The data is snappy-decoded into a freshly allocated buffer.
func (tr tableReader) get(ctx context.Context, h addr, stats *Stats) ([]byte, error) {
	e, found, err := tr.idx.lookup(&h)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}

	offset := e.Offset()
	length := uint64(e.Length())
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(offset), stats)
	if err != nil {
		return nil, err
	}
	if n != int(length) {
		return nil, errors.New("failed to read all data")
	}

	cmp, err := NewCompressedChunk(hash.Hash(h), buff)
	if err != nil {
		return nil, err
	}
	if len(cmp.CompressedData) == 0 {
		return nil, errors.New("failed to get data")
	}

	chnk, err := cmp.ToChunk()
	if err != nil {
		return nil, err
	}
	return chnk.Data(), nil
}
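
// To restate get's data flow: a single ReadAtWithStats call fetches the
// chunk's full compressed record (|length| bytes at |offset|),
// NewCompressedChunk verifies the trailing crc, and ToChunk snappy-decodes
// the payload.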

type offsetRec struct {
	a      *addr
	offset uint64
	length uint32
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

var _ chunkReader = tableReader{}

func (tr tableReader) readCompressedAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, CompressedChunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		found(ctx, cmp)
		return nil
	})
}

func (tr tableReader) readAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		chk, err := cmp.ToChunk()
		if err != nil {
			return err
		}
		found(ctx, &chk)
		return nil
	})
}

func (tr tableReader) readAtOffsetsWithCB(
	ctx context.Context,
	rb readBatch,
	stats *Stats,
	cb func(ctx context.Context, cmp CompressedChunk) error,
) error {
	readLength := rb.End() - rb.Start()
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(rb.Start()), stats)
	if err != nil {
		return err
	}
	if uint64(n) != readLength {
		return errors.New("failed to read all data")
	}

	for i := range rb {
		cmp, err := rb.ExtractChunkFromRead(buff, i)
		if err != nil {
			return err
		}
		err = cb(ctx, cmp)
		if err != nil {
			return err
		}
	}
	return nil
}

// getMany retrieves multiple stored chunks, optimizing by grouping requests
// that land close together in the table into larger physical reads. |reqs|
// must be sorted by address prefix.
func (tr tableReader) getMany(
	ctx context.Context,
	eg *errgroup.Group,
	reqs []getRecord,
	found func(context.Context, *chunks.Chunk),
	stats *Stats) (bool, error) {

	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(context.Context, CompressedChunk), stats *Stats) (bool, error) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyCompressedAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}
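
// Illustrative sketch (not part of the original file; the function name is
// hypothetical): driving getMany with an errgroup. getMany only schedules the
// batched reads on |eg|; callers must Wait for them to complete, and the
// |found| callback may be invoked concurrently from multiple goroutines.
func exampleGetMany(ctx context.Context, tr tableReader, reqs []getRecord, stats *Stats) (bool, error) {
	eg, egCtx := errgroup.WithContext(ctx)
	remaining, err := tr.getMany(egCtx, eg, reqs, func(_ context.Context, c *chunks.Chunk) {
		// Consume |c| here, e.g. send it down a channel.
	}, stats)
	if err != nil {
		return false, err
	}
	if err := eg.Wait(); err != nil {
		return false, err
	}
	return remaining, nil
}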

func (tr tableReader) getManyCompressedAtOffsets(ctx context.Context, eg *errgroup.Group, offsetRecords offsetRecSlice, found func(context.Context, CompressedChunk), stats *Stats) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readCompressedAtOffsets(ctx, rb, found, stats)
	})
}

func (tr tableReader) getManyAtOffsets(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readAtOffsets(ctx, rb, found, stats)
	})
}

type readBatch offsetRecSlice

func (r readBatch) Start() uint64 {
	return r[0].offset
}

func (r readBatch) End() uint64 {
	last := r[len(r)-1]
	return last.offset + uint64(last.length)
}

func (r readBatch) ExtractChunkFromRead(buff []byte, idx int) (CompressedChunk, error) {
	rec := r[idx]
	chunkStart := rec.offset - r.Start()
	return NewCompressedChunk(hash.Hash(*rec.a), buff[chunkStart:chunkStart+uint64(rec.length)])
}

func toReadBatches(offsets offsetRecSlice, blockSize uint64) []readBatch {
	res := make([]readBatch, 0)
	var batch readBatch
	for i := 0; i < len(offsets); {
		rec := offsets[i]
		if batch == nil {
			batch = readBatch{rec}
			i++
			continue
		}
		if _, canRead := canReadAhead(rec, batch.Start(), batch.End(), blockSize); canRead {
			batch = append(batch, rec)
			i++
			continue
		}
		res = append(res, batch)
		batch = nil
	}
	if batch != nil {
		res = append(res, batch)
	}
	return res
}
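
// Worked example (illustrative numbers): with blockSize = 4096 and offset
// records (offset, length) = (0, 100), (100, 50), (4000, 64), (10000, 32),
// toReadBatches produces two batches. The first three records coalesce into a
// single batch spanning [0, 4064) -- the 3850-byte gap before offset 4000 is
// tolerated because it does not exceed blockSize -- so one physical read of
// 4064 bytes serves all three chunks. The gap before offset 10000 is larger
// than blockSize, so that record starts a second batch, read separately.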

func (tr tableReader) getManyAtOffsetsWithReadFunc(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	stats *Stats,
	readAtOffsets func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error,
) error {
	batches := toReadBatches(offsetRecords, tr.blockSize)
	for i := range batches {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		i := i // capture the loop variable for the closure below
		eg.Go(func() error {
			return readAtOffsets(ctx, batches[i], stats)
		})
	}
	return nil
}

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by address)
// to build the set of table locations which must be read in order to find each
// chunk specified by |reqs|. If this table contains all requested chunks,
// |remaining| will be false upon return; otherwise it will be true. The
// resulting offsetRecSlice is sorted in offset order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool, err error) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// Use binary search to find the location of req.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= req.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2 // filterIdx <= h < j
			if tr.prefixes[h] < req.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < req.prefix
			} else {
				j = h // tr.prefixes[j] >= req.prefix
			}
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, req.a)
			if err != nil {
				return nil, false, err
			}
			if m {
				reqs[i].found = true
				entry, err := tr.idx.indexEntry(j, nil)
				if err != nil {
					return nil, false, err
				}
				ors = append(ors, offsetRec{req.a, entry.Offset(), entry.Length()})
				break
			}
		}

		if !reqs[i].found {
			remaining = true
		}
	}

	sort.Sort(ors)
	return ors, remaining, nil
}

func canReadAhead(fRec offsetRec, curStart, curEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < curEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return curEnd, true
	}

	if curEnd-curStart >= maxReadSize {
		return curEnd, false
	}

	if fRec.offset-curEnd > blockSize {
		return curEnd, false
	}

	return fRec.offset + uint64(fRec.length), true
}
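
// Restating canReadAhead's three rules: overlapping or duplicate offsets are
// always absorbed into the current read; a read stops growing once it reaches
// maxReadSize; and a gap larger than |blockSize| between the current read and
// the next record ends the read, since the tolerated per-read overhead is one
// storage block.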

// calcReads computes the number of physical reads required to satisfy the
// members of |reqs| present in this table, grouping adjacent locations the
// same way canReadAhead does.
func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to
	// find all the elements of |reqs| which are present in this table.
	offsetRecords, remaining, err = tr.findOffsets(reqs)
	if err != nil {
		return 0, false, err
	}

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.
	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := rec.length

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, readStart, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

// extract reads every chunk in the table, in offset order, and sends an
// extractRecord for each one down |chunks|.
func (tr tableReader) extract(ctx context.Context, chunks chan<- extractRecord) error {
	sendChunk := func(or offsetRec) error {
		buff := make([]byte, or.length)
		n, err := tr.r.ReadAtWithStats(ctx, buff, int64(or.offset), &Stats{})
		if err != nil {
			return err
		}
		if uint32(n) != or.length {
			return errors.New("did not read all data")
		}
		cmp, err := NewCompressedChunk(hash.Hash(*or.a), buff)
		if err != nil {
			return err
		}
		chnk, err := cmp.ToChunk()
		if err != nil {
			return err
		}
		chunks <- extractRecord{a: *or.a, data: chnk.Data()}
		return nil
	}

	var ors offsetRecSlice
	for i := uint32(0); i < tr.idx.chunkCount(); i++ {
		a := new(addr)
		e, err := tr.idx.indexEntry(i, a)
		if err != nil {
			return err
		}
		ors = append(ors, offsetRec{a, e.Offset(), e.Length()})
	}
	sort.Sort(ors)
	for _, or := range ors {
		err := sendChunk(or)
		if err != nil {
			return err
		}
	}
	return nil
}

// reader returns an io.ReadCloser over the entire table file, along with the
// file's size in bytes.
func (tr tableReader) reader(ctx context.Context) (io.ReadCloser, uint64, error) {
	i, _ := tr.index()
	sz := i.tableFileSize()
	r, err := tr.r.Reader(ctx)
	if err != nil {
		return nil, 0, err
	}
	return r, sz, nil
}

// getRecordRanges returns the byte range within the table file for each member
// of |requests| found in this table. Note that findOffsets sets
// getRecord.found on the requests it locates.
func (tr tableReader) getRecordRanges(requests []getRecord) (map[hash.Hash]Range, error) {
	recs, _, err := tr.findOffsets(requests)
	if err != nil {
		return nil, err
	}
	ranges := make(map[hash.Hash]Range, len(recs))
	for _, r := range recs {
		ranges[hash.Hash(*r.a)] = Range{
			Offset: r.offset,
			Length: r.length,
		}
	}
	return ranges, nil
}

func (tr tableReader) currentSize() uint64 {
	return tr.idx.tableFileSize()
}

func (tr tableReader) close() error {
	err := tr.idx.Close()
	if err != nil {
		tr.r.Close()
		return err
	}
	return tr.r.Close()
}

func (tr tableReader) clone() (tableReader, error) {
	idx, err := tr.idx.clone()
	if err != nil {
		return tableReader{}, err
	}
	r, err := tr.r.clone()
	if err != nil {
		idx.Close()
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  tr.prefixes,
		idx:       idx,
		r:         r,
		blockSize: tr.blockSize,
	}, nil
}