// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"encoding/binary"
	"errors"
	"io"
	"sort"

	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

// Do not read more than 128MB at a time.
const maxReadSize = 128 * 1024 * 1024

// CompressedChunk represents a chunk of data in a table file which is still compressed via snappy.
type CompressedChunk struct {
	// H is the hash of the chunk
	H hash.Hash
	// FullCompressedChunk is the entirety of the compressed chunk data including the crc
	FullCompressedChunk []byte
	// CompressedData is just the snappy encoded byte buffer that stores the chunk data
	CompressedData []byte
}

// NewCompressedChunk creates a CompressedChunk from |buff|, which must be the
// snappy-encoded chunk data followed by a big-endian crc of those bytes.
func NewCompressedChunk(h hash.Hash, buff []byte) (CompressedChunk, error) {
	dataLen := uint64(len(buff)) - checksumSize

	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	compressedData := buff[:dataLen]

	if chksum != crc(compressedData) {
		return CompressedChunk{}, errors.New("checksum error")
	}

	return CompressedChunk{H: h, FullCompressedChunk: buff, CompressedData: compressedData}, nil
}

// ToChunk snappy decodes the compressed data and returns a chunks.Chunk.
func (cmp CompressedChunk) ToChunk() (chunks.Chunk, error) {
	data, err := snappy.Decode(nil, cmp.CompressedData)
	if err != nil {
		return chunks.Chunk{}, err
	}

	return chunks.NewChunkWithHash(cmp.H, data), nil
}

// ChunkToCompressedChunk snappy encodes |chunk|'s data and appends a big-endian
// crc of the encoded bytes, producing the on-disk chunk record format.
func ChunkToCompressedChunk(chunk chunks.Chunk) CompressedChunk {
	compressed := snappy.Encode(nil, chunk.Data())
	length := len(compressed)

	// TODO: this append allocates a new buffer and copies |compressed|.
	// This is costly, but maybe better, as it allows us to reclaim the
	// extra space allocated in snappy.Encode (see snappy.MaxEncodedLen).
	compressed = append(compressed, []byte{0, 0, 0, 0}...)
	binary.BigEndian.PutUint32(compressed[length:], crc(compressed[:length]))
	return CompressedChunk{H: chunk.Hash(), FullCompressedChunk: compressed, CompressedData: compressed[:length]}
}

// Hash returns the hash of the data.
func (cmp CompressedChunk) Hash() hash.Hash {
	return cmp.H
}

// IsEmpty returns true if the chunk contains no data.
func (cmp CompressedChunk) IsEmpty() bool {
	return len(cmp.CompressedData) == 0 || (len(cmp.CompressedData) == 1 && cmp.CompressedData[0] == 0)
}

// CompressedSize returns the size of this CompressedChunk.
func (cmp CompressedChunk) CompressedSize() int {
	return len(cmp.CompressedData)
}

// EmptyCompressedChunk is the CompressedChunk of the empty chunk.
var EmptyCompressedChunk CompressedChunk

func init() {
	EmptyCompressedChunk = ChunkToCompressedChunk(chunks.EmptyChunk)
}

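// compressedChunkRoundTrip is an illustrative sketch (it is not called by the
// package) showing how the pieces above compose: ChunkToCompressedChunk
// produces snappy(data)+crc in FullCompressedChunk, which is exactly the
// layout NewCompressedChunk expects, and ToChunk reverses the encoding.
func compressedChunkRoundTrip(chk chunks.Chunk) (chunks.Chunk, error) {
	cmp := ChunkToCompressedChunk(chk)

	// Rehydrate from the full on-disk representation, re-verifying the crc.
	cmp2, err := NewCompressedChunk(cmp.Hash(), cmp.FullCompressedChunk)
	if err != nil {
		return chunks.Chunk{}, err
	}

	// Snappy decode back to the original chunk bytes.
	return cmp2.ToChunk()
}
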
// ErrInvalidTableFile is an error returned when a table file is corrupt or invalid.
var ErrInvalidTableFile = errors.New("invalid or corrupt table file")

type indexEntry interface {
	Offset() uint64
	Length() uint32
}

type indexResult struct {
	o uint64
	l uint32
}

func (ir indexResult) Offset() uint64 {
	return ir.o
}

func (ir indexResult) Length() uint32 {
	return ir.l
}

type tableReaderAt interface {
	ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error)
	Reader(ctx context.Context) (io.ReadCloser, error)
	Close() error
	clone() (tableReaderAt, error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	prefixes  []uint64
	idx       tableIndex
	r         tableReaderAt
	blockSize uint64
}

// newTableReader returns a tableReader that serves queries against the chunks described
// by |index|. r must allow retrieving any desired range of bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) (tableReader, error) {
	p, err := index.prefixes()
	if err != nil {
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  p,
		idx:       index,
		r:         r,
		blockSize: blockSize,
	}, nil
}

// hasMany scans across (logically) two ordered slices of address prefixes, marking
// in |addrs| which chunks this table contains. It returns true if any requested
// address remains unfound.
func (tr tableReader) hasMany(addrs []hasRecord) (bool, error) {
	filterIdx := uint32(0)
	filterLen := uint32(tr.idx.chunkCount())

	var remaining bool
	for i, addr := range addrs {
		if addr.has {
			continue
		}

		// Use binary search to find the location of the addr.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= addr.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2 // filterIdx <= h < j
			if tr.prefixes[h] < addr.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < addr.prefix
			} else {
				j = h // tr.prefixes[j] >= addr.prefix
			}
		}

		if filterIdx >= filterLen {
			return true, nil
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, addr.a)
			if err != nil {
				return false, err
			}
			if m {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return remaining, nil
}

func (tr tableReader) count() (uint32, error) {
	return tr.idx.chunkCount(), nil
}

func (tr tableReader) uncompressedLen() (uint64, error) {
	return tr.idx.totalUncompressedData(), nil
}

func (tr tableReader) index() (tableIndex, error) {
	return tr.idx, nil
}

// has returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) (bool, error) {
	_, ok, err := tr.idx.lookup(&h)
	return ok, err
}

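// prefixLowerBound is an illustrative sketch (not used by the package) of the
// lower-bound binary search that hasMany and findOffsets perform inline: it
// returns the first index in |prefixes| whose value is >= |target|, or
// len(prefixes) if every prefix is smaller.
func prefixLowerBound(prefixes []uint64, target uint64) uint32 {
	lo, hi := uint32(0), uint32(len(prefixes))
	for lo < hi {
		h := lo + (hi-lo)/2 // lo <= h < hi
		if prefixes[h] < target {
			lo = h + 1 // prefixes[lo-1] < target
		} else {
			hi = h // prefixes[hi] >= target
		}
	}
	return lo
}
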
// get returns the data associated with |h|, iff present, and nil if absent. On
// success, the returned byte slice is the freshly decompressed chunk data.
func (tr tableReader) get(ctx context.Context, h addr, stats *Stats) ([]byte, error) {
	e, found, err := tr.idx.lookup(&h)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}

	offset := e.Offset()
	length := uint64(e.Length())
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(offset), stats)
	if err != nil {
		return nil, err
	}

	if n != int(length) {
		return nil, errors.New("failed to read all data")
	}

	cmp, err := NewCompressedChunk(hash.Hash(h), buff)
	if err != nil {
		return nil, err
	}

	if len(cmp.CompressedData) == 0 {
		return nil, errors.New("failed to get data")
	}

	chnk, err := cmp.ToChunk()
	if err != nil {
		return nil, err
	}

	return chnk.Data(), nil
}

// offsetRec records the table-file location (offset and length) of the chunk
// with address |a|.
type offsetRec struct {
	a      *addr
	offset uint64
	length uint32
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

var _ chunkReader = tableReader{}

func (tr tableReader) readCompressedAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, CompressedChunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		found(ctx, cmp)
		return nil
	})
}

func (tr tableReader) readAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		chk, err := cmp.ToChunk()
		if err != nil {
			return err
		}

		found(ctx, &chk)
		return nil
	})
}

// readAtOffsetsWithCB performs one physical read covering the whole |rb| batch,
// then extracts each chunk record from the read buffer and passes it to |cb|.
func (tr tableReader) readAtOffsetsWithCB(
	ctx context.Context,
	rb readBatch,
	stats *Stats,
	cb func(ctx context.Context, cmp CompressedChunk) error,
) error {
	readLength := rb.End() - rb.Start()
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(rb.Start()), stats)
	if err != nil {
		return err
	}

	if uint64(n) != readLength {
		return errors.New("failed to read all data")
	}

	for i := range rb {
		cmp, err := rb.ExtractChunkFromRead(buff, i)
		if err != nil {
			return err
		}

		err = cb(ctx, cmp)
		if err != nil {
			return err
		}
	}

	return nil
}

// getMany retrieves multiple stored blocks and optimizes by attempting to read in larger physical
// blocks which contain multiple stored blocks. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(
	ctx context.Context,
	eg *errgroup.Group,
	reqs []getRecord,
	found func(context.Context, *chunks.Chunk),
	stats *Stats) (bool, error) {

	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

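// getManyUsageSketch is a hypothetical caller of getMany (not part of the
// package) illustrating the contract: getMany only schedules reads on |eg|,
// so the caller must Wait before relying on the results, and |found| may be
// invoked concurrently from the errgroup's goroutines.
func getManyUsageSketch(ctx context.Context, tr tableReader, reqs []getRecord, stats *Stats) (bool, error) {
	eg, egCtx := errgroup.WithContext(ctx)

	remaining, err := tr.getMany(egCtx, eg, reqs, func(ctx context.Context, c *chunks.Chunk) {
		_ = c // process each materialized chunk; must be safe for concurrent calls
	}, stats)
	if err != nil {
		return false, err
	}

	// Block until every scheduled read batch has completed (or failed).
	if err = eg.Wait(); err != nil {
		return false, err
	}
	return remaining, nil
}
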
func (tr tableReader) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(context.Context, CompressedChunk), stats *Stats) (bool, error) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyCompressedAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressedAtOffsets(ctx context.Context, eg *errgroup.Group, offsetRecords offsetRecSlice, found func(context.Context, CompressedChunk), stats *Stats) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context, rb readBatch, stats *Stats) error {
		return tr.readCompressedAtOffsets(ctx, rb, found, stats)
	})
}

func (tr tableReader) getManyAtOffsets(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context, rb readBatch, stats *Stats) error {
		return tr.readAtOffsets(ctx, rb, found, stats)
	})
}

// readBatch is a contiguous group of offsetRecs, sorted by offset, which will
// be satisfied by a single physical read of the underlying storage.
type readBatch offsetRecSlice

func (r readBatch) Start() uint64 {
	return r[0].offset
}

func (r readBatch) End() uint64 {
	last := r[len(r)-1]
	return last.offset + uint64(last.length)
}

func (r readBatch) ExtractChunkFromRead(buff []byte, idx int) (CompressedChunk, error) {
	rec := r[idx]
	chunkStart := rec.offset - r.Start()
	return NewCompressedChunk(hash.Hash(*rec.a), buff[chunkStart:chunkStart+uint64(rec.length)])
}

// toReadBatches groups |offsets| (which must be sorted by offset) into readBatches,
// using canReadAhead to decide when a record is close enough to the current batch
// to share one physical read.
func toReadBatches(offsets offsetRecSlice, blockSize uint64) []readBatch {
	res := make([]readBatch, 0)
	var batch readBatch
	for i := 0; i < len(offsets); {
		rec := offsets[i]
		if batch == nil {
			batch = readBatch{rec}
			i++
			continue
		}
		if _, canRead := canReadAhead(rec, batch.Start(), batch.End(), blockSize); canRead {
			batch = append(batch, rec)
			i++
			continue
		}
		res = append(res, batch)
		batch = nil
	}
	if batch != nil {
		res = append(res, batch)
	}
	return res
}

// getManyAtOffsetsWithReadFunc groups |offsetRecords| into read batches and
// schedules one |readAtOffsets| call per batch on |eg|.
func (tr tableReader) getManyAtOffsetsWithReadFunc(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	stats *Stats,
	readAtOffsets func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error,
) error {
	batches := toReadBatches(offsetRecords, tr.blockSize)
	for i := range batches {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		i := i
		eg.Go(func() error {
			return readAtOffsets(ctx, batches[i], stats)
		})
	}
	return nil
}

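// toReadBatchesSketch is an illustrative example (not used by the package) of
// the coalescing behavior above: two hypothetical records whose gap (100
// bytes) is within |blockSize| are folded into a single readBatch spanning
// [0, 450), so both chunks are served by one physical read.
func toReadBatchesSketch(a1, a2 *addr) []readBatch {
	offsets := offsetRecSlice{
		{a: a1, offset: 0, length: 250},   // first chunk: bytes [0, 250)
		{a: a2, offset: 350, length: 100}, // second chunk: bytes [350, 450)
	}
	return toReadBatches(offsets, 4096) // returns one batch, not two
}
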
// findOffsets iterates over |reqs| and |prefixes| (both sorted by address) to
// build the set of table locations which must be read in order to find each
// chunk specified by |reqs|. If this table contains all requested chunks,
// |remaining| will be false upon return; if some are not here, it will be
// true. The returned offsetRecSlice is sorted in offset order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool, err error) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// Use binary search to find the location of the addr.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= addr.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2 // filterIdx <= h < j
			if tr.prefixes[h] < req.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < req.prefix
			} else {
				j = h // tr.prefixes[j] >= req.prefix
			}
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, req.a)
			if err != nil {
				return nil, false, err
			}
			if m {
				reqs[i].found = true
				entry, err := tr.idx.indexEntry(j, nil)
				if err != nil {
					return nil, false, err
				}
				ors = append(ors, offsetRec{req.a, entry.Offset(), entry.Length()})
				break
			}
		}

		if !reqs[i].found {
			remaining = true
		}
	}

	sort.Sort(ors)
	return ors, remaining, nil
}

// canReadAhead reports whether the read described by |fRec| can be folded into
// the current physical read covering [curStart, curEnd), and if so, the new end
// of that read.
func canReadAhead(fRec offsetRec, curStart, curEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < curEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return curEnd, true
	}

	if curEnd-curStart >= maxReadSize {
		return curEnd, false
	}

	if fRec.offset-curEnd > blockSize {
		return curEnd, false
	}

	return fRec.offset + uint64(fRec.length), true
}

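// canReadAheadCases is an illustrative sketch (not used by the package) of the
// three canReadAhead outcomes, assuming a current read covering [0, 100) and a
// blockSize of 4096.
func canReadAheadCases(a *addr) {
	// A record already inside the current read: keep the read, keep going.
	_, ok := canReadAhead(offsetRec{a: a, offset: 50, length: 10}, 0, 100, 4096)
	_ = ok // true; curEnd is unchanged

	// A record within one blockSize of the current end: grow the read.
	end, _ := canReadAhead(offsetRec{a: a, offset: 4000, length: 50}, 0, 100, 4096)
	_ = end // 4050; the 3900-byte gap is tolerated

	// A record more than blockSize past the current end: start a new read.
	_, ok = canReadAhead(offsetRec{a: a, offset: 100000, length: 50}, 0, 100, 4096)
	_ = ok // false
}
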
// calcReads computes the number of physical reads required to satisfy |reqs|,
// grouping nearby records with the same coalescing rules used by getMany.
func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to
	// find all the elements of |reqs| which are present in this table.
	offsetRecords, remaining, err = tr.findOffsets(reqs)
	if err != nil {
		return 0, false, err
	}

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.
	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := rec.length

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, readStart, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

// extract reads every chunk in the table, in offset order, and sends each one
// on |chunks|.
func (tr tableReader) extract(ctx context.Context, chunks chan<- extractRecord) error {
	sendChunk := func(or offsetRec) error {
		buff := make([]byte, or.length)
		n, err := tr.r.ReadAtWithStats(ctx, buff, int64(or.offset), &Stats{})
		if err != nil {
			return err
		}
		if uint32(n) != or.length {
			return errors.New("did not read all data")
		}
		cmp, err := NewCompressedChunk(hash.Hash(*or.a), buff)
		if err != nil {
			return err
		}

		chnk, err := cmp.ToChunk()
		if err != nil {
			return err
		}

		chunks <- extractRecord{a: *or.a, data: chnk.Data()}
		return nil
	}

	var ors offsetRecSlice
	for i := uint32(0); i < tr.idx.chunkCount(); i++ {
		a := new(addr)
		e, err := tr.idx.indexEntry(i, a)
		if err != nil {
			return err
		}
		ors = append(ors, offsetRec{a, e.Offset(), e.Length()})
	}
	sort.Sort(ors)
	for _, or := range ors {
		err := sendChunk(or)
		if err != nil {
			return err
		}
	}

	return nil
}

func (tr tableReader) reader(ctx context.Context) (io.ReadCloser, uint64, error) {
	i, _ := tr.index()
	sz := i.tableFileSize()
	r, err := tr.r.Reader(ctx)
	if err != nil {
		return nil, 0, err
	}
	return r, sz, nil
}

func (tr tableReader) getRecordRanges(requests []getRecord) (map[hash.Hash]Range, error) {
	// findOffsets sets getRecord.found
	recs, _, err := tr.findOffsets(requests)
	if err != nil {
		return nil, err
	}
	ranges := make(map[hash.Hash]Range, len(recs))
	for _, r := range recs {
		ranges[hash.Hash(*r.a)] = Range{
			Offset: r.offset,
			Length: r.length,
		}
	}
	return ranges, nil
}

func (tr tableReader) currentSize() uint64 {
	return tr.idx.tableFileSize()
}

// close closes the underlying index and reader, returning the first error
// encountered.
func (tr tableReader) close() error {
	err := tr.idx.Close()
	if err != nil {
		tr.r.Close()
		return err
	}
	return tr.r.Close()
}

// clone returns a tableReader backed by independently closeable copies of the
// index and reader.
func (tr tableReader) clone() (tableReader, error) {
	idx, err := tr.idx.clone()
	if err != nil {
		return tableReader{}, err
	}
	r, err := tr.r.clone()
	if err != nil {
		idx.Close()
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  tr.prefixes,
		idx:       idx,
		r:         r,
		blockSize: tr.blockSize,
	}, nil
}

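// extractAllSketch is a hypothetical helper (not part of the package)
// illustrating how extract streams every chunk in offset order through a
// channel: the producer goroutine reports its error on a buffered channel and
// closes |ch| so the consumer's range loop terminates.
func extractAllSketch(ctx context.Context, tr tableReader) ([]extractRecord, error) {
	ch := make(chan extractRecord)
	errCh := make(chan error, 1)

	go func() {
		defer close(ch)
		errCh <- tr.extract(ctx, ch)
	}()

	var recs []extractRecord
	for rec := range ch {
		recs = append(recs, rec)
	}

	if err := <-errCh; err != nil {
		return nil, err
	}
	return recs, nil
}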