package lib

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"

	"xtx/eggsfs/crc32c"
	"xtx/eggsfs/msgs"
	"xtx/eggsfs/rs"
)
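
// blockReader exposes a single block of a span as an io.Reader. The block's
// cells are interleaved with those of the other blocks in the underlying
// span buffer, so consecutive cells of this block live `stride` bytes apart.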
type blockReader struct {
	cells    int
	cellSize int
	stride   int
	data     []byte
	cursor   int
}

func (r *blockReader) Read(p []byte) (int, error) {
	if r.cursor >= r.cells*r.cellSize {
		return 0, io.EOF
	}
	cell := r.cursor / r.cellSize
	cellCursor := r.cursor % r.cellSize
	read := copy(p, r.data[cell*r.stride+cellCursor:cell*r.stride+r.cellSize])
	r.cursor += read
	return read, nil
}
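
// createInlineSpan stores a small span directly in the shard metadata rather
// than in block services. The span may be logically longer than `data`, in
// which case the CRC is extended over the implicit trailing zeros.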
func (c *Client) createInlineSpan(
	log *Logger,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	sizeWithZeros uint32,
	data []byte,
) error {
	if int(sizeWithZeros) < len(data) {
		panic(fmt.Errorf("sizeWithZeros=%v < len(data)=%v", sizeWithZeros, len(data)))
	}
	crc := crc32c.Sum(0, data)
	crc = crc32c.ZeroExtend(crc, int(sizeWithZeros)-len(data))
	req := msgs.AddInlineSpanReq{
		FileId:       id,
		Cookie:       cookie,
		StorageClass: msgs.INLINE_STORAGE,
		ByteOffset:   offset,
		Size:         sizeWithZeros,
		Body:         data,
		Crc:          msgs.Crc(crc),
	}
	if err := c.ShardRequest(log, id.Shard(), &req, &msgs.AddInlineSpanResp{}); err != nil {
		return err
	}
	return nil
}
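
// ensureLen grows (or shrinks) buf to length l, reusing its capacity where
// possible, and zeroes any newly exposed bytes.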
func ensureLen(buf []byte, l int) []byte {
	lenBefore := len(buf)
	if l <= cap(buf) {
		buf = buf[:l]
	} else {
		buf = buf[:cap(buf)]
		buf = append(buf, make([]byte, l-len(buf))...)
	}
	// Zero out any bytes we just exposed from the old capacity.
	for i := lenBefore; i < len(buf); i++ {
		buf[i] = 0
	}
	return buf
}

const EGGSFS_PAGE_SIZE int = 4096
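
// prepareSpanInitiateReq computes the stripe/cell geometry for a span, pads
// the data so that it divides evenly into cells, computes the per-cell CRCs
// (and the RS parity cells, where applicable), and builds the
// AddSpanInitiateReq.
//
// Worked example (illustrative numbers): with len(data) = 1,000,000,
// TargetStripeSize = 256KiB and parity (10,4), we get S = 4 stripes and
// D = 10 data blocks, so blockSize = ceil(1,000,000/10) = 100,000 and
// cellSize = ceil(100,000/4) = 25,000, rounded up to 7 pages = 28,672.
// The padded buffer is then S*D*cellSize = 1,146,880 bytes, and each of the
// B = 14 blocks stores S*cellSize = 114,688 bytes.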
func prepareSpanInitiateReq(
	blacklist []msgs.BlockServiceId,
	spanPolicies *msgs.SpanPolicy,
	blockPolicies *msgs.BlockPolicy,
	stripePolicy *msgs.StripePolicy,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	sizeWithZeros uint32,
	data []byte,
) ([]byte, *msgs.AddSpanInitiateReq) {
	if int(sizeWithZeros) < len(data) {
		panic(fmt.Errorf("sizeWithZeros=%v < len(data)=%v", sizeWithZeros, len(data)))
	}

	crc := crc32c.Sum(0, data)
	crc = crc32c.ZeroExtend(crc, int(sizeWithZeros)-len(data))

	// Compute all the size parameters. We use TargetStripeSize as an upper
	// bound: stripes will be at most TargetStripeSize, unless the cap of 15
	// stripes forces them to be bigger.
	S := (len(data) + int(stripePolicy.TargetStripeSize) - 1) / int(stripePolicy.TargetStripeSize)
	if S > 15 {
		S = 15
	}
	spanPolicy := spanPolicies.Pick(uint32(len(data)))
	D := spanPolicy.Parity.DataBlocks()
	P := spanPolicy.Parity.ParityBlocks()
	B := spanPolicy.Parity.Blocks()
	blockSize := (len(data) + D - 1) / D
	cellSize := (blockSize + S - 1) / S
	// Round up the cell size to the page size
	cellSize = EGGSFS_PAGE_SIZE * ((cellSize + EGGSFS_PAGE_SIZE - 1) / EGGSFS_PAGE_SIZE)
	blockSize = cellSize * S
	storageClass := blockPolicies.Pick(uint32(blockSize)).StorageClass

	// Pad the data with zeros
	data = ensureLen(data, S*D*cellSize)

	initiateReq := msgs.AddSpanInitiateReq{
		FileId:       id,
		Cookie:       cookie,
		ByteOffset:   offset,
		Size:         sizeWithZeros,
		Crc:          msgs.Crc(crc),
		StorageClass: storageClass,
		Blacklist:    blacklist,
		Parity:       spanPolicy.Parity,
		Stripes:      uint8(S),
		CellSize:     uint32(cellSize),
		Crcs:         make([]msgs.Crc, B*S),
	}

	if D == 1 { // mirroring
		for s := 0; s < S; s++ {
			crc := msgs.Crc(crc32c.Sum(0, data[s*cellSize:(s+1)*cellSize]))
			for b := 0; b < B; b++ {
				initiateReq.Crcs[B*s+b] = crc
			}
		}
	} else { // RS
		// Make space for the parity blocks after the data blocks
		data = ensureLen(data, blockSize*B)
		rs := rs.Get(spanPolicy.Parity)
		dataSrcs := make([][]byte, D)
		parityDests := make([][]byte, P)
		for s := 0; s < S; s++ {
			// Compute CRCs for data blocks, and store their offsets
			for d := 0; d < D; d++ {
				dataStart := D*cellSize*s + cellSize*d
				dataEnd := D*cellSize*s + cellSize*(d+1)
				dataSrcs[d] = data[dataStart:dataEnd]
				initiateReq.Crcs[B*s+d] = msgs.Crc(crc32c.Sum(0, dataSrcs[d]))
			}
			// Generate parity
			for p := 0; p < P; p++ {
				dataStart := S*D*cellSize + P*cellSize*s + cellSize*p
				dataEnd := S*D*cellSize + P*cellSize*s + cellSize*(p+1)
				parityDests[p] = data[dataStart:dataEnd]
			}
			rs.ComputeParityInto(dataSrcs, parityDests)
			// Compute the parity CRCs
			for p := 0; p < P; p++ {
				initiateReq.Crcs[B*s+(D+p)] = msgs.Crc(crc32c.Sum(0, parityDests[p]))
			}
		}
	}

	return data, &initiateReq
}
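
// mkBlockReader returns the CRC and the contents of block `block` out of the
// padded span buffer produced by prepareSpanInitiateReq. The block CRC is
// composed from the per-cell CRCs with crc32c.Append, so the data doesn't
// have to be rescanned.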
func mkBlockReader(
	req *msgs.AddSpanInitiateReq,
	data []byte,
	block int,
) (msgs.Crc, io.Reader) {
	D := req.Parity.DataBlocks()
	P := req.Parity.ParityBlocks()
	B := req.Parity.Blocks()
	S := int(req.Stripes)
	cellSize := int(req.CellSize)
	blockCrc := uint32(0)
	for s := 0; s < S; s++ {
		blockCrc = crc32c.Append(blockCrc, uint32(req.Crcs[B*s+block]), cellSize)
	}
	if D == 1 {
		// mirroring, we only have one block
		return msgs.Crc(blockCrc), bytes.NewReader(data)
	} else if block < D {
		// data block, first section of the blob
		r := &blockReader{
			cells:    S,
			cellSize: cellSize,
			stride:   cellSize * D,
			data:     data[block*cellSize:],
			cursor:   0,
		}
		return msgs.Crc(blockCrc), r
	} else {
		// parity block, second section of the blob
		r := &blockReader{
			cells:    S,
			cellSize: cellSize,
			stride:   cellSize * P,
			data:     data[D*S*cellSize+(block-D)*cellSize:],
			cursor:   0,
		}
		return msgs.Crc(blockCrc), r
	}
}

// CreateSpan appends a new span to a given file.
//
// Note: the buffer underlying `data` might be extended and zero-padded for
// the purpose of splitting things into blocks/stripes. The (possibly grown)
// buffer is returned, regardless of whether the error is nil or not.
func (c *Client) CreateSpan(
	log *Logger,
	blacklist []msgs.BlockServiceId,
	spanPolicies *msgs.SpanPolicy,
	blockPolicies *msgs.BlockPolicy,
	stripePolicy *msgs.StripePolicy,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	// The span size might be greater than `len(data)`, in which case we have
	// trailing zeros (this allows us to cheaply store zero sections).
	spanSize uint32,
	// This function might append to this buffer and write past its original
	// length, but it never modifies the bytes already in it. The new buffer
	// is returned, so the caller can keep using the new, larger buffer for
	// subsequent spans.
	//
	// Note that the new buffer is returned even if an error is returned.
	data []byte,
) ([]byte, error) {
	if len(data) < 256 {
		if err := c.createInlineSpan(log, id, cookie, offset, spanSize, data); err != nil {
			return data, err
		}
		return data, nil
	}

	var initiateReq *msgs.AddSpanInitiateReq
	data, initiateReq = prepareSpanInitiateReq(blacklist, spanPolicies, blockPolicies, stripePolicy, id, cookie, offset, spanSize, data)
	{
		// Convert to float64 before dividing, so that the blocks/data ratio
		// isn't truncated by integer division.
		expectedSize := float64(spanSize) * float64(initiateReq.Parity.Blocks()) / float64(initiateReq.Parity.DataBlocks())
		actualSize := initiateReq.CellSize * uint32(initiateReq.Stripes) * uint32(initiateReq.Parity.Blocks())
		log.Debug("span logical size: %v, span physical size: %v, waste: %v%%", spanSize, actualSize, 100.0*(float64(actualSize)-expectedSize)/float64(actualSize))
	}

	initiateResp := msgs.AddSpanInitiateResp{}
	if err := c.ShardRequest(log, id.Shard(), initiateReq, &initiateResp); err != nil {
		return data, err
	}

	certifyReq := msgs.AddSpanCertifyReq{
		FileId:     id,
		Cookie:     cookie,
		ByteOffset: offset,
		Proofs:     make([]msgs.BlockProof, len(initiateResp.Blocks)),
	}
	for i, block := range initiateResp.Blocks {
		conn, err := c.GetBlocksConn(log, block.BlockServiceId, block.BlockServiceIp1, block.BlockServicePort1, block.BlockServiceIp2, block.BlockServicePort2)
		if err != nil {
			return data, err
		}
		blockCrc, blockReader := mkBlockReader(initiateReq, data, i)
		proof, err := WriteBlock(log, conn, &block, blockReader, initiateReq.CellSize*uint32(initiateReq.Stripes), blockCrc)
		conn.Close()
		if err != nil {
			return data, fmt.Errorf("writing block failed with error %v", err)
		}
		certifyReq.Proofs[i].BlockId = block.BlockId
		certifyReq.Proofs[i].Proof = proof
	}
	if err := c.ShardRequest(log, id.Shard(), &certifyReq, &msgs.AddSpanCertifyResp{}); err != nil {
		return data, err
	}

	return data, nil
}
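
// CreateFile creates a file at `path` (which must be absolute) with the
// contents of `r`, writing it span by span according to the parent
// directory's policies, and linking it into the directory only once all
// spans have been written.
//
// A minimal usage sketch (client, logger, and cache are hypothetical):
//
//	id, err := client.CreateFile(log, dirInfoCache, "/some/dir/file", bytes.NewReader(contents))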
func (c *Client) CreateFile(
	log *Logger,
	dirInfoCache *DirInfoCache,
	path string, // must be absolute
	r io.Reader,
) (msgs.InodeId, error) {
	if len(path) == 0 || path[0] != '/' {
		return 0, fmt.Errorf("non-absolute file path %v", path)
	}
	dirPath := filepath.Dir(path)
	fileName := filepath.Base(path)
	if fileName == dirPath {
		return 0, fmt.Errorf("bad file path %v", path)
	}
	dirId, err := c.ResolvePath(log, dirPath)
	if err != nil {
		return 0, err
	}
	spanPolicies := msgs.SpanPolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &spanPolicies); err != nil {
		return 0, err
	}
	blockPolicies := msgs.BlockPolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &blockPolicies); err != nil {
		return 0, err
	}
	stripePolicy := msgs.StripePolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &stripePolicy); err != nil {
		return 0, err
	}
	fileResp := msgs.ConstructFileResp{}
	if err := c.ShardRequest(log, dirId.Shard(), &msgs.ConstructFileReq{Type: msgs.FILE}, &fileResp); err != nil {
		return 0, err
	}
	fileId := fileResp.Id
	cookie := fileResp.Cookie
	maxSpanSize := spanPolicies.Entries[len(spanPolicies.Entries)-1].MaxSize
	spanBuf := make([]byte, maxSpanSize)
	offset := uint64(0)
	for {
		spanBuf = spanBuf[:maxSpanSize]
		read, err := io.ReadFull(r, spanBuf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return 0, err
		}
		if err == io.EOF {
			break
		}
		spanBuf, err = c.CreateSpan(
			log, []msgs.BlockServiceId{}, &spanPolicies, &blockPolicies, &stripePolicy,
			fileId, cookie, offset, uint32(read), spanBuf[:read],
		)
		if err != nil {
			return 0, err
		}
		offset += uint64(read)
		if read < int(maxSpanSize) {
			break
		}
	}
	if err := c.ShardRequest(log, dirId.Shard(), &msgs.LinkFileReq{FileId: fileId, Cookie: cookie, OwnerId: dirId, Name: fileName}, &msgs.LinkFileResp{}); err != nil {
		return 0, err
	}
	return fileId, nil
}
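
// mirroredSpanReader reads a mirrored span (D == 1) one cell at a time from
// a single block connection, failing over to another mirror whenever a
// cell's CRC doesn't check out.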
type mirroredSpanReader struct {
	cursor    int
	block     int
	blockConn PuttableReadCloser
	cellBuf   *[]byte
	cellCrcs  []msgs.Crc // starting from the _next_ cell CRC
}

func (r *mirroredSpanReader) Close() error {
	return r.blockConn.Close()
}

func (r *mirroredSpanReader) Put() {
	r.blockConn.Put()
}
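
// rsNormalSpanReader reads an RS span stripe by stripe on the happy path,
// streaming D blocks (preferring the data blocks) and recovering the data
// blocks we couldn't connect to from the parity cells fetched in their
// stead.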
type rsNormalSpanReader struct {
	bufPool           *ReadSpanBufPool
	cursor            int
	haveBlocks        []uint8 // which blocks we are fetching. Most of the time it'll just be the data blocks
	blockConns        []PuttableReadCloser
	blocksRunningCrcs []msgs.Crc
	stripeBuf         *[]byte
	stripeCrcs        []msgs.Crc // starting from the _next_ stripe CRC
	parityBuffers     []*[]byte  // buffers in which to temporarily place the parity cells. Usually empty
}

func (sr *rsNormalSpanReader) Close() error {
	sr.bufPool.Put(sr.stripeBuf)
	for _, b := range sr.parityBuffers {
		sr.bufPool.Put(b)
	}
	var lastErr error
	for _, c := range sr.blockConns {
		if err := c.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "got error when closing connection: %v\n", err)
			lastErr = err
		}
	}
	return lastErr
}

func (sr *rsNormalSpanReader) Put() {
	for _, c := range sr.blockConns {
		c.Put()
	}
}

// rsCorruptedSpanReader takes over when we actively detect a bad CRC and
// have no choice but to load the remainder of the span in its entirety,
// find out which blocks are broken, and then resume.
type rsCorruptedSpanReader struct {
	bufPool     *ReadSpanBufPool
	startStripe int // at which stripe we switched to this reader
	cursor      int
	dataBlocks  []*[]byte
}

func (r *rsCorruptedSpanReader) Close() error {
	for _, b := range r.dataBlocks {
		r.bufPool.Put(b)
	}
	return nil
}

func (*rsCorruptedSpanReader) Put() {}

type spanReader struct {
	bufPool    *ReadSpanBufPool
	spanSize   uint32
	spanCrc    msgs.Crc
	readBytes  uint32
	runningCrc msgs.Crc
	parity     rs.Parity
	stripes    uint8
	cellSize   uint32
	blocksCrcs []msgs.Crc
	blockConn  func(block int, offset uint32, size uint32) (PuttableReadCloser, error)
	r          PuttableCloser
}

func (sr *spanReader) Close() error {
	return sr.r.Close()
}

func (sr *spanReader) Put() {
	sr.r.Put()
}
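
// repairCorruptedStripe is called when a freshly assembled stripe fails its
// CRC check. It finishes downloading the blocks we already have connections
// to and checks their full-block CRCs to sort them into good and bad ones,
// downloading additional blocks until D known-good blocks are available.
// It then RS-recovers the bad data blocks and returns an
// rsCorruptedSpanReader which serves the rest of the span from the repaired
// buffers.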
func (sr *spanReader) repairCorruptedStripe(
	bufPool *ReadSpanBufPool,
	// the broken stripe
	stripe int,
	stripeData *[]byte,
	parityData []*[]byte,
	haveBlocks []uint8, // the blocks we have been using so far
	haveBlocksCrc []msgs.Crc, // the running CRCs so far for the blocks that we have
	haveBlocksConns []PuttableReadCloser, // the connections to the blocks we already have
) (*rsCorruptedSpanReader, error) {
	D := sr.parity.DataBlocks()
	B := sr.parity.Blocks()
	blockStart := stripe * int(sr.cellSize)
	remainingBlockSize := (int(sr.stripes) - stripe) * int(sr.cellSize)
	goodBlocks := make([]bool, B) // _known_ good blocks
	badBlocks := make([]bool, B)  // _known_ bad blocks
	blocksData := make([]*[]byte, B)
	defer func() {
		for i := D; i < B; i++ {
			b := blocksData[i]
			if b != nil {
				bufPool.Put(b)
			}
		}
	}()
	// make space for full sized remaining data blocks
	for b := 0; b < D; b++ {
		blocksData[b] = bufPool.Get(remainingBlockSize)
	}
	// load the current stripe data into the blocks buffers
	{
		parityIx := 0
		for _, b := range haveBlocks {
			if int(b) < D {
				copy(*blocksData[int(b)], (*stripeData)[int(b)*int(sr.cellSize):(int(b)+1)*int(sr.cellSize)])
			} else {
				blocksData[b] = bufPool.Get(remainingBlockSize)
				copy(*blocksData[int(b)], *parityData[parityIx])
				parityIx++
			}
		}
	}
	// load the _rest_ of the blocks, and check which ones are good
	numGoodBlocks := 0
	// The first cell of each block (the broken stripe itself) is already in
	// place, so every cursor starts at cellSize.
	cursors := make([]int, len(haveBlocks))
	for i := range cursors {
		cursors[i] = int(sr.cellSize)
	}
	checked := make([]bool, len(haveBlocks))
	for {
		allDone := true
		for i, b := range haveBlocks {
			if checked[i] { // already fully loaded and classified
				continue
			}
			if cursors[i] < len(*blocksData[int(b)]) {
				read, err := haveBlocksConns[i].Read((*blocksData[int(b)])[cursors[i]:])
				if err != io.EOF && err != nil {
					return nil, err
				}
				haveBlocksCrc[i] = msgs.Crc(crc32c.Sum(uint32(haveBlocksCrc[i]), (*blocksData[int(b)])[cursors[i]:cursors[i]+read]))
				cursors[i] += read
			}
			if cursors[i] < len(*blocksData[int(b)]) {
				allDone = false
			} else if haveBlocksCrc[i] != sr.blocksCrcs[b] {
				checked[i] = true
				badBlocks[int(b)] = true
			} else {
				checked[i] = true
				goodBlocks[int(b)] = true
				numGoodBlocks++
			}
		}
		if allDone {
			break
		}
	}
	// Find and download blocks to recover
	var tmpBuf *[]byte
	// Wrap the Put in a closure: a plain `defer bufPool.Put(tmpBuf)` would
	// capture the nil value tmpBuf has right now.
	defer func() { bufPool.Put(tmpBuf) }()
	for b := 0; b < B && numGoodBlocks < D; b++ {
		if badBlocks[b] || goodBlocks[b] { // we already know about this one
			continue
		}
		// We can try to download this one.
		// We need to download the entire block here, because we need to check the CRC.
		blockSize := sr.cellSize * uint32(sr.stripes)
		conn, err := sr.blockConn(b, 0, blockSize)
		if err != nil {
			return nil, err
		}
		if conn == nil {
			badBlocks[b] = true
			continue
		}
		if tmpBuf == nil {
			tmpBuf = bufPool.Get(int(blockSize))
		}
		if _, err := io.ReadFull(conn, *tmpBuf); err != nil {
			conn.Close()
			return nil, err
		}
		if err := conn.Close(); err != nil {
			return nil, err
		}
		crc := crc32c.Sum(0, *tmpBuf)
		if crc == uint32(sr.blocksCrcs[b]) {
			// this is a good one
			if blocksData[b] == nil { // data blocks were already allocated upfront
				blocksData[b] = bufPool.Get(remainingBlockSize)
			}
			copy(*blocksData[b], (*tmpBuf)[blockStart:])
			goodBlocks[b] = true
			numGoodBlocks++
		} else {
			badBlocks[b] = true
		}
	}
	if numGoodBlocks < D {
		return nil, fmt.Errorf("the number of good blocks (%v) is lower than the number of data blocks (%v)", numGoodBlocks, D)
	}
	newHaveBlocks := []uint8{}
	newHaveBlocksData := [][]byte{}
	for b := 0; b < B; b++ {
		if !goodBlocks[b] {
			continue
		}
		newHaveBlocks = append(newHaveBlocks, uint8(b))
		newHaveBlocksData = append(newHaveBlocksData, *blocksData[b])
	}
	rs := rs.Get(sr.parity)
	for b := 0; b < D; b++ {
		if goodBlocks[b] {
			continue
		}
		rs.RecoverInto(newHaveBlocks, newHaveBlocksData, uint8(b), *blocksData[b])
	}
	r := rsCorruptedSpanReader{
		bufPool:     bufPool,
		startStripe: stripe,
		cursor:      stripe * D * int(sr.cellSize),
		dataBlocks:  blocksData[:D],
	}
	return &r, nil
}
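
// loadStripe reads the next stripe into nsr.stripeBuf, recovering the data
// blocks we don't have connections for from the fetched parity cells, and
// verifies the stripe CRC. On a mismatch it switches sr over to an
// rsCorruptedSpanReader via repairCorruptedStripe.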
func (sr *spanReader) loadStripe(nsr *rsNormalSpanReader) error {
	D := sr.parity.DataBlocks()
	blocksCursors := make([]int, len(nsr.blockConns))
	needsRecover := false
	for {
		allDone := true
		parityIx := 0
		for i, b := range nsr.haveBlocks {
			var buf []byte
			if int(b) < D { // data block, read straight into the stripe buffer
				stripeStart := int(b)*int(sr.cellSize) + blocksCursors[i]
				stripeEnd := (int(b) + 1) * int(sr.cellSize)
				buf = (*nsr.stripeBuf)[stripeStart:stripeEnd]
			} else { // parity block
				needsRecover = true
				buf = (*nsr.parityBuffers[parityIx])[blocksCursors[i]:]
				parityIx++
			}
			blockRead, err := nsr.blockConns[i].Read(buf)
			if err != nil {
				return err
			}
			nsr.blocksRunningCrcs[i] = msgs.Crc(crc32c.Sum(uint32(nsr.blocksRunningCrcs[i]), buf[:blockRead]))
			blocksCursors[i] += blockRead
			if blocksCursors[i] < int(sr.cellSize) {
				allDone = false
			}
		}
		if allDone {
			break
		}
	}
	if needsRecover { // we need to recover data blocks
		rs := rs.Get(sr.parity)
		bufs := make([][]byte, D)
		parityIx := 0
		for i, b := range nsr.haveBlocks {
			if int(b) < D {
				stripeStart := int(b) * int(sr.cellSize)
				stripeEnd := (int(b) + 1) * int(sr.cellSize)
				bufs[i] = (*nsr.stripeBuf)[stripeStart:stripeEnd]
			} else {
				bufs[i] = *nsr.parityBuffers[parityIx]
				parityIx++
			}
		}
		// Recover every data block that is missing from haveBlocks.
		lastDataBlock := uint8(0)
		for i := 0; i < D; i++ {
			if int(nsr.haveBlocks[i]) >= D {
				break
			}
			for ; lastDataBlock < nsr.haveBlocks[i]; lastDataBlock++ {
				stripeStart := int(lastDataBlock) * int(sr.cellSize)
				stripeEnd := (int(lastDataBlock) + 1) * int(sr.cellSize)
				rs.RecoverInto(nsr.haveBlocks, bufs, lastDataBlock, (*nsr.stripeBuf)[stripeStart:stripeEnd])
			}
			lastDataBlock++
		}
		// Also recover any trailing data blocks for which we only have parity.
		for ; int(lastDataBlock) < D; lastDataBlock++ {
			stripeStart := int(lastDataBlock) * int(sr.cellSize)
			stripeEnd := (int(lastDataBlock) + 1) * int(sr.cellSize)
			rs.RecoverInto(nsr.haveBlocks, bufs, lastDataBlock, (*nsr.stripeBuf)[stripeStart:stripeEnd])
		}
	}
	crc := crc32c.Sum(0, *nsr.stripeBuf)
	if crc != uint32(nsr.stripeCrcs[0]) {
		var err error
		sr.r, err = sr.repairCorruptedStripe(
			sr.bufPool,
			nsr.cursor/(int(sr.cellSize)*D),
			nsr.stripeBuf,
			nsr.parityBuffers,
			nsr.haveBlocks,
			nsr.blocksRunningCrcs,
			nsr.blockConns,
		)
		nsr.Close() // close connections and return buffers
		if err != nil {
			return err
		}
		return nil
	}
	nsr.stripeCrcs = nsr.stripeCrcs[1:]
	return nil
}
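
// loadCell reads the next cell of a mirrored span into mr.cellBuf, cycling
// through the other mirrors (reconnecting at the current offset) until one
// of them yields a cell with a matching CRC.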
func (sr *spanReader) loadCell(mr *mirroredSpanReader) error {
	l := func() (bool, error) {
		cursor := 0
		for cursor < int(sr.cellSize) {
			read, err := mr.blockConn.Read((*mr.cellBuf)[cursor:])
			if err != nil {
				return false, err
			}
			cursor += read
		}
		crc := crc32c.Sum(0, *mr.cellBuf)
		good := crc == uint32(mr.cellCrcs[0])
		if !good {
			if err := mr.blockConn.Close(); err != nil {
				return false, err
			}
		}
		return good, nil
	}
	good, err := l()
	if err != nil {
		return err
	}
	// look for another block if necessary
	startingBlock := mr.block
	B := sr.parity.Blocks()
	for !good {
		mr.block = (mr.block + 1) % B
		if mr.block == startingBlock {
			break
		}
		var err error
		mr.blockConn, err = sr.blockConn(mr.block, uint32(mr.cursor), sr.cellSize*uint32(sr.stripes)-uint32(mr.cursor))
		if err != nil {
			return err
		}
		if mr.blockConn == nil {
			continue
		}
		good, err = l()
		if err != nil {
			return err
		}
	}
	if !good {
		return fmt.Errorf("could not find block without CRC errors")
	}
	mr.cellCrcs = mr.cellCrcs[1:]
	return nil
}

func (sr *spanReader) readMirrored(mr *mirroredSpanReader, p []byte) (int, error) {
	if mr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	if mr.cursor >= int(sr.stripes)*int(sr.cellSize) { // trailing zeros
		read := 0
		for mr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			mr.cursor++
		}
		return read, nil
	}
	currentCell := mr.cursor / int(sr.cellSize)
	cellPos := mr.cursor % int(sr.cellSize)
	remainingCell := (*mr.cellBuf)[cellPos:]
	if mr.cursor+len(remainingCell) > int(sr.spanSize) { // zero padding
		remainingCell = remainingCell[:int(sr.spanSize)-mr.cursor]
	}
	read := copy(p, remainingCell)
	mr.cursor += read
	if cellPos+read == len(*mr.cellBuf) && currentCell+1 < int(sr.stripes) {
		if err := sr.loadCell(mr); err != nil {
			return read, err
		}
	}
	return read, nil
}

func (sr *spanReader) readRsHappy(nsr *rsNormalSpanReader, p []byte) (int, error) {
	if nsr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	D := sr.parity.DataBlocks()
	if nsr.cursor >= int(sr.stripes)*D*int(sr.cellSize) { // trailing zeros
		read := 0
		for nsr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			nsr.cursor++
		}
		return read, nil
	}
	currentStripe := nsr.cursor / (D * int(sr.cellSize))
	stripePos := nsr.cursor % (D * int(sr.cellSize))
	remainingStripe := (*nsr.stripeBuf)[stripePos:]
	if nsr.cursor+len(remainingStripe) > int(sr.spanSize) { // zero padding
		remainingStripe = remainingStripe[:int(sr.spanSize)-nsr.cursor]
	}
	read := copy(p, remainingStripe)
	nsr.cursor += read
	if stripePos+read == len(*nsr.stripeBuf) && currentStripe+1 < int(sr.stripes) {
		if err := sr.loadStripe(nsr); err != nil {
			return read, err
		}
	}
	return read, nil
}

func (sr *spanReader) readRsCorrupted(wsr *rsCorruptedSpanReader, p []byte) (int, error) {
	D := sr.parity.DataBlocks()
	if wsr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	if wsr.cursor >= int(sr.stripes)*D*int(sr.cellSize) { // trailing zeros
		read := 0
		for wsr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			wsr.cursor++
		}
		return read, nil
	}
	currentStripe := wsr.cursor / (D * int(sr.cellSize))
	stripePos := wsr.cursor % (D * int(sr.cellSize))
	currentBlock := stripePos / int(sr.cellSize)
	cellPos := stripePos % int(sr.cellSize)
	remainingCell := (*wsr.dataBlocks[currentBlock])[int(sr.cellSize)*(currentStripe-wsr.startStripe)+cellPos : int(sr.cellSize)*(currentStripe-wsr.startStripe+1)]
	if wsr.cursor+len(remainingCell) > int(sr.spanSize) { // zero padding
		remainingCell = remainingCell[:int(sr.spanSize)-wsr.cursor]
	}
	read := copy(p, remainingCell)
	wsr.cursor += read
	return read, nil
}

func (sr *spanReader) Read(p []byte) (int, error) {
	var read int
	var err error
	switch r := sr.r.(type) {
	case *mirroredSpanReader:
		read, err = sr.readMirrored(r, p)
	case *rsNormalSpanReader:
		read, err = sr.readRsHappy(r, p)
	case *rsCorruptedSpanReader:
		read, err = sr.readRsCorrupted(r, p)
	default:
		panic(fmt.Errorf("bad reader %T", sr.r))
	}
	sr.readBytes += uint32(read)
	if sr.readBytes > sr.spanSize {
		panic(fmt.Errorf("read beyond end of span -- %v vs %v", sr.readBytes, sr.spanSize))
	}
	sr.runningCrc = msgs.Crc(crc32c.Sum(uint32(sr.runningCrc), p[:read]))
	if sr.readBytes == sr.spanSize && sr.runningCrc != sr.spanCrc {
		panic(fmt.Errorf("span contents CRC is not what we expect -- %v vs %v", sr.runningCrc, sr.spanCrc))
	}
	return read, err
}

// readSpanFromBlocks, given a way to start streaming a block, produces a
// stream with the span contents. It will automatically repair the span if
// CRC errors are detected.
func readSpanFromBlocks(
	bufPool *ReadSpanBufPool,
	spanSize uint32,
	spanCrc msgs.Crc,
	parity rs.Parity,
	stripes uint8,
	cellSize uint32,
	blockCrcs []msgs.Crc,
	stripesCrc []msgs.Crc,
	// If this function returns `nil, nil`, it means that the block service
	// is currently not available, for whatever reason.
	//
	// We currently make the assumption that the connections that are available
	// at the beginning will be available throughout the duration of span reading.
	blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error),
) (PuttableReadCloser, error) {
	D := parity.DataBlocks()
	B := parity.Blocks()
	sr := spanReader{
		bufPool:    bufPool,
		spanSize:   spanSize,
		spanCrc:    spanCrc,
		stripes:    stripes,
		cellSize:   cellSize,
		parity:     parity,
		blockConn:  blockConn,
		blocksCrcs: blockCrcs,
	}
	if D == 1 { // mirroring: any single block will do
		mr := &mirroredSpanReader{
			cursor:   0,
			cellBuf:  sr.bufPool.Get(int(cellSize)),
			cellCrcs: stripesCrc,
		}
		for b := 0; b < B; b++ {
			conn, err := blockConn(b, 0, cellSize*uint32(stripes))
			if err != nil {
				return nil, err
			}
			if conn != nil {
				mr.blockConn = conn
				mr.block = b
				break
			}
		}
		if mr.blockConn == nil {
			return nil, fmt.Errorf("couldn't get block connection to any of the blocks")
		}
		sr.r = mr
		if err := sr.loadCell(mr); err != nil {
			return nil, err
		}
	} else { // RS: we need connections to D distinct blocks
		conns := make([]PuttableReadCloser, 0)
		haveBlocks := make([]uint8, 0)
		parityBuffers := []*[]byte{}
		for i := 0; i < B; i++ {
			conn, err := blockConn(i, 0, cellSize*uint32(stripes))
			if err != nil {
				return nil, err
			}
			if conn == nil {
				continue
			}
			conns = append(conns, conn)
			haveBlocks = append(haveBlocks, uint8(i))
			if i >= D {
				parityBuffers = append(parityBuffers, sr.bufPool.Get(int(cellSize)))
			}
			if len(haveBlocks) == D {
				break
			}
		}
		if len(haveBlocks) != D {
			return nil, fmt.Errorf("couldn't get enough block connections (need at least %v, got %v)", D, len(conns))
		}
		stripeBuf := sr.bufPool.Get(int(cellSize) * D)
		rsr := &rsNormalSpanReader{
			bufPool:           sr.bufPool,
			cursor:            0,
			blockConns:        conns,
			haveBlocks:        haveBlocks,
			stripeBuf:         stripeBuf,
			stripeCrcs:        stripesCrc,
			blocksRunningCrcs: make([]msgs.Crc, D),
			parityBuffers:     parityBuffers,
		}
		sr.r = rsr
		if err := sr.loadStripe(rsr); err != nil {
			return nil, err
		}
	}
	return &sr, nil
}

type inlineSpanReader struct {
	size   int
	cursor int
	data   []byte
}

func (r *inlineSpanReader) Read(p []byte) (int, error) {
	if r.cursor >= r.size {
		return 0, io.EOF
	}
	if r.cursor >= len(r.data) {
		read := 0
		for r.cursor < r.size && read < len(p) {
			p[read] = 0
			read++
			r.cursor++
		}
		return read, nil
	}
	read := copy(p, r.data[r.cursor:])
	r.cursor += read
	return read, nil
}

func (*inlineSpanReader) Close() error {
	return nil
}

func (*inlineSpanReader) Put() {}

type ReadSpanBufPool struct {
	pool sync.Pool
}

func NewReadSpanBufPool() *ReadSpanBufPool {
	pool := ReadSpanBufPool{
		pool: sync.Pool{
			New: func() any {
				buf := []byte{}
				return &buf
			},
		},
	}
	return &pool
}

// This does _not_ zero the memory in the returned buffer -- i.e. there
// might be garbage in it.
func (pool *ReadSpanBufPool) Get(l int) *[]byte {
	buf := pool.pool.Get().(*[]byte)
	if cap(*buf) >= l {
		*buf = (*buf)[:l]
	} else {
		*buf = (*buf)[:cap(*buf)]
		*buf = append(*buf, make([]byte, l-len(*buf))...)
	}
	return buf
}

func (pool *ReadSpanBufPool) Put(buf *[]byte) {
	if buf != nil {
		pool.pool.Put(buf)
	}
}
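
// ReadSpan returns a reader for the contents of a single fetched span.
// Inline spans are served straight from the metadata body; block spans are
// streamed from the block services, skipping blacklisted ones.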
func (c *Client) ReadSpan(
	log *Logger,
	bufPool *ReadSpanBufPool,
	blacklist []msgs.BlockServiceId,
	blockServices []msgs.BlockService,
	fetchedSpan *msgs.FetchedSpan,
) (PuttableReadCloser, error) {
	if fetchedSpan.Header.StorageClass == msgs.INLINE_STORAGE {
		data := fetchedSpan.Body.(*msgs.FetchedInlineSpan).Body
		dataCrc := msgs.Crc(crc32c.Sum(0, data))
		if dataCrc != fetchedSpan.Header.Crc {
			panic(fmt.Errorf("header CRC for inline span is %v, but data is %v", fetchedSpan.Header.Crc, dataCrc))
		}
		isr := inlineSpanReader{
			size:   int(fetchedSpan.Header.Size),
			cursor: 0,
			data:   data,
		}
		return &isr, nil
	}

	body := fetchedSpan.Body.(*msgs.FetchedBlocksSpan)
	log.DebugStack(1, "span parity %v", body.Parity)
	blocksCrcs := make([]msgs.Crc, body.Parity.Blocks())
	for i := range blocksCrcs {
		blocksCrcs[i] = body.Blocks[i].Crc
	}
	blockConn := func(blockIx int, offset uint32, size uint32) (PuttableReadCloser, error) {
		log.DebugStack(1, "requested connection for block ix %v, offset %v, size %v", blockIx, offset, size)
		block := body.Blocks[blockIx]
		blockService := blockServices[block.BlockServiceIx]
		for _, blacklisted := range blacklist {
			if blockService.Id == blacklisted {
				return nil, nil
			}
		}
		conn, err := c.GetBlocksConn(log, blockService.Id, blockService.Ip1, blockService.Port1, blockService.Ip2, blockService.Port2)
		if err != nil {
			return nil, err
		}
		if err := FetchBlock(log, conn, &blockService, block.BlockId, offset, size); err != nil {
			conn.Close()
			return nil, err
		}
		return conn, nil
	}
	return readSpanFromBlocks(bufPool, fetchedSpan.Header.Size, fetchedSpan.Header.Crc, body.Parity, body.Stripes, body.CellSize, blocksCrcs, body.StripesCrc, blockConn)
}
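
// fileReader streams the contents of a whole file, fetching its spans from
// the shard in batches and reading them one at a time.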
type fileReader struct {
	client     *Client
	log        *Logger
	bufPool    *ReadSpanBufPool
	blacklist  []msgs.BlockServiceId
	fileId     msgs.InodeId
	spansResp  msgs.FileSpansResp
	spanReader io.ReadCloser
}

func (f *fileReader) Close() error {
	if f.spanReader != nil {
		return f.spanReader.Close()
	}
	return nil
}

func (f *fileReader) loadNextSpanAndRead(p []byte) (int, error) {
	if len(f.spansResp.Spans) == 0 { // no remaining spans
		if f.spansResp.NextOffset == 0 { // no remaining spans, and no next batch of spans available
			return 0, io.EOF
		} else { // request the next batch of spans and try again
			req := msgs.FileSpansReq{FileId: f.fileId, ByteOffset: f.spansResp.NextOffset}
			if err := f.client.ShardRequest(f.log, f.fileId.Shard(), &req, &f.spansResp); err != nil {
				return 0, err
			}
			return f.Read(p)
		}
	} else { // load the next span
		if f.spanReader != nil {
			if err := f.spanReader.Close(); err != nil {
				return 0, err
			}
		}
		var err error
		f.spanReader, err = f.client.ReadSpan(f.log, f.bufPool, f.blacklist, f.spansResp.BlockServices, &f.spansResp.Spans[0])
		if err != nil {
			return 0, err
		}
		f.spansResp.Spans = f.spansResp.Spans[1:]
		return f.Read(p)
	}
}

func (f *fileReader) Read(p []byte) (int, error) {
	if f.spanReader == nil {
		return f.loadNextSpanAndRead(p)
	}
	spanRead, err := f.spanReader.Read(p)
	if err == io.EOF {
		return f.loadNextSpanAndRead(p)
	}
	return spanRead, err
}
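
// ReadFile returns a reader over the whole contents of file `id`. The
// caller is responsible for closing it.
//
// A minimal usage sketch (client, logger, and file id are hypothetical):
//
//	r, err := client.ReadFile(log, NewReadSpanBufPool(), nil, fileId)
//	if err != nil {
//		return err
//	}
//	defer r.Close()
//	contents, err := io.ReadAll(r)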
func (c *Client) ReadFile(
	log *Logger,
	bufPool *ReadSpanBufPool,
	blacklist []msgs.BlockServiceId,
	id msgs.InodeId,
) (io.ReadCloser, error) {
	r := &fileReader{
		client:    c,
		log:       log,
		bufPool:   bufPool,
		blacklist: blacklist,
		fileId:    id,
	}
	req := msgs.FileSpansReq{FileId: r.fileId, ByteOffset: 0}
	if err := r.client.ShardRequest(r.log, r.fileId.Shard(), &req, &r.spansResp); err != nil {
		return nil, err
	}
	return r, nil
}