package lib

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"

	"xtx/eggsfs/crc32c"
	"xtx/eggsfs/msgs"
	"xtx/eggsfs/rs"
)
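
// blockReader exposes a single block of a span as an io.Reader. The block's
// cells are interleaved with those of the other blocks in the underlying
// span buffer, so consecutive cells of this block live `stride` bytes apart.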
type blockReader struct {
	cells    int
	cellSize int
	stride   int
	data     []byte
	cursor   int
}

func (r *blockReader) Read(p []byte) (int, error) {
	if r.cursor >= r.cells*r.cellSize {
		return 0, io.EOF
	}
	cell := r.cursor / r.cellSize
	cellCursor := r.cursor % r.cellSize
	read := copy(p, r.data[cell*r.stride+cellCursor:cell*r.stride+r.cellSize])
	r.cursor += read
	return read, nil
}
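
// createInlineSpan stores a small span directly in the shard metadata rather
// than in block services. The span may be logically longer than `data`, in
// which case the CRC is extended over the implicit trailing zeros.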
func (c *Client) createInlineSpan(
	log *Logger,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	sizeWithZeros uint32,
	data []byte,
) error {
	if int(sizeWithZeros) < len(data) {
		panic(fmt.Errorf("sizeWithZeros=%v < len(data)=%v", sizeWithZeros, len(data)))
	}
	crc := crc32c.Sum(0, data)
	crc = crc32c.ZeroExtend(crc, int(sizeWithZeros)-len(data))
	req := msgs.AddInlineSpanReq{
		FileId:       id,
		Cookie:       cookie,
		StorageClass: msgs.INLINE_STORAGE,
		ByteOffset:   offset,
		Size:         sizeWithZeros,
		Body:         data,
		Crc:          msgs.Crc(crc),
	}
	if err := c.ShardRequest(log, id.Shard(), &req, &msgs.AddInlineSpanResp{}); err != nil {
		return err
	}
	return nil
}
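
// ensureLen grows (or shrinks) buf to length l, reusing its capacity where
// possible, and zeroes any newly exposed bytes.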
func ensureLen(buf []byte, l int) []byte {
	lenBefore := len(buf)
	if l <= cap(buf) {
		buf = buf[:l]
	} else {
		buf = buf[:cap(buf)]
		buf = append(buf, make([]byte, l-len(buf))...)
	}
	// Zero out any bytes we just exposed from the old capacity.
	for i := lenBefore; i < len(buf); i++ {
		buf[i] = 0
	}
	return buf
}

const EGGSFS_PAGE_SIZE int = 4096
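
// prepareSpanInitiateReq computes the stripe/cell geometry for a span, pads
// the data so that it divides evenly into cells, computes the per-cell CRCs
// (and the RS parity cells, where applicable), and builds the
// AddSpanInitiateReq.
//
// Worked example (illustrative numbers): with len(data) = 1,000,000,
// TargetStripeSize = 256KiB and parity (10,4), we get S = 4 stripes and
// D = 10 data blocks, so blockSize = ceil(1,000,000/10) = 100,000 and
// cellSize = ceil(100,000/4) = 25,000, rounded up to 7 pages = 28,672.
// The padded buffer is then S*D*cellSize = 1,146,880 bytes, and each of the
// B = 14 blocks stores S*cellSize = 114,688 bytes.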
func prepareSpanInitiateReq(
	blacklist []msgs.BlockServiceId,
	spanPolicies *msgs.SpanPolicy,
	blockPolicies *msgs.BlockPolicy,
	stripePolicy *msgs.StripePolicy,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	sizeWithZeros uint32,
	data []byte,
) ([]byte, *msgs.AddSpanInitiateReq) {
	if int(sizeWithZeros) < len(data) {
		panic(fmt.Errorf("sizeWithZeros=%v < len(data)=%v", sizeWithZeros, len(data)))
	}

	crc := crc32c.Sum(0, data)
	crc = crc32c.ZeroExtend(crc, int(sizeWithZeros)-len(data))

	// Compute all the size parameters. We use TargetStripeSize as an upper
	// bound: stripes will be at most TargetStripeSize, unless the cap of 15
	// stripes forces them to be bigger.
	S := (len(data) + int(stripePolicy.TargetStripeSize) - 1) / int(stripePolicy.TargetStripeSize)
	if S > 15 {
		S = 15
	}
	spanPolicy := spanPolicies.Pick(uint32(len(data)))
	D := spanPolicy.Parity.DataBlocks()
	P := spanPolicy.Parity.ParityBlocks()
	B := spanPolicy.Parity.Blocks()
	blockSize := (len(data) + D - 1) / D
	cellSize := (blockSize + S - 1) / S
	// Round up the cell size to the page size
	cellSize = EGGSFS_PAGE_SIZE * ((cellSize + EGGSFS_PAGE_SIZE - 1) / EGGSFS_PAGE_SIZE)
	blockSize = cellSize * S
	storageClass := blockPolicies.Pick(uint32(blockSize)).StorageClass

	// Pad the data with zeros
	data = ensureLen(data, S*D*cellSize)

	initiateReq := msgs.AddSpanInitiateReq{
		FileId:       id,
		Cookie:       cookie,
		ByteOffset:   offset,
		Size:         sizeWithZeros,
		Crc:          msgs.Crc(crc),
		StorageClass: storageClass,
		Blacklist:    blacklist,
		Parity:       spanPolicy.Parity,
		Stripes:      uint8(S),
		CellSize:     uint32(cellSize),
		Crcs:         make([]msgs.Crc, B*S),
	}

	if D == 1 { // mirroring
		for s := 0; s < S; s++ {
			crc := msgs.Crc(crc32c.Sum(0, data[s*cellSize:(s+1)*cellSize]))
			for b := 0; b < B; b++ {
				initiateReq.Crcs[B*s+b] = crc
			}
		}
	} else { // RS
		// Make space for the parity blocks after the data blocks
		data = ensureLen(data, blockSize*B)
		rs := rs.Get(spanPolicy.Parity)
		dataSrcs := make([][]byte, D)
		parityDests := make([][]byte, P)
		for s := 0; s < S; s++ {
			// Compute CRCs for data blocks, and store their offsets
			for d := 0; d < D; d++ {
				dataStart := D*cellSize*s + cellSize*d
				dataEnd := D*cellSize*s + cellSize*(d+1)
				dataSrcs[d] = data[dataStart:dataEnd]
				initiateReq.Crcs[B*s+d] = msgs.Crc(crc32c.Sum(0, dataSrcs[d]))
			}
			// Generate parity
			for p := 0; p < P; p++ {
				dataStart := S*D*cellSize + P*cellSize*s + cellSize*p
				dataEnd := S*D*cellSize + P*cellSize*s + cellSize*(p+1)
				parityDests[p] = data[dataStart:dataEnd]
			}
			rs.ComputeParityInto(dataSrcs, parityDests)
			// Compute the parity CRCs
			for p := 0; p < P; p++ {
				initiateReq.Crcs[B*s+(D+p)] = msgs.Crc(crc32c.Sum(0, parityDests[p]))
			}
		}
	}

	return data, &initiateReq
}
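
// mkBlockReader returns the CRC and the contents of block `block` out of the
// padded span buffer produced by prepareSpanInitiateReq. The block CRC is
// composed from the per-cell CRCs with crc32c.Append, so the data doesn't
// have to be rescanned.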
func mkBlockReader(
	req *msgs.AddSpanInitiateReq,
	data []byte,
	block int,
) (msgs.Crc, io.Reader) {
	D := req.Parity.DataBlocks()
	P := req.Parity.ParityBlocks()
	B := req.Parity.Blocks()
	S := int(req.Stripes)
	cellSize := int(req.CellSize)
	blockCrc := uint32(0)
	for s := 0; s < S; s++ {
		blockCrc = crc32c.Append(blockCrc, uint32(req.Crcs[B*s+block]), cellSize)
	}
	if D == 1 {
		// mirroring, we only have one block
		return msgs.Crc(blockCrc), bytes.NewReader(data)
	} else if block < D {
		// data block, first section of the blob
		r := &blockReader{
			cells:    S,
			cellSize: cellSize,
			stride:   cellSize * D,
			data:     data[block*cellSize:],
			cursor:   0,
		}
		return msgs.Crc(blockCrc), r
	} else {
		// parity block, second section of the blob
		r := &blockReader{
			cells:    S,
			cellSize: cellSize,
			stride:   cellSize * P,
			data:     data[D*S*cellSize+(block-D)*cellSize:],
			cursor:   0,
		}
		return msgs.Crc(blockCrc), r
	}
}

// CreateSpan appends a new span to a given file.
//
// Note: the buffer underlying `data` might be extended and zero-padded for
// the purpose of splitting things into blocks/stripes. The (possibly grown)
// buffer is returned, regardless of whether the error is nil or not.
func (c *Client) CreateSpan(
	log *Logger,
	blacklist []msgs.BlockServiceId,
	spanPolicies *msgs.SpanPolicy,
	blockPolicies *msgs.BlockPolicy,
	stripePolicy *msgs.StripePolicy,
	id msgs.InodeId,
	cookie [8]byte,
	offset uint64,
	// The span size might be greater than `len(data)`, in which case we have
	// trailing zeros (this allows us to cheaply store zero sections).
	spanSize uint32,
	// This function might append to this buffer and write past its original
	// length, but it never modifies the bytes already in it. The new buffer
	// is returned, so the caller can keep using the new, larger buffer for
	// subsequent spans.
	//
	// Note that the new buffer is returned even if an error is returned.
	data []byte,
) ([]byte, error) {
	if len(data) < 256 {
		if err := c.createInlineSpan(log, id, cookie, offset, spanSize, data); err != nil {
			return data, err
		}
		return data, nil
	}

	var initiateReq *msgs.AddSpanInitiateReq
	data, initiateReq = prepareSpanInitiateReq(blacklist, spanPolicies, blockPolicies, stripePolicy, id, cookie, offset, spanSize, data)
	{
		// Convert to float64 before dividing, so that the blocks/data ratio
		// isn't truncated by integer division.
		expectedSize := float64(spanSize) * float64(initiateReq.Parity.Blocks()) / float64(initiateReq.Parity.DataBlocks())
		actualSize := initiateReq.CellSize * uint32(initiateReq.Stripes) * uint32(initiateReq.Parity.Blocks())
		log.Debug("span logical size: %v, span physical size: %v, waste: %v%%", spanSize, actualSize, 100.0*(float64(actualSize)-expectedSize)/float64(actualSize))
	}

	initiateResp := msgs.AddSpanInitiateResp{}
	if err := c.ShardRequest(log, id.Shard(), initiateReq, &initiateResp); err != nil {
		return data, err
	}

	certifyReq := msgs.AddSpanCertifyReq{
		FileId:     id,
		Cookie:     cookie,
		ByteOffset: offset,
		Proofs:     make([]msgs.BlockProof, len(initiateResp.Blocks)),
	}
	for i, block := range initiateResp.Blocks {
		conn, err := c.GetBlocksConn(log, block.BlockServiceId, block.BlockServiceIp1, block.BlockServicePort1, block.BlockServiceIp2, block.BlockServicePort2)
		if err != nil {
			return data, err
		}
		blockCrc, blockReader := mkBlockReader(initiateReq, data, i)
		proof, err := WriteBlock(log, conn, &block, blockReader, initiateReq.CellSize*uint32(initiateReq.Stripes), blockCrc)
		conn.Close()
		if err != nil {
			return data, fmt.Errorf("writing block failed with error %v", err)
		}
		certifyReq.Proofs[i].BlockId = block.BlockId
		certifyReq.Proofs[i].Proof = proof
	}
	if err := c.ShardRequest(log, id.Shard(), &certifyReq, &msgs.AddSpanCertifyResp{}); err != nil {
		return data, err
	}

	return data, nil
}
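
// CreateFile creates a file at `path` (which must be absolute) with the
// contents of `r`, writing it span by span according to the parent
// directory's policies, and linking it into the directory only once all
// spans have been written.
//
// A minimal usage sketch (client, logger, and cache are hypothetical):
//
//	id, err := client.CreateFile(log, dirInfoCache, "/some/dir/file", bytes.NewReader(contents))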
func (c *Client) CreateFile(
	log *Logger,
	dirInfoCache *DirInfoCache,
	path string, // must be absolute
	r io.Reader,
) (msgs.InodeId, error) {
	if len(path) == 0 || path[0] != '/' {
		return 0, fmt.Errorf("non-absolute file path %v", path)
	}
	dirPath := filepath.Dir(path)
	fileName := filepath.Base(path)
	if fileName == dirPath {
		return 0, fmt.Errorf("bad file path %v", path)
	}
	dirId, err := c.ResolvePath(log, dirPath)
	if err != nil {
		return 0, err
	}
	spanPolicies := msgs.SpanPolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &spanPolicies); err != nil {
		return 0, err
	}
	blockPolicies := msgs.BlockPolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &blockPolicies); err != nil {
		return 0, err
	}
	stripePolicy := msgs.StripePolicy{}
	if _, err := c.ResolveDirectoryInfoEntry(log, dirInfoCache, dirId, &stripePolicy); err != nil {
		return 0, err
	}
	fileResp := msgs.ConstructFileResp{}
	if err := c.ShardRequest(log, dirId.Shard(), &msgs.ConstructFileReq{Type: msgs.FILE}, &fileResp); err != nil {
		return 0, err
	}
	fileId := fileResp.Id
	cookie := fileResp.Cookie
	maxSpanSize := spanPolicies.Entries[len(spanPolicies.Entries)-1].MaxSize
	spanBuf := make([]byte, maxSpanSize)
	offset := uint64(0)
	for {
		spanBuf = spanBuf[:maxSpanSize]
		read, err := io.ReadFull(r, spanBuf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return 0, err
		}
		if err == io.EOF {
			break
		}
		spanBuf, err = c.CreateSpan(
			log, []msgs.BlockServiceId{}, &spanPolicies, &blockPolicies, &stripePolicy,
			fileId, cookie, offset, uint32(read), spanBuf[:read],
		)
		if err != nil {
			return 0, err
		}
		offset += uint64(read)
		if read < int(maxSpanSize) {
			break
		}
	}
	if err := c.ShardRequest(log, dirId.Shard(), &msgs.LinkFileReq{FileId: fileId, Cookie: cookie, OwnerId: dirId, Name: fileName}, &msgs.LinkFileResp{}); err != nil {
		return 0, err
	}
	return fileId, nil
}
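
// mirroredSpanReader reads a mirrored span (D == 1) one cell at a time from
// a single block connection, failing over to another mirror whenever a
// cell's CRC doesn't check out.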
type mirroredSpanReader struct {
	cursor    int
	block     int
	blockConn PuttableReadCloser
	cellBuf   *[]byte
	cellCrcs  []msgs.Crc // starting from the _next_ cell CRC
}

func (r *mirroredSpanReader) Close() error {
	return r.blockConn.Close()
}

func (r *mirroredSpanReader) Put() {
	r.blockConn.Put()
}
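
// rsNormalSpanReader reads an RS span stripe by stripe on the happy path,
// streaming D blocks (preferring the data blocks) and recovering the data
// blocks we couldn't connect to from the parity cells fetched in their
// stead.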
type rsNormalSpanReader struct {
	bufPool           *ReadSpanBufPool
	cursor            int
	haveBlocks        []uint8 // which blocks we are fetching. Most of the time it'll just be the data blocks
	blockConns        []PuttableReadCloser
	blocksRunningCrcs []msgs.Crc
	stripeBuf         *[]byte
	stripeCrcs        []msgs.Crc // starting from the _next_ stripe CRC
	parityBuffers     []*[]byte  // buffers in which to temporarily place the parity cells. Usually empty
}

func (sr *rsNormalSpanReader) Close() error {
	sr.bufPool.Put(sr.stripeBuf)
	for _, b := range sr.parityBuffers {
		sr.bufPool.Put(b)
	}
	var lastErr error
	for _, c := range sr.blockConns {
		if err := c.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "got error when closing connection: %v\n", err)
			lastErr = err
		}
	}
	return lastErr
}

func (sr *rsNormalSpanReader) Put() {
	for _, c := range sr.blockConns {
		c.Put()
	}
}

// rsCorruptedSpanReader takes over when we actively detect a bad CRC and
// have no choice but to load the remainder of the span in its entirety,
// find out which blocks are broken, and then resume.
type rsCorruptedSpanReader struct {
	bufPool     *ReadSpanBufPool
	startStripe int // at which stripe we switched to this reader
	cursor      int
	dataBlocks  []*[]byte
}

func (r *rsCorruptedSpanReader) Close() error {
	for _, b := range r.dataBlocks {
		r.bufPool.Put(b)
	}
	return nil
}

func (*rsCorruptedSpanReader) Put() {}

type spanReader struct {
	bufPool    *ReadSpanBufPool
	spanSize   uint32
	spanCrc    msgs.Crc
	readBytes  uint32
	runningCrc msgs.Crc
	parity     rs.Parity
	stripes    uint8
	cellSize   uint32
	blocksCrcs []msgs.Crc
	blockConn  func(block int, offset uint32, size uint32) (PuttableReadCloser, error)
	r          PuttableCloser
}

func (sr *spanReader) Close() error {
	return sr.r.Close()
}

func (sr *spanReader) Put() {
	sr.r.Put()
}
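
// repairCorruptedStripe is called when a freshly assembled stripe fails its
// CRC check. It finishes downloading the blocks we already have connections
// to and checks their full-block CRCs to sort them into good and bad ones,
// downloading additional blocks until D known-good blocks are available.
// It then RS-recovers the bad data blocks and returns an
// rsCorruptedSpanReader which serves the rest of the span from the repaired
// buffers.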
func (sr *spanReader) repairCorruptedStripe(
	bufPool *ReadSpanBufPool,
	// the broken stripe
	stripe int,
	stripeData *[]byte,
	parityData []*[]byte,
	haveBlocks []uint8, // the blocks we have been using so far
	haveBlocksCrc []msgs.Crc, // the running CRCs so far for the blocks that we have
	haveBlocksConns []PuttableReadCloser, // the connections to the blocks we already have
) (*rsCorruptedSpanReader, error) {
	D := sr.parity.DataBlocks()
	B := sr.parity.Blocks()
	blockStart := stripe * int(sr.cellSize)
	remainingBlockSize := (int(sr.stripes) - stripe) * int(sr.cellSize)
	goodBlocks := make([]bool, B) // _known_ good blocks
	badBlocks := make([]bool, B)  // _known_ bad blocks
	blocksData := make([]*[]byte, B)
	defer func() {
		for i := D; i < B; i++ {
			b := blocksData[i]
			if b != nil {
				bufPool.Put(b)
			}
		}
	}()
	// make space for full sized remaining data blocks
	for b := 0; b < D; b++ {
		blocksData[b] = bufPool.Get(remainingBlockSize)
	}
	// load the current stripe data into the blocks buffers
	{
		parityIx := 0
		for _, b := range haveBlocks {
			if int(b) < D {
				copy(*blocksData[int(b)], (*stripeData)[int(b)*int(sr.cellSize):(int(b)+1)*int(sr.cellSize)])
			} else {
				blocksData[b] = bufPool.Get(remainingBlockSize)
				copy(*blocksData[int(b)], *parityData[parityIx])
				parityIx++
			}
		}
	}
	// load the _rest_ of the blocks, and check which ones are good
	numGoodBlocks := 0
	// The first cell of each block (the broken stripe itself) is already in
	// place, so every cursor starts at cellSize.
	cursors := make([]int, len(haveBlocks))
	for i := range cursors {
		cursors[i] = int(sr.cellSize)
	}
	checked := make([]bool, len(haveBlocks))
	for {
		allDone := true
		for i, b := range haveBlocks {
			if checked[i] { // already fully loaded and classified
				continue
			}
			if cursors[i] < len(*blocksData[int(b)]) {
				read, err := haveBlocksConns[i].Read((*blocksData[int(b)])[cursors[i]:])
				if err != io.EOF && err != nil {
					return nil, err
				}
				haveBlocksCrc[i] = msgs.Crc(crc32c.Sum(uint32(haveBlocksCrc[i]), (*blocksData[int(b)])[cursors[i]:cursors[i]+read]))
				cursors[i] += read
			}
			if cursors[i] < len(*blocksData[int(b)]) {
				allDone = false
			} else if haveBlocksCrc[i] != sr.blocksCrcs[b] {
				checked[i] = true
				badBlocks[int(b)] = true
			} else {
				checked[i] = true
				goodBlocks[int(b)] = true
				numGoodBlocks++
			}
		}
		if allDone {
			break
		}
	}
	// Find and download blocks to recover
	var tmpBuf *[]byte
	// Wrap the Put in a closure: a plain `defer bufPool.Put(tmpBuf)` would
	// capture the nil value tmpBuf has right now.
	defer func() { bufPool.Put(tmpBuf) }()
	for b := 0; b < B && numGoodBlocks < D; b++ {
		if badBlocks[b] || goodBlocks[b] { // we already know about this one
			continue
		}
		// We can try to download this one.
		// We need to download the entire block here, because we need to check the CRC.
		blockSize := sr.cellSize * uint32(sr.stripes)
		conn, err := sr.blockConn(b, 0, blockSize)
		if err != nil {
			return nil, err
		}
		if conn == nil {
			badBlocks[b] = true
			continue
		}
		if tmpBuf == nil {
			tmpBuf = bufPool.Get(int(blockSize))
		}
		if _, err := io.ReadFull(conn, *tmpBuf); err != nil {
			conn.Close()
			return nil, err
		}
		if err := conn.Close(); err != nil {
			return nil, err
		}
		crc := crc32c.Sum(0, *tmpBuf)
		if crc == uint32(sr.blocksCrcs[b]) {
			// this is a good one
			if blocksData[b] == nil { // data blocks were already allocated upfront
				blocksData[b] = bufPool.Get(remainingBlockSize)
			}
			copy(*blocksData[b], (*tmpBuf)[blockStart:])
			goodBlocks[b] = true
			numGoodBlocks++
		} else {
			badBlocks[b] = true
		}
	}
	if numGoodBlocks < D {
		return nil, fmt.Errorf("the number of good blocks (%v) is lower than the number of data blocks (%v)", numGoodBlocks, D)
	}
	newHaveBlocks := []uint8{}
	newHaveBlocksData := [][]byte{}
	for b := 0; b < B; b++ {
		if !goodBlocks[b] {
			continue
		}
		newHaveBlocks = append(newHaveBlocks, uint8(b))
		newHaveBlocksData = append(newHaveBlocksData, *blocksData[b])
	}
	rs := rs.Get(sr.parity)
	for b := 0; b < D; b++ {
		if goodBlocks[b] {
			continue
		}
		rs.RecoverInto(newHaveBlocks, newHaveBlocksData, uint8(b), *blocksData[b])
	}
	r := rsCorruptedSpanReader{
		bufPool:     bufPool,
		startStripe: stripe,
		cursor:      stripe * D * int(sr.cellSize),
		dataBlocks:  blocksData[:D],
	}
	return &r, nil
}
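
// loadStripe reads the next stripe into nsr.stripeBuf, recovering the data
// blocks we don't have connections for from the fetched parity cells, and
// verifies the stripe CRC. On a mismatch it switches sr over to an
// rsCorruptedSpanReader via repairCorruptedStripe.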
func (sr *spanReader) loadStripe(nsr *rsNormalSpanReader) error {
	D := sr.parity.DataBlocks()
	blocksCursors := make([]int, len(nsr.blockConns))
	needsRecover := false
	for {
		allDone := true
		parityIx := 0
		for i, b := range nsr.haveBlocks {
			var buf []byte
			if int(b) < D { // data block, read straight into the stripe buffer
				stripeStart := int(b)*int(sr.cellSize) + blocksCursors[i]
				stripeEnd := (int(b) + 1) * int(sr.cellSize)
				buf = (*nsr.stripeBuf)[stripeStart:stripeEnd]
			} else { // parity block
				needsRecover = true
				buf = (*nsr.parityBuffers[parityIx])[blocksCursors[i]:]
				parityIx++
			}
			blockRead, err := nsr.blockConns[i].Read(buf)
			if err != nil {
				return err
			}
			nsr.blocksRunningCrcs[i] = msgs.Crc(crc32c.Sum(uint32(nsr.blocksRunningCrcs[i]), buf[:blockRead]))
			blocksCursors[i] += blockRead
			if blocksCursors[i] < int(sr.cellSize) {
				allDone = false
			}
		}
		if allDone {
			break
		}
	}
	if needsRecover { // we need to recover data blocks
		rs := rs.Get(sr.parity)
		bufs := make([][]byte, D)
		parityIx := 0
		for i, b := range nsr.haveBlocks {
			if int(b) < D {
				stripeStart := int(b) * int(sr.cellSize)
				stripeEnd := (int(b) + 1) * int(sr.cellSize)
				bufs[i] = (*nsr.stripeBuf)[stripeStart:stripeEnd]
			} else {
				bufs[i] = *nsr.parityBuffers[parityIx]
				parityIx++
			}
		}
		// Recover every data block that is missing from haveBlocks.
		lastDataBlock := uint8(0)
		for i := 0; i < D; i++ {
			if int(nsr.haveBlocks[i]) >= D {
				break
			}
			for ; lastDataBlock < nsr.haveBlocks[i]; lastDataBlock++ {
				stripeStart := int(lastDataBlock) * int(sr.cellSize)
				stripeEnd := (int(lastDataBlock) + 1) * int(sr.cellSize)
				rs.RecoverInto(nsr.haveBlocks, bufs, lastDataBlock, (*nsr.stripeBuf)[stripeStart:stripeEnd])
			}
			lastDataBlock++
		}
		// Also recover any trailing data blocks for which we only have parity.
		for ; int(lastDataBlock) < D; lastDataBlock++ {
			stripeStart := int(lastDataBlock) * int(sr.cellSize)
			stripeEnd := (int(lastDataBlock) + 1) * int(sr.cellSize)
			rs.RecoverInto(nsr.haveBlocks, bufs, lastDataBlock, (*nsr.stripeBuf)[stripeStart:stripeEnd])
		}
	}
	crc := crc32c.Sum(0, *nsr.stripeBuf)
	if crc != uint32(nsr.stripeCrcs[0]) {
		var err error
		sr.r, err = sr.repairCorruptedStripe(
			sr.bufPool,
			nsr.cursor/(int(sr.cellSize)*D),
			nsr.stripeBuf,
			nsr.parityBuffers,
			nsr.haveBlocks,
			nsr.blocksRunningCrcs,
			nsr.blockConns,
		)
		nsr.Close() // close connections and return buffers
		if err != nil {
			return err
		}
		return nil
	}
	nsr.stripeCrcs = nsr.stripeCrcs[1:]
	return nil
}
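
// loadCell reads the next cell of a mirrored span into mr.cellBuf, cycling
// through the other mirrors (reconnecting at the current offset) until one
// of them yields a cell with a matching CRC.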
func (sr *spanReader) loadCell(mr *mirroredSpanReader) error {
	l := func() (bool, error) {
		cursor := 0
		for cursor < int(sr.cellSize) {
			read, err := mr.blockConn.Read((*mr.cellBuf)[cursor:])
			if err != nil {
				return false, err
			}
			cursor += read
		}
		crc := crc32c.Sum(0, *mr.cellBuf)
		good := crc == uint32(mr.cellCrcs[0])
		if !good {
			if err := mr.blockConn.Close(); err != nil {
				return false, err
			}
		}
		return good, nil
	}
	good, err := l()
	if err != nil {
		return err
	}
	// look for another block if necessary
	startingBlock := mr.block
	B := sr.parity.Blocks()
	for !good {
		mr.block = (mr.block + 1) % B
		if mr.block == startingBlock {
			break
		}
		var err error
		mr.blockConn, err = sr.blockConn(mr.block, uint32(mr.cursor), sr.cellSize*uint32(sr.stripes)-uint32(mr.cursor))
		if err != nil {
			return err
		}
		if mr.blockConn == nil {
			continue
		}
		good, err = l()
		if err != nil {
			return err
		}
	}
	if !good {
		return fmt.Errorf("could not find block without CRC errors")
	}
	mr.cellCrcs = mr.cellCrcs[1:]
	return nil
}

func (sr *spanReader) readMirrored(mr *mirroredSpanReader, p []byte) (int, error) {
	if mr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	if mr.cursor >= int(sr.stripes)*int(sr.cellSize) { // trailing zeros
		read := 0
		for mr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			mr.cursor++
		}
		return read, nil
	}
	currentCell := mr.cursor / int(sr.cellSize)
	cellPos := mr.cursor % int(sr.cellSize)
	remainingCell := (*mr.cellBuf)[cellPos:]
	if mr.cursor+len(remainingCell) > int(sr.spanSize) { // zero padding
		remainingCell = remainingCell[:int(sr.spanSize)-mr.cursor]
	}
	read := copy(p, remainingCell)
	mr.cursor += read
	if cellPos+read == len(*mr.cellBuf) && currentCell+1 < int(sr.stripes) {
		if err := sr.loadCell(mr); err != nil {
			return read, err
		}
	}
	return read, nil
}

func (sr *spanReader) readRsHappy(nsr *rsNormalSpanReader, p []byte) (int, error) {
	if nsr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	D := sr.parity.DataBlocks()
	if nsr.cursor >= int(sr.stripes)*D*int(sr.cellSize) { // trailing zeros
		read := 0
		for nsr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			nsr.cursor++
		}
		return read, nil
	}
	currentStripe := nsr.cursor / (D * int(sr.cellSize))
	stripePos := nsr.cursor % (D * int(sr.cellSize))
	remainingStripe := (*nsr.stripeBuf)[stripePos:]
	if nsr.cursor+len(remainingStripe) > int(sr.spanSize) { // zero padding
		remainingStripe = remainingStripe[:int(sr.spanSize)-nsr.cursor]
	}
	read := copy(p, remainingStripe)
	nsr.cursor += read
	if stripePos+read == len(*nsr.stripeBuf) && currentStripe+1 < int(sr.stripes) {
		if err := sr.loadStripe(nsr); err != nil {
			return read, err
		}
	}
	return read, nil
}

func (sr *spanReader) readRsCorrupted(wsr *rsCorruptedSpanReader, p []byte) (int, error) {
	D := sr.parity.DataBlocks()
	if wsr.cursor >= int(sr.spanSize) {
		return 0, io.EOF
	}
	if wsr.cursor >= int(sr.stripes)*D*int(sr.cellSize) { // trailing zeros
		read := 0
		for wsr.cursor < int(sr.spanSize) && read < len(p) {
			p[read] = 0
			read++
			wsr.cursor++
		}
		return read, nil
	}
	currentStripe := wsr.cursor / (D * int(sr.cellSize))
	stripePos := wsr.cursor % (D * int(sr.cellSize))
	currentBlock := stripePos / int(sr.cellSize)
	cellPos := stripePos % int(sr.cellSize)
	remainingCell := (*wsr.dataBlocks[currentBlock])[int(sr.cellSize)*(currentStripe-wsr.startStripe)+cellPos : int(sr.cellSize)*(currentStripe-wsr.startStripe+1)]
	if wsr.cursor+len(remainingCell) > int(sr.spanSize) { // zero padding
		remainingCell = remainingCell[:int(sr.spanSize)-wsr.cursor]
	}
	read := copy(p, remainingCell)
	wsr.cursor += read
	return read, nil
}

func (sr *spanReader) Read(p []byte) (int, error) {
	var read int
	var err error
	switch r := sr.r.(type) {
	case *mirroredSpanReader:
		read, err = sr.readMirrored(r, p)
	case *rsNormalSpanReader:
		read, err = sr.readRsHappy(r, p)
	case *rsCorruptedSpanReader:
		read, err = sr.readRsCorrupted(r, p)
	default:
		panic(fmt.Errorf("bad reader %T", sr.r))
	}
	sr.readBytes += uint32(read)
	if sr.readBytes > sr.spanSize {
		panic(fmt.Errorf("read beyond end of span -- %v vs %v", sr.readBytes, sr.spanSize))
	}
	sr.runningCrc = msgs.Crc(crc32c.Sum(uint32(sr.runningCrc), p[:read]))
	if sr.readBytes == sr.spanSize && sr.runningCrc != sr.spanCrc {
		panic(fmt.Errorf("span contents CRC is not what we expect -- %v vs %v", sr.runningCrc, sr.spanCrc))
	}
	return read, err
}

// readSpanFromBlocks, given a way to start streaming a block, produces a
// stream with the span contents. It will automatically repair the span if
// CRC errors are detected.
func readSpanFromBlocks(
	bufPool *ReadSpanBufPool,
	spanSize uint32,
	spanCrc msgs.Crc,
	parity rs.Parity,
	stripes uint8,
	cellSize uint32,
	blockCrcs []msgs.Crc,
	stripesCrc []msgs.Crc,
	// If this function returns `nil, nil`, it means that the block service
	// is currently not available, for whatever reason.
	//
	// We currently make the assumption that the connections that are available
	// at the beginning will be available throughout the duration of span reading.
	blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error),
) (PuttableReadCloser, error) {
	D := parity.DataBlocks()
	B := parity.Blocks()
	sr := spanReader{
		bufPool:    bufPool,
		spanSize:   spanSize,
		spanCrc:    spanCrc,
		stripes:    stripes,
		cellSize:   cellSize,
		parity:     parity,
		blockConn:  blockConn,
		blocksCrcs: blockCrcs,
	}
	if D == 1 { // mirroring: any single block will do
		mr := &mirroredSpanReader{
			cursor:   0,
			cellBuf:  sr.bufPool.Get(int(cellSize)),
			cellCrcs: stripesCrc,
		}
		for b := 0; b < B; b++ {
			conn, err := blockConn(b, 0, cellSize*uint32(stripes))
			if err != nil {
				return nil, err
			}
			if conn != nil {
				mr.blockConn = conn
				mr.block = b
				break
			}
		}
		if mr.blockConn == nil {
			return nil, fmt.Errorf("couldn't get block connection to any of the blocks")
		}
		sr.r = mr
		if err := sr.loadCell(mr); err != nil {
			return nil, err
		}
	} else { // RS: we need connections to D distinct blocks
		conns := make([]PuttableReadCloser, 0)
		haveBlocks := make([]uint8, 0)
		parityBuffers := []*[]byte{}
		for i := 0; i < B; i++ {
			conn, err := blockConn(i, 0, cellSize*uint32(stripes))
			if err != nil {
				return nil, err
			}
			if conn == nil {
				continue
			}
			conns = append(conns, conn)
			haveBlocks = append(haveBlocks, uint8(i))
			if i >= D {
				parityBuffers = append(parityBuffers, sr.bufPool.Get(int(cellSize)))
			}
			if len(haveBlocks) == D {
				break
			}
		}
		if len(haveBlocks) != D {
			return nil, fmt.Errorf("couldn't get enough block connections (need at least %v, got %v)", D, len(conns))
		}
		stripeBuf := sr.bufPool.Get(int(cellSize) * D)
		rsr := &rsNormalSpanReader{
			bufPool:           sr.bufPool,
			cursor:            0,
			blockConns:        conns,
			haveBlocks:        haveBlocks,
			stripeBuf:         stripeBuf,
			stripeCrcs:        stripesCrc,
			blocksRunningCrcs: make([]msgs.Crc, D),
			parityBuffers:     parityBuffers,
		}
		sr.r = rsr
		if err := sr.loadStripe(rsr); err != nil {
			return nil, err
		}
	}
	return &sr, nil
}

type inlineSpanReader struct {
	size   int
	cursor int
	data   []byte
}

func (r *inlineSpanReader) Read(p []byte) (int, error) {
	if r.cursor >= r.size {
		return 0, io.EOF
	}
	if r.cursor >= len(r.data) {
		read := 0
		for r.cursor < r.size && read < len(p) {
			p[read] = 0
			read++
			r.cursor++
		}
		return read, nil
	}
	read := copy(p, r.data[r.cursor:])
	r.cursor += read
	return read, nil
}

func (*inlineSpanReader) Close() error {
	return nil
}

func (*inlineSpanReader) Put() {}

type ReadSpanBufPool struct {
	pool sync.Pool
}

func NewReadSpanBufPool() *ReadSpanBufPool {
	pool := ReadSpanBufPool{
		pool: sync.Pool{
			New: func() any {
				buf := []byte{}
				return &buf
			},
		},
	}
	return &pool
}

// This does _not_ zero the memory in the returned buffer -- i.e. there
// might be garbage in it.
func (pool *ReadSpanBufPool) Get(l int) *[]byte {
	buf := pool.pool.Get().(*[]byte)
	if cap(*buf) >= l {
		*buf = (*buf)[:l]
	} else {
		*buf = (*buf)[:cap(*buf)]
		*buf = append(*buf, make([]byte, l-len(*buf))...)
	}
	return buf
}

func (pool *ReadSpanBufPool) Put(buf *[]byte) {
	if buf != nil {
		pool.pool.Put(buf)
	}
}
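
// ReadSpan returns a reader for the contents of a single fetched span.
// Inline spans are served straight from the metadata body; block spans are
// streamed from the block services, skipping blacklisted ones.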
func (c *Client) ReadSpan(
	log *Logger,
	bufPool *ReadSpanBufPool,
	blacklist []msgs.BlockServiceId,
	blockServices []msgs.BlockService,
	fetchedSpan *msgs.FetchedSpan,
) (PuttableReadCloser, error) {
	if fetchedSpan.Header.StorageClass == msgs.INLINE_STORAGE {
		data := fetchedSpan.Body.(*msgs.FetchedInlineSpan).Body
		dataCrc := msgs.Crc(crc32c.Sum(0, data))
		if dataCrc != fetchedSpan.Header.Crc {
			panic(fmt.Errorf("header CRC for inline span is %v, but data is %v", fetchedSpan.Header.Crc, dataCrc))
		}
		isr := inlineSpanReader{
			size:   int(fetchedSpan.Header.Size),
			cursor: 0,
			data:   data,
		}
		return &isr, nil
	}

	body := fetchedSpan.Body.(*msgs.FetchedBlocksSpan)
	log.DebugStack(1, "span parity %v", body.Parity)
	blocksCrcs := make([]msgs.Crc, body.Parity.Blocks())
	for i := range blocksCrcs {
		blocksCrcs[i] = body.Blocks[i].Crc
	}
	blockConn := func(blockIx int, offset uint32, size uint32) (PuttableReadCloser, error) {
		log.DebugStack(1, "requested connection for block ix %v, offset %v, size %v", blockIx, offset, size)
		block := body.Blocks[blockIx]
		blockService := blockServices[block.BlockServiceIx]
		for _, blacklisted := range blacklist {
			if blockService.Id == blacklisted {
				return nil, nil
			}
		}
		conn, err := c.GetBlocksConn(log, blockService.Id, blockService.Ip1, blockService.Port1, blockService.Ip2, blockService.Port2)
		if err != nil {
			return nil, err
		}
		if err := FetchBlock(log, conn, &blockService, block.BlockId, offset, size); err != nil {
			conn.Close()
			return nil, err
		}
		return conn, nil
	}
	return readSpanFromBlocks(bufPool, fetchedSpan.Header.Size, fetchedSpan.Header.Crc, body.Parity, body.Stripes, body.CellSize, blocksCrcs, body.StripesCrc, blockConn)
}
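
// fileReader streams the contents of a whole file, fetching its spans from
// the shard in batches and reading them one at a time.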
type fileReader struct {
	client     *Client
	log        *Logger
	bufPool    *ReadSpanBufPool
	blacklist  []msgs.BlockServiceId
	fileId     msgs.InodeId
	spansResp  msgs.FileSpansResp
	spanReader io.ReadCloser
}

func (f *fileReader) Close() error {
	if f.spanReader != nil {
		return f.spanReader.Close()
	}
	return nil
}

func (f *fileReader) loadNextSpanAndRead(p []byte) (int, error) {
	if len(f.spansResp.Spans) == 0 { // no remaining spans
		if f.spansResp.NextOffset == 0 { // no remaining spans, and no next batch of spans available
			return 0, io.EOF
		} else { // request the next batch of spans and try again
			req := msgs.FileSpansReq{FileId: f.fileId, ByteOffset: f.spansResp.NextOffset}
			if err := f.client.ShardRequest(f.log, f.fileId.Shard(), &req, &f.spansResp); err != nil {
				return 0, err
			}
			return f.Read(p)
		}
	} else { // load the next span
		if f.spanReader != nil {
			if err := f.spanReader.Close(); err != nil {
				return 0, err
			}
		}
		var err error
		f.spanReader, err = f.client.ReadSpan(f.log, f.bufPool, f.blacklist, f.spansResp.BlockServices, &f.spansResp.Spans[0])
		if err != nil {
			return 0, err
		}
		f.spansResp.Spans = f.spansResp.Spans[1:]
		return f.Read(p)
	}
}

func (f *fileReader) Read(p []byte) (int, error) {
	if f.spanReader == nil {
		return f.loadNextSpanAndRead(p)
	}
	spanRead, err := f.spanReader.Read(p)
	if err == io.EOF {
		return f.loadNextSpanAndRead(p)
	}
	return spanRead, err
}
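
// ReadFile returns a reader over the whole contents of file `id`. The
// caller is responsible for closing it.
//
// A minimal usage sketch (client, logger, and file id are hypothetical):
//
//	r, err := client.ReadFile(log, NewReadSpanBufPool(), nil, fileId)
//	if err != nil {
//		return err
//	}
//	defer r.Close()
//	contents, err := io.ReadAll(r)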
func (c *Client) ReadFile(
	log *Logger,
	bufPool *ReadSpanBufPool,
	blacklist []msgs.BlockServiceId,
	id msgs.InodeId,
) (io.ReadCloser, error) {
	r := &fileReader{
		client:    c,
		log:       log,
		bufPool:   bufPool,
		blacklist: blacklist,
		fileId:    id,
	}
	req := msgs.FileSpansReq{FileId: r.fileId, ByteOffset: 0}
	if err := r.client.ShardRequest(r.log, r.fileId.Shard(), &req, &r.spansResp); err != nil {
		return nil, err
	}
	return r, nil
}