package main
import (
"bufio"
"bytes"
"crypto/aes"
"crypto/cipher"
crand "crypto/rand"
"encoding/binary"
"errors"
"flag"
"fmt"
"io"
mrand "math/rand"
"net"
"os"
"os/signal"
"path"
"path/filepath"
"runtime/pprof"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"xtx/ternfs/bufpool"
"xtx/ternfs/cbcmac"
"xtx/ternfs/certificate"
"xtx/ternfs/client"
"xtx/ternfs/crc32c"
"xtx/ternfs/flags"
"xtx/ternfs/log"
lrecover "xtx/ternfs/log/recover"
"xtx/ternfs/msgs"
"xtx/ternfs/timing"
"xtx/ternfs/wyhash"
"golang.org/x/sys/unix"
)
// #include <unistd.h>
// #include <fcntl.h>
// #include <errno.h>
// #include <stdio.h>
// #include <stdlib.h>
// #include <string.h>
// #include <sys/syscall.h>
//
// struct linux_dirent {
// unsigned long d_ino;
// off_t d_off;
// unsigned short d_reclen;
// char d_name[];
// };
//
// // If negative, it's an error code.
// ssize_t count_blocks(char* base_path) {
// int base_fd = -1;
// int dir_fd = -1;
// ssize_t blocks = 0;
// base_fd = open(base_path, O_RDONLY|O_DIRECTORY);
// if (base_fd < 0) {
// fprintf(stderr, "could not open %s: %d\n", base_path, errno);
// blocks = -errno;
// goto out;
// }
// char buf[1024];
// for (int i = 0; i < 256; i++) {
// char dir_path[3];
// snprintf(dir_path, 3, "%02x", i);
// if (dir_fd >= 0) { close(dir_fd); }
// dir_fd = openat(base_fd, dir_path, O_RDONLY|O_DIRECTORY);
// if (dir_fd < 0) {
// if (errno == ENOENT) {
// continue;
// }
// fprintf(stderr, "could not open dir %s/%s: %d\n", base_path, dir_path, errno);
// blocks = -errno;
// goto out;
// }
// for (;;) {
// long read = syscall(SYS_getdents, dir_fd, buf, sizeof(buf));
// if (read < 0) {
// fprintf(stderr, "could not read direntries in %s/%s: %d\n", base_path, dir_path, errno);
// blocks = -errno;
// goto out;
// }
// if (read == 0) { break; }
// long bpos = 0;
// while( bpos < read) {
// struct linux_dirent *entry = (struct linux_dirent *)(buf + bpos);
// bpos += entry->d_reclen;
// if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0 ||
// strncmp(entry->d_name, "tmp.", 4) == 0) {
// // we only want to count blocks and ignore . and .. and tmp.* files
// continue;
// }
// blocks++;
// }
// }
// }
// out:
// if (dir_fd >= 0) { close(dir_fd); }
// if (base_fd >= 0) { close(base_fd); }
// return blocks;
// }
import "C"
type blockServiceStats struct {
blocksWritten uint64
bytesWritten uint64
blocksErased uint64
blocksFetched uint64
bytesFetched uint64
blocksChecked uint64
bytesChecked uint64
blocksConverted uint64
blockConversionDiscarded uint64
}
type env struct {
bufPool *bufpool.BufPool
stats map[msgs.BlockServiceId]*blockServiceStats
counters map[msgs.BlocksMessageKind]*timing.Timings
eraseLocks map[msgs.BlockServiceId]*sync.Mutex
shuckleConn *client.ShuckleConn
failureDomain string
hostname string
pathPrefix string
readWholeFile bool
ioAlertPercent uint8
}
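// BlockWriteProof computes the proof returned to the client after a successful
// write: a CBC-MAC, keyed with the block service's AES key, over the
// little-endian u64 block service id, the byte 'W', and the little-endian u64
// block id (the same 17-byte message as the struct.pack reference below).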
func BlockWriteProof(blockServiceId msgs.BlockServiceId, blockId msgs.BlockId, key cipher.Block) [8]byte {
buf := bytes.NewBuffer([]byte{})
// struct.pack_into('<QcQ', b, 0, block_service_id, b'W', block_id)
binary.Write(buf, binary.LittleEndian, uint64(blockServiceId))
buf.Write([]byte{'W'})
binary.Write(buf, binary.LittleEndian, uint64(blockId))
return cbcmac.CBCMAC(key, buf.Bytes())
}
func raiseAlertAndHardwareEvent(logger *log.Logger, hostname string, blockServiceId string, msg string) {
logger.RaiseHardwareEvent(hostname, blockServiceId, msg)
}
func blockServiceIdFromKey(secretKey [16]byte) msgs.BlockServiceId {
// we don't really care about leaking part or all of the key -- the whole key business is
// to defend against bugs, not malicious agents.
//
// That said, we prefer to encode the knowledge of how to generate block service ids once,
// in the block service.
//
// Also, we remove the highest bit for the sake of SQLite, at least for now
return msgs.BlockServiceId(binary.LittleEndian.Uint64(secretKey[:8]) & uint64(0x7FFFFFFFFFFFFFFF))
}
// Do this in one go in C to minimize syscall overhead in Go. Also, I think
// the busy loop of syscalls was starving the goroutines waiting on network
// I/O, which in turn caused xmon connections to be dropped.
//
// Note that `findRunnable` picks up network stuff last:
// <https://github.com/golang/go/blob/1d45a7ef560a76318ed59dfdb178cecd58caf948/src/runtime/proc.go#L3222-L3242>
func countBlocks(basePath string) (uint64, error) {
cBasePath := C.CString(basePath)
defer C.free(unsafe.Pointer(cBasePath))
blocks := C.count_blocks(cBasePath)
if blocks < 0 {
return 0, syscall.Errno(-blocks)
}
return uint64(blocks), nil
}
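// updateBlockServiceInfoCapacity refreshes CapacityBytes and AvailableBytes
// from a statfs of the block service's filesystem (probed through its
// secret.key file), subtracting reservedStorage from both and clamping at zero
// so that reserved space is never advertised.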
func updateBlockServiceInfoCapacity(
_ *log.Logger,
blockService *blockService,
reservedStorage uint64,
) error {
var statfs unix.Statfs_t
if err := unix.Statfs(path.Join(blockService.path, "secret.key"), &statfs); err != nil {
return err
}
capacityBytes := statfs.Blocks * uint64(statfs.Bsize)
if capacityBytes < reservedStorage {
capacityBytes = 0
} else {
capacityBytes -= reservedStorage
}
availableBytes := statfs.Bavail * uint64(statfs.Bsize)
if availableBytes < reservedStorage {
availableBytes = 0
} else {
availableBytes -= reservedStorage
}
blockService.cachedInfo.CapacityBytes = capacityBytes
blockService.cachedInfo.AvailableBytes = availableBytes
return nil
}
// either updates `blockService`, or returns an error.
func updateBlockServiceInfoBlocks(
log *log.Logger,
blockService *blockService,
) error {
t := time.Now()
log.Info("starting to count blocks for %v", blockService.cachedInfo.Id)
blocksWithCrc, err := countBlocks(path.Join(blockService.path, "with_crc"))
if err != nil {
return err
}
blockService.cachedInfo.Blocks = blocksWithCrc
log.Info("done counting blocks for %v in %v. (blocks: %d)", blockService.cachedInfo.Id, time.Since(t), blocksWithCrc)
return nil
}
func initBlockServicesInfo(
env *env,
log *log.Logger,
locationId msgs.Location,
addrs msgs.AddrsInfo,
failureDomain [16]byte,
blockServices map[msgs.BlockServiceId]*blockService,
reservedStorage uint64,
) error {
log.Info("initializing block services info")
var wg sync.WaitGroup
wg.Add(len(blockServices))
alert := log.NewNCAlert(0)
log.RaiseNC(alert, "getting info for %v block services", len(blockServices))
for id, bs := range blockServices {
bs.cachedInfo.LocationId = locationId
bs.cachedInfo.Id = id
bs.cachedInfo.Addrs = addrs
bs.cachedInfo.SecretKey = bs.key
bs.cachedInfo.StorageClass = bs.storageClass
bs.cachedInfo.FailureDomain.Name = failureDomain
if len(env.pathPrefix) > 0 {
bs.cachedInfo.Path = fmt.Sprintf("%s:%s", env.pathPrefix, bs.path)
} else {
bs.cachedInfo.Path = bs.path
}
closureBs := bs
go func() {
// only update if it isn't already filled in from shuckle
if closureBs.cachedInfo.Blocks == 0 {
if err := updateBlockServiceInfoCapacity(log, closureBs, reservedStorage); err != nil {
panic(err)
}
if err := updateBlockServiceInfoBlocks(log, closureBs); err != nil {
panic(err)
}
}
wg.Done()
}()
}
wg.Wait()
log.ClearNC(alert)
return nil
}
var minimumRegisterInterval time.Duration = time.Second * 60
var maximumRegisterInterval time.Duration = minimumRegisterInterval * 2
var variantRegisterInterval time.Duration = maximumRegisterInterval - minimumRegisterInterval
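// registerPeriodically re-registers every block service whose info could be
// refreshed with shuckle, forever. Between successful registrations it sleeps
// a jittered interval in [minimumRegisterInterval, maximumRegisterInterval);
// on failure it retries every 100ms and keeps an alert raised.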
func registerPeriodically(
log *log.Logger,
blockServices map[msgs.BlockServiceId]*blockService,
env *env,
) {
req := msgs.RegisterBlockServicesReq{}
alert := log.NewNCAlert(10 * time.Second)
for {
req.BlockServices = req.BlockServices[:0]
for _, bs := range blockServices {
if bs.couldNotUpdateInfoBlocks || bs.couldNotUpdateInfoCapacity {
continue
}
req.BlockServices = append(req.BlockServices, bs.cachedInfo)
}
log.Trace("registering with %+v", req)
_, err := env.shuckleConn.Request(&req)
if err != nil {
log.RaiseNC(alert, "could not register block services with %+v: %v", env.shuckleConn.ShuckleAddress(), err)
time.Sleep(100 * time.Millisecond)
continue
}
log.ClearNC(alert)
waitFor := minimumRegisterInterval + time.Duration(mrand.Uint64()%uint64(variantRegisterInterval.Nanoseconds()))
log.Info("registered with %v (%v alive), waiting %v", env.shuckleConn.ShuckleAddress(), len(blockServices), waitFor)
time.Sleep(waitFor)
}
}
func updateBlockServiceInfoBlocksForever(
log *log.Logger,
blockServices map[msgs.BlockServiceId]*blockService,
) {
for {
for _, bs := range blockServices {
if err := updateBlockServiceInfoBlocks(log, bs); err != nil {
bs.couldNotUpdateInfoBlocks = true
log.RaiseNC(&bs.couldNotUpdateInfoBlocksAlert, "could not count blocks for block service %v: %v", bs.cachedInfo.Id, err)
} else {
bs.couldNotUpdateInfoBlocks = false
log.ClearNC(&bs.couldNotUpdateInfoBlocksAlert)
}
}
time.Sleep(time.Minute) // so that we won't busy loop in tests etc
}
}
func updateBlockServiceInfoCapacityForever(
log *log.Logger,
blockServices map[msgs.BlockServiceId]*blockService,
reservedStorage uint64,
) {
for {
for _, bs := range blockServices {
if err := updateBlockServiceInfoCapacity(log, bs, reservedStorage); err != nil {
bs.couldNotUpdateInfoCapacity = true
log.RaiseNC(&bs.couldNotUpdateInfoCapacityAlert, "could not get capacity for block service %v: %v", bs.cachedInfo.Id, err)
} else {
bs.couldNotUpdateInfoCapacity = false
log.ClearNC(&bs.couldNotUpdateInfoCapacityAlert)
}
}
time.Sleep(10 * time.Second) // so that we won't busy loop in tests etc
}
}
func checkEraseCertificate(log *log.Logger, blockServiceId msgs.BlockServiceId, cipher cipher.Block, req *msgs.EraseBlockReq) error {
expectedMac, good := certificate.CheckBlockEraseCertificate(blockServiceId, cipher, req)
if !good {
log.RaiseAlert("bad MAC, got %v, expected %v", req.Certificate, expectedMac)
return msgs.BAD_CERTIFICATE
}
return nil
}
func eraseBlock(log *log.Logger, env *env, blockServiceId msgs.BlockServiceId, basePath string, blockId msgs.BlockId) error {
m := env.eraseLocks[blockServiceId]
m.Lock()
defer m.Unlock()
blockPath := path.Join(basePath, blockId.Path())
log.Debug("deleting block %v at path %v", blockId, blockPath)
err := eraseFileIfExistsAndSyncDir(blockPath)
if err != nil && !os.IsNotExist(err) {
return err
}
atomic.AddUint64(&env.stats[blockServiceId].blocksErased, 1)
return nil
}
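// writeBlocksResponse frames and sends a single response on the blocks TCP
// protocol: a little-endian u32 BLOCKS_RESP_PROTOCOL_VERSION, a u8 response
// kind, then the kind-specific body (resp.Pack). writeBlocksResponseError
// below uses the same framing with kind msgs.ERROR followed by a little-endian
// u16 error code.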
func writeBlocksResponse(log *log.Logger, w io.Writer, resp msgs.BlocksResponse) error {
log.Trace("writing response %T %+v", resp, resp)
buf := bytes.NewBuffer([]byte{})
if err := binary.Write(buf, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
return err
}
if _, err := buf.Write([]byte{uint8(resp.BlocksResponseKind())}); err != nil {
return err
}
if err := resp.Pack(buf); err != nil {
return err
}
if _, err := w.Write(buf.Bytes()); err != nil {
return err
}
return nil
}
func writeBlocksResponseError(log *log.Logger, w io.Writer, err msgs.TernError) error {
log.Debug("writing blocks error %v", err)
buf := bytes.NewBuffer([]byte{})
if err := binary.Write(buf, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
return err
}
if _, err := buf.Write([]byte{msgs.ERROR}); err != nil {
return err
}
if err := binary.Write(buf, binary.LittleEndian, uint16(err)); err != nil {
return err
}
if _, err := w.Write(buf.Bytes()); err != nil {
return err
}
return nil
}
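// newToOldReadConverter adapts a block stored in the with-crc format (each
// TERN_PAGE_SIZE page of data followed by its 4-byte CRC, i.e.
// TERN_PAGE_WITH_CRC_SIZE bytes per page) into the old format expected by
// FetchBlockResp clients, which carries only the page data: the 4 trailing CRC
// bytes of every page are skipped while copying.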
type newToOldReadConverter struct {
log *log.Logger
r io.Reader
b []byte
totalRead int
bytesInBuffer int
}
func (c *newToOldReadConverter) Read(p []byte) (int, error) {
var read int = 0
for len(p) > 0 {
c.log.Debug("{len_p: %d, read: %d, bytesInBuffer: %d, totalRead: %d}", len(p), read, c.bytesInBuffer, c.totalRead)
readFromFile, err := c.r.Read(c.b[c.bytesInBuffer:])
if err != nil && err != io.EOF {
return read, err
}
c.bytesInBuffer += readFromFile
if c.bytesInBuffer == 0 && err == io.EOF {
return read, io.EOF
}
offsetInBuffer := 0
for offsetInBuffer < c.bytesInBuffer && len(p) > 0 {
toCopy := c.bytesInBuffer - offsetInBuffer
if toCopy > len(p) {
toCopy = len(p)
}
offSetInPage := c.totalRead % int(msgs.TERN_PAGE_WITH_CRC_SIZE)
availableInPage := int(msgs.TERN_PAGE_SIZE) - offSetInPage
if toCopy > availableInPage {
toCopy = availableInPage
}
c.log.Debug("{toCopy: %d, offSetInPage: %d, availableInPage: %d, offsetInBuffer: %d, bytesInBuffer: %d}", toCopy, offSetInPage, availableInPage, offsetInBuffer, c.bytesInBuffer)
copy(p, c.b[offsetInBuffer:offsetInBuffer+toCopy])
c.totalRead += toCopy
read += toCopy
p = p[toCopy:]
offsetInBuffer += toCopy
offSetInPage += toCopy
if offSetInPage == int(msgs.TERN_PAGE_SIZE) {
if c.bytesInBuffer-offsetInBuffer < 4 {
break
}
c.totalRead += 4
offsetInBuffer += 4
}
}
copy(c.b, c.b[offsetInBuffer:c.bytesInBuffer])
c.bytesInBuffer -= offsetInBuffer
}
c.log.Debug("{len_p: %d, read: %d}", len(p), read)
return read, nil
}
func sendFetchBlock(log *log.Logger, env *env, blockServiceId msgs.BlockServiceId, basePath string, blockId msgs.BlockId, offset uint32, count uint32, conn *net.TCPConn, withCrc bool, fileId msgs.InodeId) error {
if offset%msgs.TERN_PAGE_SIZE != 0 {
log.RaiseAlert("trying to read from offset other than page boundary")
return msgs.BLOCK_FETCH_OUT_OF_BOUNDS
}
if count%msgs.TERN_PAGE_SIZE != 0 {
log.RaiseAlert("trying to read count which is not a multiple of page size")
return msgs.BLOCK_FETCH_OUT_OF_BOUNDS
}
pageCount := count / msgs.TERN_PAGE_SIZE
offsetPageCount := offset / msgs.TERN_PAGE_SIZE
blockPath := path.Join(basePath, blockId.Path())
log.Debug("fetching block id %v at path %v", blockId, blockPath)
f, err := os.Open(blockPath)
if errors.Is(err, syscall.ENODATA) {
// see <https://internal-repo/issues/106>
raiseAlertAndHardwareEvent(log, env.hostname, blockServiceId.String(),
fmt.Sprintf("could not open block %v, got ENODATA, this probably means that the block/disk is gone", blockPath))
// return io error, downstream code will pick it up
return syscall.EIO
}
if os.IsNotExist(err) {
log.ErrorNoAlert("could not find block to fetch at path %v for file %v. Request from client: %v", blockPath, fileId, conn.RemoteAddr())
return msgs.BLOCK_NOT_FOUND
}
if err != nil {
return err
}
defer func() {
if f != nil {
f.Close()
}
}()
fi, err := f.Stat()
if err != nil {
return err
}
preReadSize := fi.Size()
filePageCount := uint32(fi.Size()) / msgs.TERN_PAGE_WITH_CRC_SIZE
if offsetPageCount+pageCount > filePageCount {
log.RaiseAlert("malformed request for block %v. requested read at [%d - %d] but stored block size is %d", blockId, offset, offset+count, filePageCount*msgs.TERN_PAGE_SIZE)
return msgs.BLOCK_FETCH_OUT_OF_BOUNDS
}
if !env.readWholeFile {
preReadSize = int64(pageCount) * int64(msgs.TERN_PAGE_WITH_CRC_SIZE)
}
var reader io.ReadSeeker = f
if withCrc {
offset = offsetPageCount * msgs.TERN_PAGE_WITH_CRC_SIZE
count = pageCount * msgs.TERN_PAGE_WITH_CRC_SIZE
unix.Fadvise(int(f.Fd()), int64(offset), preReadSize, unix.FADV_SEQUENTIAL|unix.FADV_WILLNEED)
if _, err := reader.Seek(int64(offset), 0); err != nil {
return err
}
var resp msgs.BlocksResponse = &msgs.FetchBlockResp{}
if withCrc {
resp = &msgs.FetchBlockWithCrcResp{}
}
if err := writeBlocksResponse(log, conn, resp); err != nil {
return err
}
lf := io.LimitedReader{
R: reader,
N: int64(count),
}
read, err := conn.ReadFrom(&lf)
if err != nil {
return err
}
if read != int64(count) {
log.RaiseAlert("expected to read at least %v bytes, but only got %v for file %q", count, read, blockPath)
return msgs.INTERNAL_ERROR
}
} else {
// the only remaining case is that we have a file in new format and client wants old format
offset = offsetPageCount * msgs.TERN_PAGE_WITH_CRC_SIZE
unix.Fadvise(int(f.Fd()), int64(offset), preReadSize, unix.FADV_SEQUENTIAL|unix.FADV_WILLNEED)
if _, err := reader.Seek(int64(offset), 0); err != nil {
return err
}
if err := writeBlocksResponse(log, conn, &msgs.FetchBlockResp{}); err != nil {
return err
}
buf := env.bufPool.Get(1 << 20)
defer env.bufPool.Put(buf)
converter := newToOldReadConverter{
log: log,
r: reader,
b: buf.Bytes(),
totalRead: 0,
bytesInBuffer: 0,
}
lf := io.LimitedReader{
R: &converter,
N: int64(count),
}
read, err := conn.ReadFrom(&lf)
if err != nil {
return err
}
if read != int64(count) {
log.RaiseAlert("expected to read at least %v bytes, but only got %v for file %q", count, read, blockPath)
return msgs.INTERNAL_ERROR
}
}
s := env.stats[blockServiceId]
atomic.AddUint64(&s.blocksFetched, 1)
atomic.AddUint64(&s.bytesFetched, uint64(count))
return nil
}
func getPhysicalBlockSize(path string) (int, error) {
fs := syscall.Statfs_t{}
err := syscall.Statfs(path, &fs)
if err != nil {
return 0, err
}
return int(fs.Bsize), nil
}
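// checkBlock verifies a stored block without returning its contents: it checks
// that the on-disk size is a whole number of TERN_PAGE_WITH_CRC_SIZE pages
// matching the expected size, then re-verifies every page CRC and the
// whole-block CRC via verifyCrcReader before replying with CheckBlockResp.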
func checkBlock(log *log.Logger, env *env, blockServiceId msgs.BlockServiceId, basePath string, blockId msgs.BlockId, expectedSize uint32, crc msgs.Crc, conn *net.TCPConn) error {
blockPath := path.Join(basePath, blockId.Path())
log.Debug("checking block id %v at path %v", blockId, blockPath)
// we try to open the no-crc file first, as it is deleted only after the file with crc is created;
// if it doesn't exist in either place then it really is deleted
f, err := os.Open(blockPath)
if errors.Is(err, syscall.ENODATA) {
// see <https://internal-repo/issues/106>
raiseAlertAndHardwareEvent(log, env.failureDomain, blockServiceId.String(),
fmt.Sprintf("could not open block %v, got ENODATA, this probably means that the block/disk is gone", blockPath))
// return io error, downstream code will pick it up
return syscall.EIO
}
if os.IsNotExist(err) {
return msgs.BLOCK_NOT_FOUND
}
if err != nil {
return err
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
log.ErrorNoAlert("could not read file %v : %v", blockPath, err)
return err
}
s := env.stats[blockServiceId]
atomic.AddUint64(&s.blocksChecked, 1)
atomic.AddUint64(&s.bytesChecked, uint64(expectedSize))
if uint32(fi.Size())%msgs.TERN_PAGE_WITH_CRC_SIZE != 0 {
log.ErrorNoAlert("size %v for block %v, not multiple of TERN_PAGE_WITH_CRC_SIZE", uint32(fi.Size()), blockPath)
return msgs.BAD_BLOCK_CRC
}
actualDataSize := (uint32(fi.Size()) / msgs.TERN_PAGE_WITH_CRC_SIZE) * msgs.TERN_PAGE_SIZE
if actualDataSize != expectedSize {
log.ErrorNoAlert("size %v for block %v, not equal to expected size %v", actualDataSize, blockPath, expectedSize)
return msgs.BAD_BLOCK_CRC
}
bufPtr := env.bufPool.Get(1 << 20)
defer env.bufPool.Put(bufPtr)
err = verifyCrcReader(log, bufPtr.Bytes(), f, crc)
if errors.Is(err, syscall.ENODATA) {
// see <https://internal-repo/issues/106>
raiseAlertAndHardwareEvent(log, env.failureDomain, blockServiceId.String(),
fmt.Sprintf("could not open block %v, got ENODATA, this probably means that the block/disk is gone", blockPath))
// return io error, downstream code will pick it up
return syscall.EIO
}
if os.IsNotExist(err) {
return msgs.BLOCK_NOT_FOUND
}
if err != nil {
return err
}
if err := writeBlocksResponse(log, conn, &msgs.CheckBlockResp{}); err != nil {
return err
}
return nil
}
func checkWriteCertificate(log *log.Logger, cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) error {
expectedMac, good := certificate.CheckBlockWriteCertificate(cipher, blockServiceId, req)
if !good {
log.Debug("mac computed for %v %v %v %v", blockServiceId, req.BlockId, req.Crc, req.Size)
log.RaiseAlert("bad MAC, got %v, expected %v", req.Certificate, expectedMac)
return msgs.BAD_CERTIFICATE
}
return nil
}
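// writeToBuf reads exactly `size` bytes of block data from `reader` and lays
// it out in the with-crc format: each TERN_PAGE_SIZE page is followed by its
// little-endian crc32c, giving size/TERN_PAGE_SIZE pages of
// TERN_PAGE_WITH_CRC_SIZE bytes. On success the returned buffer is owned by
// the caller, who must return it to the pool.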
func writeToBuf(log *log.Logger, env *env, reader io.Reader, size int64) (*bufpool.Buf, error) {
readBufPtr := env.bufPool.Get(1 << 20)
defer env.bufPool.Put(readBufPtr)
readBuffer := readBufPtr.Bytes()
var err error
writeBufPtr := env.bufPool.Get(int(size / int64(msgs.TERN_PAGE_SIZE) * int64(msgs.TERN_PAGE_WITH_CRC_SIZE)))
writeBuffer := writeBufPtr.Bytes()
defer func() {
if err != nil {
env.bufPool.Put(writeBufPtr)
}
}()
readerHasMoreData := true
dataInReadBuffer := 0
dataInWriteBuffer := 0
endReadOffset := len(readBuffer)
for readerHasMoreData && size > int64(dataInReadBuffer) {
if int64(endReadOffset) > size {
endReadOffset = int(size)
}
// assign to the enclosing `err` so the deferred cleanup above also runs on read errors
var read int
read, err = reader.Read(readBuffer[dataInReadBuffer:endReadOffset])
if err != nil {
if err == io.EOF {
readerHasMoreData = false
err = nil
} else {
return nil, err
}
}
dataInReadBuffer += read
if dataInReadBuffer < int(msgs.TERN_PAGE_SIZE) {
continue
}
availablePages := dataInReadBuffer / int(msgs.TERN_PAGE_SIZE)
for i := 0; i < availablePages; i++ {
page := readBuffer[i*int(msgs.TERN_PAGE_SIZE) : (i+1)*int(msgs.TERN_PAGE_SIZE)]
dataInWriteBuffer += copy(writeBuffer[dataInWriteBuffer:], page)
pageCRC := crc32c.Sum(0, page)
binary.LittleEndian.PutUint32(writeBuffer[dataInWriteBuffer:dataInWriteBuffer+4], pageCRC)
dataInWriteBuffer += 4
}
size -= int64(availablePages) * int64(msgs.TERN_PAGE_SIZE)
dataInReadBuffer = copy(readBuffer[:], readBuffer[availablePages*int(msgs.TERN_PAGE_SIZE):dataInReadBuffer])
}
if !readerHasMoreData && (size-int64(dataInReadBuffer) > 0) {
log.Debug("failed converting block, reached EOF in input stream, missing %d bytes", size-int64(dataInReadBuffer))
err = io.EOF
return nil, err
}
if dataInReadBuffer != 0 || size != 0 {
log.Debug("failed converting block, unexpected data size. left in read buffer %d, remaining size %d", dataInReadBuffer, size)
err = msgs.BAD_BLOCK_CRC
return nil, err
}
if dataInWriteBuffer != len(writeBuffer) {
log.Debug("failed converting block, unexpected write buffer size. expected %d, got %d", len(writeBuffer), dataInWriteBuffer)
err = msgs.BAD_BLOCK_CRC
return nil, err
}
return writeBufPtr, nil
}
func writeBlockInternal(
log *log.Logger,
env *env,
reader io.LimitedReader,
blockServiceId msgs.BlockServiceId,
expectedCrc msgs.Crc,
blockId msgs.BlockId,
filePath string,
) error {
// We don't check CRC here, we fully check tmpFile after it has been written and synced
bufPtr, err := writeToBuf(log, env, reader.R, reader.N)
if err != nil {
return err
}
defer env.bufPool.Put(bufPtr)
tmpFile, err := writeBufToTemp(&env.stats[blockServiceId].bytesWritten, path.Dir(filePath), bufPtr.Bytes())
if err != nil {
return err
}
defer os.Remove(tmpFile)
readBufPtr := env.bufPool.Get(1 << 20)
defer env.bufPool.Put(readBufPtr)
err = verifyCrcFile(log, readBufPtr.Bytes(), tmpFile, int64(len(bufPtr.Bytes())), expectedCrc)
if err != nil {
log.ErrorNoAlert("failed writing block %v in blockservice %v with error : %v", blockId, blockServiceId, err)
return err
}
err = moveFileAndSyncDir(tmpFile, filePath)
if err != nil {
log.ErrorNoAlert("failed writing block %v in blockservice %v with error : %v", blockId, blockServiceId, err)
return err
}
return nil
}
func writeBlock(
log *log.Logger,
env *env,
blockServiceId msgs.BlockServiceId, cipher cipher.Block, basePath string,
blockId msgs.BlockId, expectedCrc msgs.Crc, size uint32, conn *net.TCPConn,
) error {
filePath := path.Join(basePath, blockId.Path())
log.Debug("writing block %v at path %v", blockId, filePath)
err := writeBlockInternal(log, env, io.LimitedReader{R: conn, N: int64(size)}, blockServiceId, expectedCrc, blockId, filePath)
if err != nil {
return err
}
log.Debug("writing proof")
if err := writeBlocksResponse(log, conn, &msgs.WriteBlockResp{Proof: BlockWriteProof(blockServiceId, blockId, cipher)}); err != nil {
return err
}
atomic.AddUint64(&env.stats[blockServiceId].blocksWritten, 1)
return nil
}
func testWrite(
log *log.Logger, env *env, blockServiceId msgs.BlockServiceId, basePath string, size uint64, conn *net.TCPConn,
) error {
filePath := path.Join(basePath, fmt.Sprintf("tmp.test-write%d", mrand.Int63()))
defer os.Remove(filePath)
err := writeBlockInternal(log, env, io.LimitedReader{R: conn, N: int64(size)}, blockServiceId, 0, msgs.BlockId(0), filePath)
if err != nil {
return err
}
if err := writeBlocksResponse(log, conn, &msgs.TestWriteResp{}); err != nil {
return err
}
return nil
}
const PAST_CUTOFF time.Duration = 22 * time.Hour
const DEFAULT_FUTURE_CUTOFF time.Duration = 1 * time.Hour
const WRITE_FUTURE_CUTOFF time.Duration = 5 * time.Minute
const MAX_OBJECT_SIZE uint32 = 100 << 20
// The bool is whether we should keep going
func handleRequestError(
log *log.Logger,
blockServices map[msgs.BlockServiceId]*blockService,
deadBlockServices map[msgs.BlockServiceId]deadBlockService,
conn *net.TCPConn,
lastError *error,
blockServiceId msgs.BlockServiceId,
req msgs.BlocksMessageKind,
err error,
) bool {
defer func() {
*lastError = err
}()
if err == io.EOF {
log.Debug("got EOF from %v, terminating", conn.RemoteAddr())
return false
}
rootErr := err
for {
unwrapped := errors.Unwrap(rootErr)
if unwrapped == nil {
break
}
rootErr = unwrapped
}
if netErr, ok := rootErr.(net.Error); ok && netErr.Timeout() {
log.Debug("got timeout from %v, terminating", conn.RemoteAddr())
return false
}
if sysErr, ok := rootErr.(syscall.Errno); ok {
if sysErr == syscall.EPIPE {
log.Info("got broken pipe error from %v, terminating", conn.RemoteAddr())
return false
}
if sysErr == syscall.ECONNRESET {
log.Info("got connection reset error from %v, terminating", conn.RemoteAddr())
return false
}
}
if errors.Is(err, syscall.EIO) && blockServiceId != 0 {
blockService := blockServices[blockServiceId]
if blockService.couldNotUpdateInfoBlocks || blockService.couldNotUpdateInfoCapacity {
err = msgs.BLOCK_IO_ERROR_DEVICE
} else {
err = msgs.BLOCK_IO_ERROR_FILE
}
atomic.AddUint64(&blockService.ioErrors, 1)
log.ErrorNoAlert("got unexpected IO error %v from %v for req kind %v, block service %v, will return %v, previous error: %v", err, conn.RemoteAddr(), req, blockServiceId, err, *lastError)
writeBlocksResponseError(log, conn, err.(msgs.TernError))
return false
}
// In general, refuse to service requests for block services that we
// don't have. In the case of checking a dead block, we don't emit
// an alert, since it's expected when the scrubber is running.
// Similarly, reading blocks from old block services can happen if
// the cached span structure is used in the kmod.
if _, isDead := deadBlockServices[blockServiceId]; isDead && (req == msgs.CHECK_BLOCK || req == msgs.FETCH_BLOCK || req == msgs.FETCH_BLOCK_WITH_CRC) {
log.Info("got fetch/check block request for dead block service %v", blockServiceId)
if ternErr, isTernErr := err.(msgs.TernError); isTernErr {
if err := writeBlocksResponseError(log, conn, ternErr); err != nil {
log.Info("could not write response error to %v, will terminate connection: %v", conn.RemoteAddr(), err)
return false
}
}
return true
}
// we always raise an alert since this is almost always bad news in the block service
if !errors.Is(err, syscall.ENOSPC) && err != msgs.BLOCK_SERVICE_NOT_FOUND && err != msgs.BLOCK_NOT_FOUND {
log.RaiseAlertStack("", 1, "got unexpected error %v from %v for req kind %v, block service %v, previous error %v", err, conn.RemoteAddr(), req, blockServiceId, *lastError)
}
if ternErr, isTernErr := err.(msgs.TernError); isTernErr {
if err := writeBlocksResponseError(log, conn, ternErr); err != nil {
log.Info("could not write response error to %v, will terminate connection: %v", conn.RemoteAddr(), err)
return false
}
// Keep the connection around using a whitelist of conditions where we know
// that the stream is safe. Right now I just added one case which I know
// is safe, we can add others conservatively in the future if we wish to.
safeError := false
safeError = safeError || ((req == msgs.CHECK_BLOCK || req == msgs.FETCH_BLOCK || req == msgs.FETCH_BLOCK_WITH_CRC) && ternErr == msgs.BLOCK_NOT_FOUND)
if safeError {
log.Info("preserving connection from %v after err %v", conn.RemoteAddr(), err)
return true
} else {
log.Info("not preserving connection from %v after err %v", conn.RemoteAddr(), err)
return false
}
} else {
// attempt to say goodbye, ignore errors
writeBlocksResponseError(log, conn, msgs.INTERNAL_ERROR)
log.Info("tearing down connection from %v after internal error %v", conn.RemoteAddr(), err)
return false
}
}
type deadBlockService struct{}
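// readBlocksRequest parses a single request from the blocks TCP protocol:
// a little-endian u32 protocol version (must be BLOCKS_REQ_PROTOCOL_VERSION),
// a little-endian u64 block service id, a u8 message kind, then the
// kind-specific body (req.Unpack).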
func readBlocksRequest(
log *log.Logger,
r io.Reader,
) (msgs.BlockServiceId, msgs.BlocksRequest, error) {
var protocol uint32
if err := binary.Read(r, binary.LittleEndian, &protocol); err != nil {
return 0, nil, err
}
if protocol != msgs.BLOCKS_REQ_PROTOCOL_VERSION {
log.RaiseAlert("bad blocks protocol, expected %v, got %v", msgs.BLOCKS_REQ_PROTOCOL_VERSION, protocol)
return 0, nil, msgs.MALFORMED_REQUEST
}
var blockServiceId uint64
if err := binary.Read(r, binary.LittleEndian, &blockServiceId); err != nil {
return 0, nil, err
}
var kindByte [1]byte
if _, err := io.ReadFull(r, kindByte[:]); err != nil {
return 0, nil, err
}
kind := msgs.BlocksMessageKind(kindByte[0])
var req msgs.BlocksRequest
switch kind {
case msgs.ERASE_BLOCK:
req = &msgs.EraseBlockReq{}
case msgs.FETCH_BLOCK:
req = &msgs.FetchBlockReq{}
case msgs.FETCH_BLOCK_WITH_CRC:
req = &msgs.FetchBlockWithCrcReq{}
case msgs.WRITE_BLOCK:
req = &msgs.WriteBlockReq{}
case msgs.TEST_WRITE:
req = &msgs.TestWriteReq{}
case msgs.CHECK_BLOCK:
req = &msgs.CheckBlockReq{}
default:
log.RaiseAlert("bad blocks request kind %v", kind)
return 0, nil, msgs.MALFORMED_REQUEST
}
if err := req.Unpack(r); err != nil {
return 0, nil, err
}
return msgs.BlockServiceId(blockServiceId), req, nil
}
// The bool tells us whether we should keep going
func handleSingleRequest(
log *log.Logger,
env *env,
_ chan any,
lastError *error,
blockServices map[msgs.BlockServiceId]*blockService,
deadBlockServices map[msgs.BlockServiceId]deadBlockService,
conn *net.TCPConn,
futureCutoff time.Duration,
connectionTimeout time.Duration,
) bool {
if connectionTimeout != 0 {
conn.SetReadDeadline(time.Now().Add(connectionTimeout))
}
blockServiceId, req, err := readBlocksRequest(log, conn)
if err != nil {
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, 0, 0, err)
}
kind := req.BlocksRequestKind()
t := time.Now()
defer func() {
env.counters[kind].Add(time.Since(t))
}()
log.Debug("servicing request of type %T from %v", req, conn.RemoteAddr())
log.Trace("req %+v", req)
defer log.Debug("serviced request of type %T from %v", req, conn.RemoteAddr())
if connectionTimeout != 0 {
// Reset timeout, with default settings this will give
// the request a minute to complete, given that the max
// block size is 10MiB, that is ~0.17MiB/s, so it should
// be plenty of time unless something is wrong.
//
// If we didn't reset this (or just remove the timeout)
// the previous timeout might very well trip the request
// because it might have been almost expired.
conn.SetDeadline(time.Now().Add(connectionTimeout))
}
blockService, found := blockServices[blockServiceId]
if !found {
// In general, refuse to service requests for block services that we
// don't have.
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, msgs.BLOCK_SERVICE_NOT_FOUND)
}
atomic.AddUint64(&blockService.requests, 1)
switch whichReq := req.(type) {
case *msgs.EraseBlockReq:
if err := checkEraseCertificate(log, blockServiceId, blockService.cipher, whichReq); err != nil {
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
cutoffTime := msgs.TernTime(uint64(whichReq.BlockId)).Time().Add(futureCutoff)
now := time.Now()
if now.Before(cutoffTime) {
log.ErrorNoAlert("block %v is too recent to be deleted (now=%v, cutoffTime=%v)", whichReq.BlockId, now, cutoffTime)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, msgs.BLOCK_TOO_RECENT_FOR_DELETION)
}
if err := eraseBlock(log, env, blockServiceId, blockService.path, whichReq.BlockId); err != nil {
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
resp := msgs.EraseBlockResp{
Proof: certificate.BlockEraseProof(blockServiceId, whichReq.BlockId, blockService.cipher),
}
if err := writeBlocksResponse(log, conn, &resp); err != nil {
log.Info("could not send blocks response to %v: %v", conn.RemoteAddr(), err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
case *msgs.FetchBlockReq:
if err := sendFetchBlock(log, env, blockServiceId, blockService.path, whichReq.BlockId, whichReq.Offset, whichReq.Count, conn, false, msgs.InodeId(0)); err != nil {
log.Info("could not send block response to %v: %v", conn.RemoteAddr(), err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
case *msgs.FetchBlockWithCrcReq:
if err := sendFetchBlock(log, env, blockServiceId, blockService.path, whichReq.BlockId, whichReq.Offset, whichReq.Count, conn, true, whichReq.FileId); err != nil {
log.Info("could not send block response to %v: %v", conn.RemoteAddr(), err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
case *msgs.WriteBlockReq:
pastCutoffTime := msgs.TernTime(uint64(whichReq.BlockId)).Time().Add(-PAST_CUTOFF)
futureCutoffTime := msgs.TernTime(uint64(whichReq.BlockId)).Time().Add(WRITE_FUTURE_CUTOFF)
now := time.Now()
if now.Before(pastCutoffTime) {
panic(fmt.Errorf("block %v is in the future! (now=%v, pastCutoffTime=%v)", whichReq.BlockId, now, pastCutoffTime))
}
if now.After(futureCutoffTime) {
log.ErrorNoAlert("block %v is too old to be written (now=%v, futureCutoffTime=%v)", whichReq.BlockId, now, futureCutoffTime)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, msgs.BLOCK_TOO_OLD_FOR_WRITE)
}
if err := checkWriteCertificate(log, blockService.cipher, blockServiceId, whichReq); err != nil {
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
if whichReq.Size > MAX_OBJECT_SIZE {
log.RaiseAlert("block %v exceeds max object size: %v > %v", whichReq.BlockId, whichReq.Size, MAX_OBJECT_SIZE)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, msgs.BLOCK_TOO_BIG)
}
if err := writeBlock(log, env, blockServiceId, blockService.cipher, blockService.path, whichReq.BlockId, whichReq.Crc, whichReq.Size, conn); err != nil {
log.Info("could not write block: %v", err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
case *msgs.CheckBlockReq:
if err := checkBlock(log, env, blockServiceId, blockService.path, whichReq.BlockId, whichReq.Size, whichReq.Crc, conn); err != nil {
log.Info("checking block failed, conn %v, err %v", conn.RemoteAddr(), err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
case *msgs.TestWriteReq:
if err := testWrite(log, env, blockServiceId, blockService.path, whichReq.Size, conn); err != nil {
log.Info("could not perform test write: %v", err)
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, err)
}
default:
return handleRequestError(log, blockServices, deadBlockServices, conn, lastError, blockServiceId, kind, fmt.Errorf("bad request type %T", req))
}
return true
}
func handleRequest(
log *log.Logger,
env *env,
terminateChan chan any,
blockServices map[msgs.BlockServiceId]*blockService,
deadBlockServices map[msgs.BlockServiceId]deadBlockService,
conn *net.TCPConn,
futureCutoff time.Duration,
connectionTimeout time.Duration,
) {
defer conn.Close()
var lastError error
for {
keepGoing := handleSingleRequest(log, env, terminateChan, &lastError, blockServices, deadBlockServices, conn, futureCutoff, connectionTimeout)
if !keepGoing {
return
}
}
}
func usage() {
fmt.Fprintf(os.Stderr, "Usage: %v DIRECTORY STORAGE_CLASS...\n\n", os.Args[0])
description := `
For each directory/storage class pair specified we'll have one block
service. The block service id for each will be automatically generated
when running for the first time. The failure domain will be the same for all of them.
The intention is that a single blockservice process will service
a storage node.
Options:`
description = strings.TrimSpace(description)
fmt.Fprintln(os.Stderr, description)
flag.PrintDefaults()
}
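// retrieveOrCreateKey loads the block service's 16-byte secret key from
// dir/secret.key, creating it (and the with_crc subdirectory) on first run.
// The file holds the 16 key bytes followed by their little-endian crc32c,
// which is verified on every subsequent start. The file is flock'd exclusively
// so two processes cannot serve the same directory concurrently.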
func retrieveOrCreateKey(log *log.Logger, dir string) [16]byte {
var err error
var keyFile *os.File
keyFilePath := path.Join(dir, "secret.key")
keyFile, err = os.OpenFile(keyFilePath, os.O_APPEND|os.O_CREATE|os.O_RDWR, 0600)
if err != nil {
fmt.Fprintf(os.Stderr, "could not open key file %v: %v\n", keyFilePath, err)
os.Exit(1)
}
keyFile.Seek(0, 0)
if err := syscall.Flock(int(keyFile.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
fmt.Fprintf(os.Stderr, "could not lock key file %v: %v\n", keyFilePath, err)
os.Exit(1)
}
var key [16]byte
var read int
read, err = keyFile.Read(key[:])
if err != nil && err != io.EOF {
fmt.Fprintf(os.Stderr, "could not read key file %v: %v\n", keyFilePath, err)
os.Exit(1)
}
if err == io.EOF {
log.Info("creating new secret key")
if _, err := crand.Read(key[:]); err != nil {
panic(err)
}
if _, err := keyFile.Write(key[:]); err != nil {
panic(err)
}
keyCrc := crc32c.Sum(0, key[:])
if err := binary.Write(keyFile, binary.LittleEndian, keyCrc); err != nil {
panic(err)
}
log.Info("creating directory structure")
if err := os.Mkdir(path.Join(dir, "with_crc"), 0755); err != nil && !os.IsExist(err) {
panic(fmt.Errorf("failed to create folder %s error: %v", path.Join(dir, "with_crc"), err))
}
} else if read != 16 {
panic(fmt.Errorf("short secret key (%v rather than 16 bytes)", read))
} else {
expectedKeyCrc := crc32c.Sum(0, key[:])
var actualKeyCrc uint32
if err := binary.Read(keyFile, binary.LittleEndian, &actualKeyCrc); err != nil {
panic(err)
}
if expectedKeyCrc != actualKeyCrc {
panic(fmt.Errorf("expected crc %v, got %v", msgs.Crc(expectedKeyCrc), msgs.Crc(actualKeyCrc)))
}
}
return key
}
type diskStats struct {
readMs uint64
writeMs uint64
weightedIoMs uint64
}
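// getDiskStats parses /proc/diskstats (or any file with the same layout) and
// returns, keyed by "major:minor", the milliseconds spent reading, writing,
// and the weighted milliseconds spent doing I/O, per the kernel iostats
// documentation linked below.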
func getDiskStats(log *log.Logger, statsPath string) (map[string]diskStats, error) {
file, err := os.Open(statsPath)
if err != nil {
return nil, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
ret := make(map[string]diskStats)
for scanner.Scan() {
line := scanner.Text()
fields := strings.Fields(line)
// we need at least the 3 "major minor name" columns plus the 11 stats fields
if len(fields) < 14 {
log.RaiseAlert("malformed disk entry in %v: %v", statsPath, line)
continue
}
devId := fmt.Sprintf("%s:%s", fields[0], fields[1])
// https://www.kernel.org/doc/html/v5.4/admin-guide/iostats.html
readMs, err := strconv.ParseUint(fields[6], 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse reads ms from %s", line)
}
writeMs, err := strconv.ParseUint(fields[10], 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse write ms from %s", line)
}
// Field 11 (weighted ms doing I/O) is the 14th column, after "major minor name"
weightedIoMs, err := strconv.ParseUint(fields[13], 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to parse weighted IO ms from %s", line)
}
ret[devId] = diskStats{
readMs: readMs,
writeMs: writeMs,
weightedIoMs: weightedIoMs,
}
}
return ret, nil
}
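// raiseAlerts checks every five minutes what fraction of requests to each
// block service ended in I/O errors; if it exceeds ioAlertPercent it raises an
// alert and asks shuckle to decommission that block service, otherwise it
// clears the alert.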
func raiseAlerts(log *log.Logger, env *env, blockServices map[msgs.BlockServiceId]*blockService) {
for {
for bsId, bs := range blockServices {
ioErrors := bs.lastIoErrors
requests := bs.lastRequests
bs.lastIoErrors = atomic.LoadUint64(&bs.ioErrors)
bs.lastRequests = atomic.LoadUint64(&bs.requests)
ioErrors = bs.lastIoErrors - ioErrors
requests = bs.lastRequests - requests
if requests*uint64(env.ioAlertPercent) < ioErrors*100 {
log.RaiseNC(&bs.ioErrorsAlert, "block service %v had %v ioErrors from %v requests in the last 5 minutes which is over the %d%% threshold", bsId, ioErrors, requests, env.ioAlertPercent)
log.Info("decommissioning block service %v", bs.cachedInfo.Id)
env.shuckleConn.Request(&msgs.DecommissionBlockServiceReq{Id: bs.cachedInfo.Id})
} else {
log.ClearNC(&bs.ioErrorsAlert)
}
}
time.Sleep(5 * time.Minute)
}
}
func sendMetrics(l *log.Logger, env *env, influxDB *log.InfluxDB, blockServices map[msgs.BlockServiceId]*blockService, failureDomain string) {
metrics := log.MetricsBuilder{}
rand := wyhash.New(mrand.Uint64())
alert := l.NewNCAlert(10 * time.Second)
failureDomainEscaped := strings.ReplaceAll(failureDomain, " ", "-")
for {
diskMetrics, err := getDiskStats(l, "/proc/diskstats")
if err != nil {
l.RaiseAlert("failed reading diskstats: %v", err)
}
l.Info("sending metrics")
metrics.Reset()
now := time.Now()
for bsId, bsStats := range env.stats {
metrics.Measurement("eggsfs_blocks_write")
metrics.Tag("blockservice", bsId.String())
metrics.Tag("failuredomain", failureDomainEscaped)
metrics.Tag("pathprefix", env.pathPrefix)
metrics.FieldU64("bytes", bsStats.bytesWritten)
metrics.FieldU64("blocks", bsStats.blocksWritten)
metrics.Timestamp(now)
metrics.Measurement("eggsfs_blocks_read")
metrics.Tag("blockservice", bsId.String())
metrics.Tag("failuredomain", failureDomainEscaped)
metrics.Tag("pathprefix", env.pathPrefix)
metrics.FieldU64("bytes", bsStats.bytesFetched)
metrics.FieldU64("blocks", bsStats.blocksFetched)
metrics.Timestamp(now)
metrics.Measurement("eggsfs_blocks_erase")
metrics.Tag("blockservice", bsId.String())
metrics.Tag("failuredomain", failureDomainEscaped)
metrics.Tag("pathprefix", env.pathPrefix)
metrics.FieldU64("blocks", bsStats.blocksErased)
metrics.Timestamp(now)
metrics.Measurement("eggsfs_blocks_check")
metrics.Tag("blockservice", bsId.String())
metrics.Tag("failuredomain", failureDomainEscaped)
metrics.Tag("pathprefix", env.pathPrefix)
metrics.FieldU64("blocks", bsStats.blocksChecked)
metrics.FieldU64("bytes", bsStats.bytesChecked)
metrics.Timestamp(now)
}
for bsId, bsInfo := range blockServices {
metrics.Measurement("eggsfs_blocks_storage")
metrics.Tag("blockservice", bsId.String())
metrics.Tag("failuredomain", failureDomainEscaped)
metrics.Tag("pathprefix", env.pathPrefix)
metrics.Tag("storageclass", bsInfo.storageClass.String())
metrics.FieldU64("capacity", bsInfo.cachedInfo.CapacityBytes)
metrics.FieldU64("available", bsInfo.cachedInfo.AvailableBytes)
metrics.FieldU64("blocks", bsInfo.cachedInfo.Blocks)
metrics.FieldU64("io_errors", bsInfo.ioErrors)
dm, found := diskMetrics[bsInfo.devId]
if found {
metrics.FieldU64("read_ms", dm.readMs)
metrics.FieldU64("write_ms", dm.writeMs)
metrics.FieldU64("weighted_io_ms", dm.weightedIoMs)
}
metrics.Timestamp(now)
}
err = influxDB.SendMetrics(metrics.Payload())
if err == nil {
l.ClearNC(alert)
sleepFor := time.Minute + time.Duration(rand.Uint64() & ^(uint64(1)<<63))%time.Minute
l.Info("metrics sent, sleeping for %v", sleepFor)
time.Sleep(sleepFor)
} else {
l.RaiseNC(alert, "failed to send metrics, will try again in a second: %v", err)
time.Sleep(time.Second)
}
}
}
type blockService struct {
path string
devId string
key [16]byte
cipher cipher.Block
storageClass msgs.StorageClass
cachedInfo msgs.RegisterBlockServiceInfo
couldNotUpdateInfoBlocks bool
couldNotUpdateInfoBlocksAlert log.XmonNCAlert
couldNotUpdateInfoCapacity bool
couldNotUpdateInfoCapacityAlert log.XmonNCAlert
ioErrorsAlert log.XmonNCAlert
ioErrors uint64
requests uint64
lastIoErrors uint64
lastRequests uint64
}
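// getMountsInfo parses /proc/self/mountinfo (or any file with the same layout)
// and maps each mount point to the "major:minor" id of the mounted device, so
// block service directories can be matched up with /proc/diskstats entries.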
func getMountsInfo(log *log.Logger, mountsPath string) (map[string]string, error) {
file, err := os.Open(mountsPath)
if err != nil {
return nil, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
ret := make(map[string]string)
for scanner.Scan() {
line := scanner.Text()
mountFields := strings.Fields(line)
if len(mountFields) < 11 {
log.RaiseAlert("malformed mount in %v: %v", mountsPath, line)
continue
}
path := mountFields[4]
// should be major:minor of the mounted disk
ret[path] = mountFields[2]
}
return ret, nil
}
func main() {
flag.Usage = usage
failureDomainStr := flag.String("failure-domain", "", "Failure domain")
hostname := flag.String("hostname", "", "Hostname (for hardware event reporting)")
pathPrefixStr := flag.String("path-prefix", "", "We filter our block service not only by failure domain but also by path prefix")
futureCutoff := flag.Duration("future-cutoff", DEFAULT_FUTURE_CUTOFF, "")
var addresses flags.StringArrayFlags
flag.Var(&addresses, "addr", "Addresses (up to two) to bind to, and that will be advertised to shuckle.")
verbose := flag.Bool("verbose", false, "")
xmon := flag.String("xmon", "", "Xmon address (empty for no xmon)")
trace := flag.Bool("trace", false, "")
logFile := flag.String("log-file", "", "If empty, stdout")
shuckleAddress := flag.String("shuckle", "", "Shuckle address (host:port).")
hardwareEventAddress := flag.String("hardwareevent", "", "Server address (host:port) to send hardware events to OR empty for no event logging")
profileFile := flag.String("profile-file", "", "")
syslog := flag.Bool("syslog", false, "")
connectionTimeout := flag.Duration("connection-timeout", 10*time.Minute, "")
reservedStorage := flag.Uint64("reserved-storage", 100<<30, "How many bytes to reserve and under-report capacity")
influxDBOrigin := flag.String("influx-db-origin", "", "Base URL to InfluxDB endpoint")
influxDBOrg := flag.String("influx-db-org", "", "InfluxDB org")
influxDBBucket := flag.String("influx-db-bucket", "", "InfluxDB bucket")
locationId := flag.Uint("location", 10000, "Location ID")
readWholeFile := flag.Bool("read-whole-file", false, "")
ioAlertPercent := flag.Uint("io-alert-percent", 10, "Threshold percent of I/O errors over which we alert")
shuckleConnectionTimeout := flag.Duration("shuckle-connection-timeout", 10*time.Second, "")
flag.Parse()
flagErrors := false
if flag.NArg()%2 != 0 {
fmt.Fprintf(os.Stderr, "Malformed directory/storage class pairs.\n\n")
flagErrors = true
}
if flag.NArg() < 2 {
fmt.Fprintf(os.Stderr, "Expected at least one block service.\n\n")
flagErrors = true
}
if *shuckleAddress == "" {
fmt.Fprintf(os.Stderr, "You need to specify -shuckle.\n")
flagErrors = true
}
if len(addresses) == 0 || len(addresses) > 2 {
fmt.Fprintf(os.Stderr, "At least one and at most two -addr flags must be provided\n")
flagErrors = true
}
if *locationId > 255 {
fmt.Fprintf(os.Stderr, "-location must be between 0 and 255\n")
flagErrors = true
}
if *ioAlertPercent > 100 {
fmt.Fprintf(os.Stderr, "io-alert-percent should not be above 100\n")
flagErrors = true
}
if *failureDomainStr == "" {
fmt.Fprintf(os.Stderr, "-failure-domain cannot be empty\n")
flagErrors = true
}
if *pathPrefixStr == "" {
*pathPrefixStr = *failureDomainStr
}
if *hardwareEventAddress != "" && *hostname == "" {
fmt.Fprintf(os.Stderr, "-hostname must be provided if you need hardware event reporting\n")
flagErrors = true
}
var influxDB *log.InfluxDB
if *influxDBOrigin == "" {
if *influxDBOrg != "" || *influxDBBucket != "" {
fmt.Fprintf(os.Stderr, "Either all or none of the -influx-db flags must be passed\n")
flagErrors = true
}
} else {
if *influxDBOrg == "" || *influxDBBucket == "" {
fmt.Fprintf(os.Stderr, "Either all or none of the -influx-db flags must be passed\n")
flagErrors = true
}
influxDB = &log.InfluxDB{
Origin: *influxDBOrigin,
Org: *influxDBOrg,
Bucket: *influxDBBucket,
}
}
if flagErrors {
usage()
os.Exit(2)
}
ownIp1, port1, err := flags.ParseIPV4Addr(addresses[0])
if err != nil {
panic(err)
}
var ownIp2 [4]byte
var port2 uint16
if len(addresses) == 2 {
ownIp2, port2, err = flags.ParseIPV4Addr(addresses[1])
if err != nil {
panic(err)
}
}
var failureDomain [16]byte
if copy(failureDomain[:], []byte(*failureDomainStr)) != len(*failureDomainStr) {
fmt.Fprintf(os.Stderr, "Failure domain too long -- must be at most 16 characters: %v\n\n", *failureDomainStr)
usage()
os.Exit(2)
}
// create all directories first, we might need them for the log output
for i := 0; i < flag.NArg(); i += 2 {
dir := flag.Args()[i]
if err := os.Mkdir(dir, 0777); err != nil && !os.IsExist(err) {
panic(fmt.Errorf("could not create data dir %v", dir))
}
}
logOut := os.Stdout
if *logFile != "" {
var err error
logOut, err = os.OpenFile(*logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
fmt.Fprintf(os.Stderr, "could not open log file %v: %v\n", *logFile, err)
os.Exit(1)
}
defer logOut.Close()
}
level := log.INFO
if *verbose {
level = log.DEBUG
}
if *trace {
level = log.TRACE
}
l := log.NewLogger(logOut, &log.LoggerOptions{
Level: level,
Syslog: *syslog,
XmonAddr: *xmon,
HardwareEventServerURL: *hardwareEventAddress,
AppInstance: "eggsblocks",
AppType: "restech_eggsfs.daytime",
PrintQuietAlerts: true,
})
if *profileFile != "" {
f, err := os.Create(*profileFile)
if err != nil {
fmt.Fprintf(os.Stderr, "Could not open profile file %v: %v\n", *profileFile, err)
os.Exit(1)
}
pprof.StartCPUProfile(f)
stopCpuProfile := func() {
l.Info("stopping cpu profile")
pprof.StopCPUProfile()
}
defer stopCpuProfile()
// Save CPU profile if we get killed by a signal
signalChan := make(chan os.Signal, 1)
signal.Notify(signalChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGILL, syscall.SIGTRAP, syscall.SIGABRT, syscall.SIGSTKFLT, syscall.SIGSYS)
go func() {
sig := <-signalChan
signal.Stop(signalChan)
stopCpuProfile()
syscall.Kill(syscall.Getpid(), sig.(syscall.Signal))
}()
}
l.Info("Running block service with options:")
l.Info(" locationId = %v", *locationId)
l.Info(" failureDomain = %v", *failureDomainStr)
l.Info(" pathPrefix = %v", *pathPrefixStr)
l.Info(" futureCutoff = %v", *futureCutoff)
l.Info(" addr = '%v'", addresses)
l.Info(" logLevel = %v", level)
l.Info(" logFile = '%v'", *logFile)
l.Info(" shuckleAddress = '%v'", *shuckleAddress)
l.Info(" connectionTimeout = %v", *connectionTimeout)
l.Info(" reservedStorage = %v", *reservedStorage)
l.Info(" shuckleConnectionTimeout = %v", *shuckleConnectionTimeout)
bufPool := bufpool.NewBufPool()
env := &env{
bufPool: bufPool,
stats: make(map[msgs.BlockServiceId]*blockServiceStats),
eraseLocks: make(map[msgs.BlockServiceId]*sync.Mutex),
readWholeFile: *readWholeFile,
failureDomain: *failureDomainStr,
pathPrefix: *pathPrefixStr,
ioAlertPercent: uint8(*ioAlertPercent),
shuckleConn: client.MakeShuckleConn(l, nil, *shuckleAddress, 1),
}
mountsInfo, err := getMountsInfo(l, "/proc/self/mountinfo")
if err != nil {
l.RaiseAlert("Disk stats for mounted paths will not be collected due to failure collecting mount info: %v", err)
}
blockServices := make(map[msgs.BlockServiceId]*blockService)
for i := 0; i < flag.NArg(); i += 2 {
dir := flag.Args()[i]
storageClass := msgs.StorageClassFromString(flag.Args()[i+1])
if storageClass == msgs.EMPTY_STORAGE || storageClass == msgs.INLINE_STORAGE {
fmt.Fprintf(os.Stderr, "Storage class cannot be EMPTY/INLINE\n")
os.Exit(2)
}
key := retrieveOrCreateKey(l, dir)
id := blockServiceIdFromKey(key)
cipher, err := aes.NewCipher(key[:])
if err != nil {
panic(fmt.Errorf("could not create AES-128 key: %w", err))
}
devId, found := mountsInfo[dir]
if !found {
devId = ""
}
blockServices[id] = &blockService{
path: dir,
devId: devId,
key: key,
cipher: cipher,
storageClass: storageClass,
couldNotUpdateInfoBlocksAlert: *l.NewNCAlert(time.Second),
couldNotUpdateInfoCapacityAlert: *l.NewNCAlert(time.Second),
ioErrorsAlert: *l.NewNCAlert(time.Second),
}
}
for id, blockService := range blockServices {
l.Info("block service %v at %v, storage class %v", id, blockService.path, blockService.storageClass)
}
if len(blockServices) != flag.NArg()/2 {
panic(fmt.Errorf("duplicate block services"))
}
// Now ask shuckle for block services we _had_ before. We need to know this to honor
// erase block requests for old block services safely.
deadBlockServices := make(map[msgs.BlockServiceId]deadBlockService)
{
var shuckleBlockServices []msgs.BlockServiceDeprecatedInfo
{
alert := l.NewNCAlert(0)
l.RaiseNC(alert, "fetching block services")
resp, err := env.shuckleConn.Request(&msgs.AllBlockServicesDeprecatedReq{})
if err != nil {
panic(fmt.Errorf("could not request block services from shuckle: %v", err))
}
l.ClearNC(alert)
shuckleBlockServices = resp.(*msgs.AllBlockServicesDeprecatedResp).BlockServices
}
for i := range shuckleBlockServices {
bs := &shuckleBlockServices[i]
ourBs, weHaveBs := blockServices[bs.Id]
sameFailureDomain := bs.FailureDomain.Name == failureDomain
if len(env.pathPrefix) > 0 {
pathParts := strings.Split(bs.Path, ":")
if len(pathParts) == 2 {
sameFailureDomain = pathParts[0] == env.pathPrefix
}
}
isDecommissioned := (bs.Flags & msgs.TERNFS_BLOCK_SERVICE_DECOMMISSIONED) != 0
// No disagreement on failure domain with shuckle (otherwise we could end up with
// a split brain scenario where two eggsblocks processes assume control of two dead
// block services)
if weHaveBs && !sameFailureDomain {
panic(fmt.Errorf("we have block service %v, and we're failure domain %v, but shuckle thinks it should be failure domain %v. If you've moved this block service, change the failure domain on shuckle", bs.Id, failureDomain, bs.FailureDomain))
}
// block services in the same failure domain, which we do not have, must be
// decommissioned
if !weHaveBs && sameFailureDomain {
if !isDecommissioned {
panic(fmt.Errorf("shuckle has block service %v for our failure domain %v, but we don't have this block service, and it is not decommissioned. If the block service is dead, mark it as decommissioned", bs.Id, failureDomain))
}
deadBlockServices[bs.Id] = deadBlockService{}
}
// we can't have a decommissioned block service
if weHaveBs && isDecommissioned {
l.ErrorNoAlert("We have block service %v, which is decommissioned according to shuckle. We will treat it as if it doesn't exist.", bs.Id)
delete(blockServices, bs.Id)
deadBlockServices[bs.Id] = deadBlockService{}
}
// fill in information from shuckle, if it's recent enough
if weHaveBs && time.Since(bs.LastSeen.Time()) < maximumRegisterInterval*2 {
// everything else is filled in by initBlockServicesInfo
ourBs.cachedInfo = msgs.RegisterBlockServiceInfo{
CapacityBytes: bs.CapacityBytes,
AvailableBytes: bs.AvailableBytes,
Blocks: bs.Blocks,
}
}
}
}
listener1, err := net.Listen("tcp4", fmt.Sprintf("%v:%v", net.IP(ownIp1[:]), port1))
if err != nil {
panic(err)
}
defer listener1.Close()
l.Info("running 1 on %v", listener1.Addr())
actualPort1 := uint16(listener1.Addr().(*net.TCPAddr).Port)
var listener2 net.Listener
var actualPort2 uint16
if len(addresses) == 2 {
listener2, err = net.Listen("tcp4", fmt.Sprintf("%v:%v", net.IP(ownIp2[:]), port2))
if err != nil {
panic(err)
}
defer listener2.Close()
l.Info("running 2 on %v", listener2.Addr())
actualPort2 = uint16(listener2.Addr().(*net.TCPAddr).Port)
}
initBlockServicesInfo(env, l, msgs.Location(*locationId), msgs.AddrsInfo{Addr1: msgs.IpPort{Addrs: ownIp1, Port: actualPort1}, Addr2: msgs.IpPort{Addrs: ownIp2, Port: actualPort2}}, failureDomain, blockServices, *reservedStorage)
l.Info("finished updating block service info, will now start")
terminateChan := make(chan any)
for bsId := range blockServices {
env.stats[bsId] = &blockServiceStats{}
env.eraseLocks[bsId] = &sync.Mutex{}
}
for bsId := range deadBlockServices {
env.stats[bsId] = &blockServiceStats{}
env.eraseLocks[bsId] = &sync.Mutex{}
}
env.counters = make(map[msgs.BlocksMessageKind]*timing.Timings)
for _, k := range msgs.AllBlocksMessageKind {
env.counters[k] = timing.NewTimings(40, 100*time.Microsecond, 1.5)
}
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
registerPeriodically(l, blockServices, env)
}()
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
updateBlockServiceInfoBlocksForever(l, blockServices)
}()
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
updateBlockServiceInfoCapacityForever(l, blockServices, *reservedStorage)
}()
if influxDB != nil {
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
sendMetrics(l, env, influxDB, blockServices, *failureDomainStr)
}()
}
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
raiseAlerts(l, env, blockServices)
}()
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
for {
conn, err := listener1.Accept()
l.Trace("new conn %+v", conn)
if err != nil {
terminateChan <- err
return
}
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
handleRequest(l, env, terminateChan, blockServices, deadBlockServices, conn.(*net.TCPConn), *futureCutoff, *connectionTimeout)
}()
}
}()
if listener2 != nil {
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
for {
conn, err := listener2.Accept()
l.Trace("new conn %+v", conn)
if err != nil {
terminateChan <- err
return
}
go func() {
defer func() { lrecover.HandleRecoverChan(l, terminateChan, recover()) }()
handleRequest(l, env, terminateChan, blockServices, deadBlockServices, conn.(*net.TCPConn), *futureCutoff, *connectionTimeout)
}()
}
}()
}
{
err := <-terminateChan
if err != nil {
panic(err)
}
}
}
func eraseFileIfExistsAndSyncDir(path string) error {
err := os.Remove(path)
if err != nil {
return err
}
dir, err := os.Open(filepath.Dir(path))
if err != nil {
return err
}
defer dir.Close()
return dir.Sync()
}
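// writeBufToTemp writes buf to a freshly created "tmp."-prefixed file in
// basePath and fsyncs it before returning its name. Together with
// moveFileAndSyncDir (fsync file, rename into place, fsync directory) this
// makes block writes crash-safe: a block only becomes visible under its final
// name once its contents are durable, and leftover tmp.* files are ignored
// when counting blocks.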
func writeBufToTemp(statBytes *uint64, basePath string, buf []byte) (string, error) {
if err := os.Mkdir(basePath, 0777); err != nil && !os.IsExist(err) {
return "", err
}
f, err := os.CreateTemp(basePath, "tmp.")
if err != nil {
return "", err
}
tmpName := f.Name()
defer func() {
if err != nil {
os.Remove(tmpName)
}
f.Close()
}()
bufSize := len(buf)
for len(buf) > 0 {
// assign to the enclosing `err` so the deferred cleanup removes the temp file on write errors
var n int
n, err = f.Write(buf)
if err != nil {
return "", err
}
buf = buf[n:]
}
if err = f.Sync(); err != nil {
return "", err
}
atomic.AddUint64(statBytes, uint64(bufSize))
return tmpName, err
}
func verifyCrcFile(log *log.Logger, readBuffer []byte, path string, expectedSize int64, expectedCrc msgs.Crc) error {
f, err := os.Open(path)
if err != nil {
log.Debug("failed opening file %s with error: %v", path, err)
return err
}
defer f.Close()
fs, err := f.Stat()
if err != nil {
log.Debug("failed stating file %s with error: %v", path, err)
return err
}
if fs.Size() != expectedSize {
log.Debug("failed file size when checking crc for file %s. expected size: %d size %d", path, expectedSize, fs.Size())
return msgs.BAD_BLOCK_CRC
}
return verifyCrcReader(log, readBuffer, f, expectedCrc)
}
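// verifyCrcReader checks a block stored in the with-crc layout: for each
// TERN_PAGE_WITH_CRC_SIZE chunk it recomputes the crc32c of the page and
// compares it to the stored trailer, and folds the page CRCs together with
// crc32c.Append to check the whole-block CRC (which covers only the page data,
// not the stored per-page CRCs).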
func verifyCrcReader(log *log.Logger, readBuffer []byte, r io.Reader, expectedCrc msgs.Crc) error {
cursor := uint32(0)
remainingData := 0
actualCrc := uint32(0)
processChunkSize := int(msgs.TERN_PAGE_WITH_CRC_SIZE)
if len(readBuffer) < processChunkSize {
readBuffer = make([]byte, processChunkSize)
}
readerHasMoreData := true
for readerHasMoreData {
read, err := r.Read(readBuffer[remainingData:])
if err != nil {
if err == io.EOF {
err = nil
readerHasMoreData = false
} else {
log.Debug("failed reading source while checking crc. error: %v", err)
return err
}
}
remainingData += read
cursor += uint32(read)
if remainingData < int(msgs.TERN_PAGE_WITH_CRC_SIZE) {
continue
}
numAvailableChunks := remainingData / processChunkSize
for i := 0; i < numAvailableChunks; i++ {
actualPageCrc := crc32c.Sum(0, readBuffer[i*processChunkSize:i*processChunkSize+int(msgs.TERN_PAGE_SIZE)])
storedPageCrc := binary.LittleEndian.Uint32(readBuffer[i*processChunkSize+int(msgs.TERN_PAGE_SIZE) : i*processChunkSize+int(msgs.TERN_PAGE_WITH_CRC_SIZE)])
if storedPageCrc != actualPageCrc {
log.Debug("failed checking crc. incorrect page crc at offset %d, expected %v, got %v", cursor-uint32(remainingData)+uint32(i*processChunkSize), msgs.Crc(storedPageCrc), msgs.Crc(actualPageCrc))
return msgs.BAD_BLOCK_CRC
}
actualCrc = crc32c.Append(actualCrc, actualPageCrc, int(msgs.TERN_PAGE_SIZE))
}
copy(readBuffer[:], readBuffer[numAvailableChunks*processChunkSize:remainingData])
remainingData -= numAvailableChunks * processChunkSize
}
if actualCrc != uint32(expectedCrc) {
log.Debug("failed checking crc. invalid block crc. expected %v, got %v", expectedCrc, msgs.Crc(actualCrc))
return msgs.BAD_BLOCK_CRC
}
if remainingData > 0 {
log.Debug("failed checking crc. unexpected data left")
return msgs.BAD_BLOCK_CRC
}
return nil
}
func moveFileAndSyncDir(src, dst string) error {
err := os.Rename(src, dst)
if err != nil {
return err
}
dir, err := os.Open(filepath.Dir(dst))
if err != nil {
return err
}
defer dir.Close()
return dir.Sync()
}