ternfs-XTXMarkets/go/client/client.go
Francesco Mazzoli 01f9d5addf Improve FUSE, run all tests with it
The FUSE driver, up to now, had no way to know when the user had
"explicitly" closed a file. Instead it linked the TernFS file on
flush, which could cause nasty situations. The classic example
is a fork causing the FD for a TernFS file to be present in the forked
process, with the forked process then dying and causing a spurious flush.

This commit adds a way to detect when a flush is due to a close(),
which allows us to link the file only in the cases where that happened,
which is a much better heuristic and close to what we do in the kernel
module.

This commit also contains various other improvements to make all tests
pass under FUSE. The big remaining item is changing how files are read
(they're currently read all upfront and then kept in memory).
2025-09-18 18:09:43 +01:00


// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// The client package provides an interface to the ternfs cluster that should be
// sufficient for most user level operations (modifying metadata, reading and
// writing blocks, etc).
//
// To use this library you still require an understanding of how ternfs
// operates, and you will still need to construct the request and reply
// messages (defined in the [msgs] package), but all service discovery and
// network communication is handled for you.
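//
// A minimal usage sketch (registry address and path are hypothetical, error
// handling elided; passing nil registry timeouts to use the defaults is an
// assumption):
//
//	c, err := client.NewClient(logger, nil, "registry.example:10000", msgs.AddrsInfo{})
//	if err != nil {
//		panic(err)
//	}
//	defer c.Close()
//	id, err := c.ResolvePath(logger, "/some/dir/some-file")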
package client
import (
"bytes"
"container/heap"
"encoding/binary"
"errors"
"fmt"
"io"
"math/rand"
"net"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"xtx/ternfs/core/bincode"
"xtx/ternfs/core/crc32c"
"xtx/ternfs/core/log"
"xtx/ternfs/core/timing"
"xtx/ternfs/msgs"
)
type ReqCounters struct {
Timings timing.Timings
Attempts uint64
}
func MergeReqCounters(cs []ReqCounters) *ReqCounters {
counters := cs[0] // important to copy
for i := 1; i < len(cs); i++ {
counters.Attempts += cs[i].Attempts
counters.Timings.Merge(&cs[i].Timings)
}
return &counters
}
type ClientCounters struct {
Shard map[uint8]*[256]ReqCounters
CDC map[uint8]*ReqCounters
}
func NewClientCounters() *ClientCounters {
counters := ClientCounters{
Shard: make(map[uint8]*[256]ReqCounters),
CDC: make(map[uint8]*ReqCounters),
}
for _, k := range msgs.AllShardMessageKind {
// max = ~1min
var shards [256]ReqCounters
counters.Shard[uint8(k)] = &shards
for i := 0; i < 256; i++ {
shards[i].Timings = *timing.NewTimings(40, time.Microsecond*10, 1.5)
}
}
for _, k := range msgs.AllCDCMessageKind {
// max = ~2min
counters.CDC[uint8(k)] = &ReqCounters{
Timings: *timing.NewTimings(35, time.Millisecond, 1.5),
}
}
return &counters
}
func (counters *ClientCounters) Log(log *log.Logger) {
formatCounters := func(c *ReqCounters) {
totalCount := uint64(0)
for _, bin := range c.Timings.Histogram() {
totalCount += bin.Count
}
log.Info(" count: %v", totalCount)
if totalCount == 0 {
log.Info(" attempts: %v", c.Attempts)
} else {
log.Info(" attempts: %v (%v)", c.Attempts, float64(c.Attempts)/float64(totalCount))
}
log.Info(" total time: %v", c.Timings.TotalTime())
log.Info(" avg time: %v", c.Timings.Mean())
log.Info(" median time: %v", c.Timings.Median())
hist := bytes.NewBuffer([]byte{})
first := true
countSoFar := uint64(0)
lowerBound := time.Duration(0)
for _, bin := range c.Timings.Histogram() {
if bin.Count == 0 {
continue
}
countSoFar += bin.Count
if first {
fmt.Fprintf(hist, "%v < ", lowerBound)
} else {
fmt.Fprintf(hist, ", ")
}
first = false
fmt.Fprintf(hist, "%v (%0.2f%%) < %v", bin.Count, float64(countSoFar*100)/float64(totalCount), bin.UpperBound)
}
log.Info(" hist: %v", hist.String())
}
var shardTime time.Duration
for _, k := range msgs.AllShardMessageKind {
for i := 0; i < 256; i++ {
shardTime += counters.Shard[uint8(k)][i].Timings.TotalTime()
}
}
log.Info("Shard stats (total shard time %v):", shardTime)
for _, k := range msgs.AllShardMessageKind {
c := MergeReqCounters(counters.Shard[uint8(k)][:])
if c.Attempts == 0 {
continue
}
log.Info(" %v", k)
formatCounters(c)
}
var cdcTime time.Duration
for _, k := range msgs.AllCDCMessageKind {
cdcTime += counters.CDC[uint8(k)].Timings.TotalTime()
}
log.Info("CDC stats (total CDC time %v):", cdcTime)
for _, k := range msgs.AllCDCMessageKind {
c := counters.CDC[uint8(k)]
if c.Attempts == 0 {
continue
}
log.Info(" %v", k)
formatCounters(c)
}
}
var DefaultShardTimeout = timing.ReqTimeouts{
Initial: 100 * time.Millisecond,
Max: 2 * time.Second,
Overall: 10 * time.Second,
Growth: 1.5,
Jitter: 0.1,
}
var DefaultCDCTimeout = timing.ReqTimeouts{
Initial: time.Second,
Max: 10 * time.Second,
Overall: time.Minute,
Growth: 1.5,
Jitter: 0.1,
}
var DefaultBlockTimeout = timing.ReqTimeouts{
Initial: time.Second,
Max: 10 * time.Second,
Overall: 5 * time.Minute,
Growth: 1.5,
Jitter: 0.1,
}
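// How these are consumed (a sketch; exact semantics live in the timing
// package): retry delays presumably start at Initial and grow by Growth up to
// Max, with Jitter randomization, and timing.ReqTimeouts.Next returns 0 once
// Overall has elapsed since the first attempt (see singleBlockReq below).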
type metadataProcessorRequest struct {
requestId uint64
timeout time.Duration
shard int16 // -1 = cdc
req bincode.Packable
resp bincode.Unpackable
extra any
respCh chan *metadataProcessorResponse
// filled in by request processor
deadline time.Time
index int // index in the heap
}
type metadataProcessorResponse struct {
requestId uint64
resp any
extra any
err error
}
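// metadataRequestsPQ is a min-heap over in-flight requests ordered by
// deadline, driven through container/heap (heap.Push/heap.Pop/heap.Remove
// below); index is maintained so entries can be removed from the middle.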
type metadataRequestsPQ []*metadataProcessorRequest
func (pq metadataRequestsPQ) Len() int { return len(pq) }
func (pq metadataRequestsPQ) Less(i, j int) bool {
return pq[i].deadline.UnixNano() < pq[j].deadline.UnixNano()
}
func (pq metadataRequestsPQ) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
pq[i].index = i
pq[j].index = j
}
func (pq *metadataRequestsPQ) Push(x any) {
n := len(*pq)
item := x.(*metadataProcessorRequest)
item.index = n
*pq = append(*pq, item)
}
func (pq *metadataRequestsPQ) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
old[n-1] = nil // avoid memory leak
item.index = -1 // for safety
*pq = old[0 : n-1]
return item
}
type rawMetadataResponse struct {
receivedAt time.Time
protocol uint32
requestId uint64
kind uint8
respLen int
buf *[]byte // the buf contains the header
}
type clientMetadata struct {
client *Client
sock *net.UDPConn
requestsById map[uint64]*metadataProcessorRequest // requests we've sent, by req id
requestsByTimeout metadataRequestsPQ // requests we've sent, by timeout (earlier first)
earlyRequests map[uint64]rawMetadataResponse // responses we've received for requests we haven't yet seen as in flight. should be uncommon.
lastCleanedUpEarlyRequestsAt time.Time
quitResponseProcessor chan struct{} // channel to quit the response processor, which in turn closes the socket
incoming chan *metadataProcessorRequest // channel where user requests come in
inFlight chan *metadataProcessorRequest // channel going from request processor to response processor
rawResponses chan rawMetadataResponse // channel going from the socket drainer to the response processor
responsesBufs chan *[]byte // channel to store a cache of buffers to read into
}
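// A clientMetadata runs three goroutines (started in init): processRequests
// packs and sends requests from `incoming`, drainSocket reads raw datagrams
// into `rawResponses`, and processResponses matches them against in-flight
// requests and handles timeouts.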
var whichMetadataAddr int
func (cm *clientMetadata) init(log *log.Logger, client *Client) error {
log.Debug("initiating clientMetadata")
defer log.Debug("finished initializing clientMetadata")
cm.client = client
sock, err := net.ListenPacket("udp", ":0")
if err != nil {
return err
}
cm.sock = sock.(*net.UDPConn)
// 1MiB (1 << 20) / 100 bytes ~ 10k requests in the pipe. 100 bytes is
// kinda conservative.
if err := cm.sock.SetReadBuffer(1 << 20); err != nil {
cm.sock.Close()
return err
}
cm.requestsById = make(map[uint64]*metadataProcessorRequest)
cm.requestsByTimeout = make(metadataRequestsPQ, 0)
cm.earlyRequests = make(map[uint64]rawMetadataResponse)
cm.quitResponseProcessor = make(chan struct{})
cm.incoming = make(chan *metadataProcessorRequest, 10_000)
cm.inFlight = make(chan *metadataProcessorRequest, 10_000)
cm.rawResponses = make(chan rawMetadataResponse, 10_000)
cm.responsesBufs = make(chan *[]byte, 128)
for i := 0; i < cap(cm.responsesBufs); i++ {
buf := make([]byte, clientMtu)
cm.responsesBufs <- &buf
}
go cm.processRequests(log)
go cm.processResponses(log)
go cm.drainSocket(log)
return nil
}
func (cm *clientMetadata) close() {
cm.quitResponseProcessor <- struct{}{}
close(cm.incoming)
}
// terminates when cm.incoming is closed
func (cm *clientMetadata) processRequests(log *log.Logger) {
buf := bytes.NewBuffer([]byte{})
for req := range cm.incoming {
dontWait := req.resp == nil
log.Debug("sending request %T %+v req id %v to shard %v", req.req, req.req, req.requestId, req.shard)
buf.Reset()
var addrs *[2]net.UDPAddr
var kind uint8
var protocol uint32
if req.shard >= 0 { // shard
addrs = cm.client.shardAddrs(msgs.ShardId(req.shard))
kind = uint8(req.req.(msgs.ShardRequest).ShardRequestKind())
protocol = msgs.SHARD_REQ_PROTOCOL_VERSION
} else { // CDC
addrs = cm.client.cdcAddrs()
kind = uint8(req.req.(msgs.CDCRequest).CDCRequestKind())
protocol = msgs.CDC_REQ_PROTOCOL_VERSION
}
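// Request wire format: protocol (u32 LE) | request id (u64 LE) | kind (u8),
// followed by the bincode-packed request body.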
binary.Write(buf, binary.LittleEndian, protocol)
binary.Write(buf, binary.LittleEndian, req.requestId)
binary.Write(buf, binary.LittleEndian, kind)
if err := req.req.Pack(buf); err != nil {
log.RaiseAlert("could not pack request %v to shard %v: %v", req.req, req.shard, err)
if !dontWait {
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
extra: req.extra,
resp: nil,
}
}
// keep running even if the socket is totally broken to process all the requests
continue
}
addr := &addrs[whichMetadatataAddr%2]
if addr.Port == 0 {
addr = &addrs[0]
}
whichMetadataAddr++
var written int
var err error
epermAlert := log.NewNCAlert(0)
for attempts := 0; ; attempts++ {
written, err = cm.sock.WriteToUDP(buf.Bytes(), addr)
var opError *net.OpError
// We get EPERM when netfilter (nf) drops packets, at least on fsf1/fsf2.
// This is rare.
if errors.As(err, &opError) && os.IsPermission(opError.Err) {
log.RaiseNC(epermAlert, "could not send metadata packet because of EPERM (attempt %v), will retry in 100ms: %v", attempts, err)
time.Sleep(100 * time.Millisecond)
} else {
log.ClearNC(epermAlert)
break
}
}
if err != nil {
log.RaiseAlert("could not send request %v to shard %v addr %v: %v", req.req, req.shard, addr, err)
if !dontWait {
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
extra: req.extra,
resp: nil,
}
}
// keep running even if the socket is totally broken to process all the requests
continue
}
if written != len(buf.Bytes()) {
panic(fmt.Errorf("%v != %v", written, len(buf.Bytes())))
}
if !dontWait {
req.deadline = time.Now().Add(req.timeout)
cm.inFlight <- req
}
}
log.Debug("got close request in request processor, winding down")
}
func (cm *clientMetadata) parseResponse(log *log.Logger, req *metadataProcessorRequest, rawResp *rawMetadataResponse, dischargeBuf bool) {
defer func() {
if !dischargeBuf {
return
}
select {
case cm.responsesBufs <- rawResp.buf:
default:
panic(fmt.Errorf("impossible: could not put back response buffer which we got from socket drainer"))
}
}()
// check protocol
if req.shard < 0 { // CDC
if rawResp.protocol != msgs.CDC_RESP_PROTOCOL_VERSION {
log.RaiseAlert("got bad cdc protocol %v for request id %v, ignoring", rawResp.protocol, req.requestId)
return
}
} else {
if rawResp.protocol != msgs.SHARD_RESP_PROTOCOL_VERSION {
log.RaiseAlert("got bad shard protocol %v for request id %v, shard %v, ignoring", rawResp.protocol, req.shard, req.requestId)
return
}
}
// remove everywhere
delete(cm.earlyRequests, req.requestId)
if _, found := cm.requestsById[req.requestId]; found {
delete(cm.requestsById, req.requestId)
heap.Remove(&cm.requestsByTimeout, req.index)
}
if rawResp.kind == msgs.ERROR {
var err error
if rawResp.respLen != 4+8+1+2 {
log.RaiseAlert("bad error response length %v, expected %v", rawResp.respLen, 4+8+1+2)
err = msgs.MALFORMED_RESPONSE
} else {
err = msgs.TernError(binary.LittleEndian.Uint16((*rawResp.buf)[4+8+1:]))
}
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
extra: req.extra,
resp: nil,
}
} else {
// check kind
if req.shard < 0 { // CDC
expectedKind := req.req.(msgs.CDCRequest).CDCRequestKind()
if uint8(expectedKind) != rawResp.kind {
log.RaiseAlert("got bad cdc kind %v for request id %v, expected %v", msgs.CDCMessageKind(rawResp.kind), req.requestId, expectedKind)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: msgs.MALFORMED_RESPONSE,
extra: req.extra,
resp: nil,
}
return
}
} else {
expectedKind := req.req.(msgs.ShardRequest).ShardRequestKind()
if uint8(expectedKind) != rawResp.kind {
log.RaiseAlert("got bad shard kind %v for request id %v, shard %v, expected %v", msgs.ShardMessageKind(rawResp.kind), req.requestId, req.shard, expectedKind)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: msgs.MALFORMED_RESPONSE,
extra: req.extra,
resp: nil,
}
return
}
}
// unpack
if err := bincode.Unpack((*rawResp.buf)[4+8+1:rawResp.respLen], req.resp); err != nil {
log.RaiseAlert("could not unpack resp %T for request id %v, shard %v: %v", req.resp, req.requestId, req.shard, err)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
extra: req.extra,
resp: nil,
}
return
}
log.Debug("received resp %T %v req id %v from shard %v", req.resp, req.resp, req.requestId, req.shard)
// done
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: nil,
extra: req.extra,
resp: req.resp,
}
}
}
func (cm *clientMetadata) processRawResponse(log *log.Logger, rawResp *rawMetadataResponse) {
if rawResp.buf != nil {
if req, found := cm.requestsById[rawResp.requestId]; found {
// Common case, the request is already there
cm.parseResponse(log, req, rawResp, true)
} else {
// Uncommon case: the response arrived before we registered the request.
// We keep rawResp.buf in earlyRequests, and put a fresh buffer back in
// the pool so the socket drainer isn't starved
buf := make([]byte, clientMtu)
select {
case cm.responsesBufs <- &buf:
default:
panic(fmt.Errorf("impossible: could not return buffer"))
}
cm.earlyRequests[rawResp.requestId] = *rawResp
}
}
now := time.Now()
// expire requests
for len(cm.requestsById) > 0 {
first := cm.requestsByTimeout[0]
if now.After(first.deadline) {
log.Debug("request %v %T to shard %v has timed out", first.requestId, first.req, first.shard)
heap.Pop(&cm.requestsByTimeout)
delete(cm.requestsById, first.requestId)
// consumer might very plausibly be gone by now,
// don't risk it
go func() {
first.respCh <- &metadataProcessorResponse{
requestId: first.requestId,
err: msgs.TIMEOUT,
extra: first.extra,
resp: nil,
}
}()
} else {
log.Debug("first request %v %T has not passed deadline %v", first.requestId, first.req, first.deadline)
break
}
}
// periodically expire early responses we've held on to for too long
if now.Sub(cm.lastCleanedUpEarlyRequestsAt) > time.Minute {
cm.lastCleanedUpEarlyRequestsAt = now
for reqId, rawReq := range cm.earlyRequests {
if now.Sub(rawReq.receivedAt) > 10*time.Minute {
delete(cm.earlyRequests, reqId)
}
}
}
}
// terminates when `cm.quitResponseProcessor` gets a message
func (cm *clientMetadata) processResponses(log *log.Logger) {
for {
// prioritize raw responses over newly in-flight requests: we want to
// process responses promptly so we don't get spurious timeouts
select {
case rawResp := <-cm.rawResponses:
cm.processRawResponse(log, &rawResp)
continue
case <-cm.quitResponseProcessor:
log.Info("winding down response processor")
cm.sock.Close()
return
default:
}
select {
case req := <-cm.inFlight:
if rawResp, found := cm.earlyRequests[req.requestId]; found {
// uncommon case: we have a response for this already.
cm.parseResponse(log, req, &rawResp, false)
} else {
// common case: we don't have the response yet, put it in the data structures and wait.
// if the request was there before, we remove it from the heap so that we don't have
// dupes and the deadline is right
if _, found := cm.requestsById[req.requestId]; found {
heap.Remove(&cm.requestsByTimeout, req.index)
}
cm.requestsById[req.requestId] = req
heap.Push(&cm.requestsByTimeout, req)
}
case rawResp := <-cm.rawResponses:
cm.processRawResponse(log, &rawResp)
case <-cm.quitResponseProcessor:
log.Info("winding down response processor")
cm.sock.Close()
return
}
}
}
// terminates when the socket is closed
func (cm *clientMetadata) drainSocket(log *log.Logger) {
for {
buf := <-cm.responsesBufs
cm.sock.SetReadDeadline(time.Now().Add(DefaultShardTimeout.Initial / 2))
read, _, err := cm.sock.ReadFromUDP(*buf)
if os.IsTimeout(err) {
cm.responsesBufs <- buf
cm.rawResponses <- rawMetadataResponse{}
continue
}
if err != nil {
if errors.Is(err, net.ErrClosed) {
log.Info("socket is closed, winding down")
cm.responsesBufs <- buf
return
} else {
log.RaiseAlert("got error when reading socket: %v", err)
}
}
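// Response wire format mirrors requests: protocol (u32 LE) | request id
// (u64 LE) | kind (u8) | body (a 2-byte TernError when kind == ERROR).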
if read < 4+8+1 {
log.RaiseAlert("got runt metadata message, expected at least %v bytes, got %v", 4+8+1, read)
cm.responsesBufs <- buf
continue
}
rawResp := rawMetadataResponse{
receivedAt: time.Now(),
respLen: read,
buf: buf,
protocol: binary.LittleEndian.Uint32(*buf),
requestId: binary.LittleEndian.Uint64((*buf)[4:]),
kind: (*buf)[4+8],
}
cm.rawResponses <- rawResp
}
}
type blockCompletion struct {
Resp msgs.BlocksResponse
Extra any
Error error
}
type clientBlockResponse struct {
req msgs.BlocksRequest
resp msgs.BlocksResponse
extra any
// when fetching a block, the body gets written into this
additionalBodyWriter io.ReaderFrom
// stores the error, if any
err error
// receives the completion when we're done
completionChan chan *blockCompletion
}
func (resp *clientBlockResponse) done(log *log.Logger, addr1 *net.TCPAddr, addr2 *net.TCPAddr, extra any, err error) {
if resp.err == nil && err != nil {
log.InfoStack(1, "failing request %T %+v addr1=%+v addr2=%+v extra=%+v: %v", resp.req, resp.req, addr1, addr2, extra, err)
resp.err = err
}
completion := &blockCompletion{
Resp: resp.resp,
Error: resp.err,
Extra: resp.extra,
}
resp.completionChan <- completion
}
type clientBlockRequest struct {
blockService msgs.BlockServiceId
req msgs.BlocksRequest
additionalBodyReader io.Reader // when writing block, this will be written after the request
resp *clientBlockResponse
}
type blocksProcessorConn struct {
conn *net.TCPConn
generation uint64
}
type clientBlockResponseWithGeneration struct {
generation uint64
resp *clientBlockResponse
}
type blocksProcessor struct {
reqChan chan *clientBlockRequest
inFlightReqChan chan clientBlockResponseWithGeneration
addr1 net.TCPAddr
addr2 net.TCPAddr
sourceAddr1 *net.TCPAddr
sourceAddr2 *net.TCPAddr
what string
timeout **timing.ReqTimeouts
_conn *blocksProcessorConn // this must be loaded through loadConn
}
func (proc *blocksProcessor) loadConn() *blocksProcessorConn {
return (*blocksProcessorConn)(atomic.LoadPointer((*unsafe.Pointer)(unsafe.Pointer(&proc._conn))))
}
func (proc *blocksProcessor) storeConn(conn *net.TCPConn) *blocksProcessorConn {
gen := proc.loadConn().generation
newConn := &blocksProcessorConn{
conn: conn,
generation: gen + 1,
}
atomic.StorePointer((*unsafe.Pointer)(unsafe.Pointer(&proc._conn)), unsafe.Pointer(newConn))
return newConn
}
var whichBlockIp uint64
var whichSourceIp uint64
func (proc *blocksProcessor) connect(log *log.Logger) (*net.TCPConn, error) {
var err error
sourceIpSelector := atomic.AddUint64(&whichSourceIp, 1)
var sourceAddr *net.TCPAddr
if sourceIpSelector&1 == 0 {
sourceAddr = proc.sourceAddr1
} else {
sourceAddr = proc.sourceAddr2
}
blockIpSelector := atomic.AddUint64(&whichBlockIp, 1)
for i := blockIpSelector; i < blockIpSelector+2; i++ {
var addr *net.TCPAddr
if i&1 == 0 {
addr = &proc.addr1
} else {
addr = &proc.addr2
}
if addr.Port == 0 {
continue
}
log.Debug("trying to connect to block service %v", addr)
dialer := net.Dialer{LocalAddr: sourceAddr, Timeout: (*proc.timeout).Max}
var conn net.Conn
conn, err = dialer.Dial("tcp4", addr.String())
if err == nil {
sock := conn.(*net.TCPConn)
log.Debug("connected to block service at %v", addr)
return sock, nil
}
log.Info("could not connect to block service %v, might try next connection: %v", addr, err)
}
if err == nil {
panic(fmt.Errorf("impossible: got out without errors"))
}
return nil, err
}
// From <https://stackoverflow.com/a/58664631>, checks if a connection
// is still alive.
func connCheck(conn *net.TCPConn) error {
var sysErr error = nil
rc, err := conn.SyscallConn()
if err != nil {
return err
}
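// Peek one byte without blocking: reading 0 bytes with no error means the
// peer closed the connection (EOF); EAGAIN/EWOULDBLOCK means the connection
// is idle but alive.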
err = rc.Read(func(fd uintptr) bool {
var buf []byte = []byte{0}
n, _, err := syscall.Recvfrom(int(fd), buf, syscall.MSG_PEEK|syscall.MSG_DONTWAIT)
switch {
case n == 0 && err == nil:
sysErr = io.EOF
case err == syscall.EAGAIN || err == syscall.EWOULDBLOCK:
sysErr = nil
default:
sysErr = err
}
return true
})
if err != nil {
return err
}
return sysErr
}
func (proc *blocksProcessor) processRequests(log *log.Logger) {
log.Debug("%v: starting request processor for addr1=%v addr2=%v", proc.what, proc.addr1, proc.addr2)
// one iteration = one request
for {
conn := proc.loadConn()
req, ok := <-proc.reqChan
if !ok {
log.Debug("%v: got nil request, tearing down", proc.what)
if conn.conn != nil {
conn.conn.Close()
}
close(proc.inFlightReqChan) // this tears down the response processor
return
}
// empty queue, check that conn is alive, otherwise we might still succeed sending,
// but inevitably fail when reading.
if conn.conn != nil && len(proc.inFlightReqChan) == 0 {
if err := connCheck(conn.conn); err != nil {
log.Debug("connection for addr1=%+v addr2=%+v is dead: %v", proc.addr1, proc.addr2, err)
conn.conn.Close()
conn.conn = nil
}
}
// (re)connect if we don't have a usable connection
if conn.conn == nil {
tcpConn, err := proc.connect(log)
if err != nil { // we couldn't connect, not much to do
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
continue
}
// we did connect
conn = proc.storeConn(tcpConn)
}
log.Debug("writing block request %T %+v for addr1=%v addr2=%v", req.req, req.req, proc.addr1, proc.addr2)
if err := writeBlocksRequest(log, conn.conn, req.blockService, req.req); err != nil {
log.Info("got error when writing block request of kind %v in %v->%v: %v", req.req.BlocksRequestKind(), conn.conn.LocalAddr(), conn.conn.RemoteAddr(), err)
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
conn.conn.Close()
conn.conn = nil
continue
}
if req.req.BlocksRequestKind() == msgs.WRITE_BLOCK {
writeReq := req.req.(*msgs.WriteBlockReq)
lr := &io.LimitedReader{
R: req.additionalBodyReader,
N: int64(writeReq.Size),
}
log.Debug("writing block body to %v->%v", conn.conn.LocalAddr(), conn.conn.RemoteAddr())
writtenBytes, err := conn.conn.ReadFrom(lr)
if err != nil || writtenBytes < int64(writeReq.Size) {
if err == nil {
err = io.EOF
}
log.Info("got error when writing block body: %v", err)
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
conn.conn.Close()
conn.conn = nil
continue
}
}
// we wrote it fine, proceed
proc.inFlightReqChan <- clientBlockResponseWithGeneration{
generation: conn.generation,
resp: req.resp,
}
}
}
func (proc *blocksProcessor) processResponses(log *log.Logger) {
log.Debug("%v: starting response processor for addr1=%v addr2=%v", proc.what, proc.addr1, proc.addr2)
// one iteration = one request
for resp := range proc.inFlightReqChan {
conn := proc.loadConn()
connr := conn.conn
if connr == nil {
log.Info("%v: resp %T %+v has no conn, skipping", proc.what, resp.resp.resp, resp.resp.resp)
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, io.EOF)
continue
}
if conn.generation != resp.generation {
log.Info("%v: resp %T %+v has bad generation %v vs %v, skipping", proc.what, resp.resp.resp, resp.resp.resp, resp.generation, conn.generation)
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, io.EOF)
continue
}
log.Debug("reading block response %T for req %+v from %v->%v", resp.resp.resp, resp.resp.req, connr.LocalAddr(), connr.RemoteAddr())
// the responsibility for cleaning up the connection is always in the request processor
if err := readBlocksResponse(log, connr, resp.resp.resp); err != nil {
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, err)
continue
}
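// Fetched block bodies arrive as a sequence of pages: TERN_PAGE_SIZE data
// bytes followed by a 4-byte CRC32-C. Each page is verified before the
// data (CRC stripped) is handed to the response writer.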
if resp.resp.resp.BlocksResponseKind() == msgs.FETCH_BLOCK_WITH_CRC {
req := resp.resp.req.(*msgs.FetchBlockWithCrcReq)
pageCount := (req.Count / msgs.TERN_PAGE_SIZE)
log.Debug("reading block body from %v->%v", connr.LocalAddr(), connr.RemoteAddr())
var page [msgs.TERN_PAGE_WITH_CRC_SIZE]byte
pageFailed := false
for i := uint32(0); i < pageCount; i++ {
bytesRead := uint32(0)
for bytesRead < msgs.TERN_PAGE_WITH_CRC_SIZE {
read, err := connr.Read(page[bytesRead:])
if err != nil {
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, err)
pageFailed = true
break
}
bytesRead += uint32(read)
}
if pageFailed {
break
}
crc := binary.LittleEndian.Uint32(page[msgs.TERN_PAGE_SIZE:])
actualCrc := crc32c.Sum(0, page[:msgs.TERN_PAGE_SIZE])
if crc != actualCrc {
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, msgs.BAD_BLOCK_CRC)
pageFailed = true
break
}
readBytes, err := resp.resp.additionalBodyWriter.ReadFrom(bytes.NewReader(page[:msgs.TERN_PAGE_SIZE]))
if err != nil || uint32(readBytes) < msgs.TERN_PAGE_SIZE {
if err == nil {
err = io.EOF
}
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, err)
pageFailed = true
break
}
}
if pageFailed {
continue
}
}
log.Debug("read block response %T %+v for req %v from %v->%v", resp.resp.resp, resp.resp.resp, resp.resp.req, connr.LocalAddr(), connr.RemoteAddr())
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, nil)
}
}
type blocksProcessorKey struct {
blockServiceKey uint64
addrs msgs.AddrsInfo
}
type blocksProcessors struct {
what string
timeouts **timing.ReqTimeouts
// how many bits of the block service id to use for blockServiceKey
blockServiceBits uint8
// blocksProcessorKey -> *blocksProcessor
processors sync.Map
sourceAddr1 net.TCPAddr
sourceAddr2 net.TCPAddr
}
func (procs *blocksProcessors) init(what string, timeouts **timing.ReqTimeouts, localAddresses msgs.AddrsInfo) {
procs.what = what
procs.timeouts = timeouts
if localAddresses.Addr1.Addrs[0] != 0 {
procs.sourceAddr1 = net.TCPAddr{IP: net.IP(localAddresses.Addr1.Addrs[:]), Port: int(localAddresses.Addr1.Port)}
}
if localAddresses.Addr2.Addrs[0] != 0 {
procs.sourceAddr2 = net.TCPAddr{IP: net.IP(localAddresses.Addr2.Addrs[:]), Port: int(localAddresses.Addr2.Port)}
}
}
type sendArgs struct {
blockService msgs.BlockServiceId
addrs msgs.AddrsInfo
req msgs.BlocksRequest
reqAdditionalBody io.ReadSeeker // this will _only_ be used to seek to start on retries
resp msgs.BlocksResponse
respAdditionalBody io.ReaderFrom
extra any
}
// This currently never fails (everything network-related happens in
// the processor loops); we keep the error return since it might fail in the future
func (procs *blocksProcessors) send(
log *log.Logger,
args *sendArgs,
completionChan chan *blockCompletion,
) error {
if args.addrs.Addr1.Port == 0 && args.addrs.Addr2.Port == 0 {
panic(fmt.Errorf("got zero ports for both addresses for block service %v: %v:%v %v:%v", args.blockService, args.addrs.Addr1.Addrs, args.addrs.Addr1.Port, args.addrs.Addr2.Addrs, args.addrs.Addr2.Port))
}
resp := &clientBlockResponse{
req: args.req,
resp: args.resp,
additionalBodyWriter: args.respAdditionalBody,
completionChan: completionChan,
extra: args.extra,
}
req := &clientBlockRequest{
blockService: args.blockService,
req: args.req,
additionalBodyReader: args.reqAdditionalBody,
resp: resp,
}
key := blocksProcessorKey{
blockServiceKey: uint64(args.blockService) & ((1 << uint64(procs.blockServiceBits)) - 1),
addrs: args.addrs,
}
// likely case, we already have something. we could
// LoadOrStore directly but this saves us allocating new
// chans
if procAny, found := procs.processors.Load(key); found {
proc := procAny.(*blocksProcessor)
proc.reqChan <- req
return nil
}
// unlikely case, create new one
procAny, loaded := procs.processors.LoadOrStore(key, &blocksProcessor{
reqChan: make(chan *clientBlockRequest, 128),
inFlightReqChan: make(chan clientBlockResponseWithGeneration, 128),
addr1: net.TCPAddr{IP: net.IP(args.addrs.Addr1.Addrs[:]), Port: int(args.addrs.Addr1.Port)},
addr2: net.TCPAddr{IP: net.IP(args.addrs.Addr2.Addrs[:]), Port: int(args.addrs.Addr2.Port)},
sourceAddr1: &procs.sourceAddr1,
sourceAddr2: &procs.sourceAddr2,
what: procs.what,
timeout: procs.timeouts,
_conn: &blocksProcessorConn{},
})
proc := procAny.(*blocksProcessor)
if !loaded {
// we're the first ones here, start the routines
go proc.processRequests(log)
go proc.processResponses(log)
}
proc.reqChan <- req
return nil
}
func (procs *blocksProcessors) close() {
procs.processors.Range(func(key, value any) bool {
close(value.(*blocksProcessor).reqChan)
return true
})
}
type Client struct {
shardRawAddrs [256][2]uint64
cdcRawAddr [2]uint64
clientMetadata clientMetadata
counters *ClientCounters
writeBlockProcessors blocksProcessors
fetchBlockProcessors blocksProcessors
fetchBlockBufs sync.Pool
eraseBlockProcessors blocksProcessors
checkBlockProcessors blocksProcessors
shardTimeout *timing.ReqTimeouts
cdcTimeout *timing.ReqTimeouts
blockTimeout *timing.ReqTimeouts
requestIdCounter uint64
registryAddress string
addrsRefreshTicker *time.Ticker
addrsRefreshClose chan (struct{})
registryConn *RegistryConn
fetchBlockServices bool
blockServicesLock *sync.RWMutex
blockServiceToFailureDomain map[msgs.BlockServiceId]msgs.FailureDomain
}
func (c *Client) refreshAddrs(log *log.Logger) error {
var shardAddrs [256]msgs.AddrsInfo
var cdcAddrs msgs.AddrsInfo
{
log.Info("Getting shard/CDC info from registry at '%v'", c.registryAddress)
resp, err := c.registryConn.Request(&msgs.LocalShardsReq{})
if err != nil {
return fmt.Errorf("could not request shards from registry: %w", err)
}
shards := resp.(*msgs.LocalShardsResp)
for i, shard := range shards.Shards {
if shard.Addrs.Addr1.Port == 0 {
return fmt.Errorf("shard %v not present in registry", i)
}
shardAddrs[i] = shard.Addrs
}
resp, err = c.registryConn.Request(&msgs.LocalCdcReq{})
if err != nil {
return fmt.Errorf("could not request CDC from registry: %w", err)
}
cdc := resp.(*msgs.LocalCdcResp)
if cdc.Addrs.Addr1.Port == 0 {
return fmt.Errorf("CDC not present in registry")
}
cdcAddrs = cdc.Addrs
}
c.SetAddrs(cdcAddrs, &shardAddrs)
fetchBlockServices := func() bool {
c.blockServicesLock.RLock()
defer c.blockServicesLock.RUnlock()
return c.fetchBlockServices
}()
if !fetchBlockServices {
return nil
}
blockServicesResp, err := c.registryConn.Request(&msgs.AllBlockServicesDeprecatedReq{})
if err != nil {
return fmt.Errorf("could not request block services from registry: %w", err)
}
blockServices := blockServicesResp.(*msgs.AllBlockServicesDeprecatedResp)
var blockServicesToAdd []msgs.BlacklistEntry
func() {
// read the map under the read lock; the writes below retake the write lock
c.blockServicesLock.RLock()
defer c.blockServicesLock.RUnlock()
for _, bs := range blockServices.BlockServices {
if _, ok := c.blockServiceToFailureDomain[bs.Id]; !ok {
blockServicesToAdd = append(blockServicesToAdd, msgs.BlacklistEntry{bs.FailureDomain, bs.Id})
}
}
}()
if len(blockServicesToAdd) > 0 {
func() {
c.blockServicesLock.Lock()
defer c.blockServicesLock.Unlock()
for _, bs := range blockServicesToAdd {
c.blockServiceToFailureDomain[bs.BlockService] = bs.FailureDomain
}
}()
}
return nil
}
// Create a new client by acquiring the CDC and shard connection details from the
// registry. It also refreshes the shard/CDC info every minute.
func NewClient(
log *log.Logger,
registryTimeout *RegistryTimeouts,
registryAddress string,
localAddresses msgs.AddrsInfo,
) (*Client, error) {
c, err := NewClientDirectNoAddrs(log, localAddresses)
if err != nil {
return nil, err
}
c.registryAddress = registryAddress
c.registryConn = MakeRegistryConn(log, registryTimeout, registryAddress, 1)
if err := c.refreshAddrs(log); err != nil {
c.registryConn.Close()
return nil, err
}
c.addrsRefreshTicker = time.NewTicker(time.Minute)
c.addrsRefreshClose = make(chan struct{})
addressRefreshAlert := log.NewNCAlert(5 * time.Minute)
go func() {
for {
select {
case <-c.addrsRefreshClose:
return
case <-c.addrsRefreshTicker.C:
if err := c.refreshAddrs(log); err != nil {
log.RaiseNC(addressRefreshAlert, "could not refresh shard & cdc addresses: %v", err)
} else {
log.ClearNC(addressRefreshAlert)
}
}
}
}()
return c, nil
}
// Attach an optional counters object to track requests.
// This is only safe to use during initialization.
func (c *Client) SetCounters(counters *ClientCounters) {
c.counters = counters
}
// Override the shard timeout parameters.
// This is only safe to use during initialization.
func (c *Client) SetShardTimeouts(t *timing.ReqTimeouts) {
c.shardTimeout = t
}
// Override the CDC timeout parameters.
// This is only safe to use during initialization.
func (c *Client) SetCDCTimeouts(t *timing.ReqTimeouts) {
c.cdcTimeout = t
}
// Override the block timeout parameters.
// This is only safe to use during initialization.
func (c *Client) SetBlockTimeouts(t *timing.ReqTimeouts) {
c.blockTimeout = t
}
func (c *Client) IncreaseNumRegistryHandlersTo(numHandlers uint) {
c.registryConn.IncreaseNumHandlersTo(numHandlers)
}
var clientMtu uint16 = msgs.DEFAULT_UDP_MTU
// MTU used for large responses (file spans etc)
// This is only safe to use during initialization.
func SetMTU(mtu uint64) {
if mtu < msgs.DEFAULT_UDP_MTU {
panic(fmt.Errorf("mtu (%v) < DEFAULT_UDP_MTU (%v)", mtu, msgs.DEFAULT_UDP_MTU))
}
if mtu > msgs.MAX_UDP_MTU {
panic(fmt.Errorf("mtu (%v) > MAX_UDP_MTU (%v)", mtu, msgs.MAX_UDP_MTU))
}
clientMtu = uint16(mtu)
}
func NewClientDirectNoAddrs(
log *log.Logger,
localAddresses msgs.AddrsInfo,
) (c *Client, err error) {
c = &Client{
// randomize so we don't pick up responses meant for previous executions
requestIdCounter: rand.Uint64(),
fetchBlockBufs: sync.Pool{
New: func() any {
return bytes.NewBuffer([]byte{})
},
},
fetchBlockServices: false,
blockServicesLock: &sync.RWMutex{},
blockServiceToFailureDomain: make(map[msgs.BlockServiceId]msgs.FailureDomain),
}
c.shardTimeout = &DefaultShardTimeout
c.cdcTimeout = &DefaultCDCTimeout
c.blockTimeout = &DefaultBlockTimeout
if err := c.clientMetadata.init(log, c); err != nil {
return nil, err
}
// Ideally, for write/fetch we'd want to have one socket
// per block service. However we don't have flash yet, so
// it's hard to saturate a socket because of seek time.
//
// Currently we have 102 disks per server, 96 servers. 5
// bits splits the block services in 32 buckets, and the
// block service id is uniformly distributed.
//
// So we'll have roughly 30 connections per server for the first
// two, for a maximum of (currently) 30*96*2 + 102*96*2 ≈ 25k
// connections, which is within limits.
//
// (The exact expected number of connections per server is
// ~30.7447, I'll let you figure out why.)
c.writeBlockProcessors.init("write", &c.blockTimeout, localAddresses)
c.writeBlockProcessors.blockServiceBits = 5
c.fetchBlockProcessors.init("fetch", &c.blockTimeout, localAddresses)
c.fetchBlockProcessors.blockServiceBits = 5
c.eraseBlockProcessors.init("erase", &c.blockTimeout, localAddresses)
// we're not constrained by bandwidth here, we want to have requests
// for all block services in parallel.
c.eraseBlockProcessors.blockServiceBits = 5
// here we're also not constrained by bandwidth, but the requests
// take a long time, so have a separate channel from the erase ones.
c.checkBlockProcessors.init("check", &c.blockTimeout, localAddresses)
c.checkBlockProcessors.blockServiceBits = 5
return c, nil
}
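// Shard/CDC addresses are packed into a uint64 (port<<32 | big-endian IPv4
// in the low 32 bits) so that SetAddrs can update them atomically while
// requests are in flight.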
func uint64ToUDPAddr(addr uint64) *net.UDPAddr {
udpAddr := &net.UDPAddr{}
udpAddr.IP = []byte{byte(addr >> 24), byte(addr >> 16), byte(addr >> 8), byte(addr)}
udpAddr.Port = int(addr >> 32)
return udpAddr
}
func (c *Client) cdcAddrs() *[2]net.UDPAddr {
return &[2]net.UDPAddr{
*uint64ToUDPAddr(atomic.LoadUint64(&c.cdcRawAddr[0])),
*uint64ToUDPAddr(atomic.LoadUint64(&c.cdcRawAddr[1])),
}
}
func (c *Client) shardAddrs(shid msgs.ShardId) *[2]net.UDPAddr {
return &[2]net.UDPAddr{
*uint64ToUDPAddr(atomic.LoadUint64(&c.shardRawAddrs[shid][0])),
*uint64ToUDPAddr(atomic.LoadUint64(&c.shardRawAddrs[shid][1])),
}
}
// Modify the CDC and Shard IP addresses and ports dynamically.
// The update is atomic, this can be done at any time from any context.
func (c *Client) SetAddrs(
cdcAddrs msgs.AddrsInfo,
shardAddrs *[256]msgs.AddrsInfo,
) {
for i := 0; i < len(shardAddrs); i++ {
atomic.StoreUint64(
&c.shardRawAddrs[i][0],
uint64(shardAddrs[i].Addr1.Port)<<32|uint64(shardAddrs[i].Addr1.Addrs[0])<<24|uint64(shardAddrs[i].Addr1.Addrs[1])<<16|uint64(shardAddrs[i].Addr1.Addrs[2])<<8|uint64(shardAddrs[i].Addr1.Addrs[3]),
)
atomic.StoreUint64(
&c.shardRawAddrs[i][1],
uint64(shardAddrs[i].Addr2.Port)<<32|uint64(shardAddrs[i].Addr2.Addrs[0])<<24|uint64(shardAddrs[i].Addr2.Addrs[1])<<16|uint64(shardAddrs[i].Addr2.Addrs[2])<<8|uint64(shardAddrs[i].Addr2.Addrs[3]),
)
}
atomic.StoreUint64(
&c.cdcRawAddr[0],
uint64(cdcAddrs.Addr1.Port)<<32|uint64(cdcAddrs.Addr1.Addrs[0])<<24|uint64(cdcAddrs.Addr1.Addrs[1])<<16|uint64(cdcAddrs.Addr1.Addrs[2])<<8|uint64(cdcAddrs.Addr1.Addrs[3]),
)
atomic.StoreUint64(
&c.cdcRawAddr[1],
uint64(cdcAddrs.Addr2.Port)<<32|uint64(cdcAddrs.Addr2.Addrs[0])<<24|uint64(cdcAddrs.Addr2.Addrs[1])<<16|uint64(cdcAddrs.Addr2.Addrs[2])<<8|uint64(cdcAddrs.Addr2.Addrs[3]),
)
}
func (c *Client) Close() {
c.addrsRefreshClose <- struct{}{}
c.addrsRefreshTicker.Stop()
c.clientMetadata.close()
c.writeBlockProcessors.close()
c.fetchBlockProcessors.close()
c.eraseBlockProcessors.close()
c.checkBlockProcessors.close()
if c.registryConn != nil {
c.registryConn.Close()
}
}
// Not atomic between the read/write
func (c *Client) MergeDirectoryInfo(log *log.Logger, id msgs.InodeId, entry msgs.IsDirectoryInfoEntry) error {
packedEntry := msgs.DirectoryInfoEntry{
Body: bincode.Pack(entry),
Tag: entry.Tag(),
}
statResp := msgs.StatDirectoryResp{}
if err := c.ShardRequest(log, id.Shard(), &msgs.StatDirectoryReq{Id: id}, &statResp); err != nil {
return err
}
info := statResp.Info
found := false
for i := 0; i < len(info.Entries); i++ {
if info.Entries[i].Tag == packedEntry.Tag {
info.Entries[i] = packedEntry
found = true
break
}
}
if !found {
info.Entries = append(info.Entries, packedEntry)
}
if err := c.ShardRequest(log, id.Shard(), &msgs.SetDirectoryInfoReq{Id: id, Info: info}, &msgs.SetDirectoryInfoResp{}); err != nil {
return err
}
return nil
}
// Not atomic between the read/write
func (c *Client) RemoveDirectoryInfoEntry(log *log.Logger, id msgs.InodeId, tag msgs.DirectoryInfoTag) error {
statResp := msgs.StatDirectoryResp{}
if err := c.ShardRequest(log, id.Shard(), &msgs.StatDirectoryReq{Id: id}, &statResp); err != nil {
return err
}
info := statResp.Info
for i := 0; i < len(info.Entries); i++ {
if info.Entries[i].Tag == tag {
info.Entries = append(info.Entries[:i], info.Entries[i+1:]...)
break
}
}
if err := c.ShardRequest(log, id.Shard(), &msgs.SetDirectoryInfoReq{Id: id, Info: info}, &msgs.SetDirectoryInfoResp{}); err != nil {
return err
}
return nil
}
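// ResolveDirectoryInfoEntry resolves a directory info entry (e.g. a policy)
// for dirId, walking up through the owning directories until a matching tag
// is found, and caching both the entry and the inheritance chain in
// dirInfoCache.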
func (c *Client) ResolveDirectoryInfoEntry(
log *log.Logger,
dirInfoCache *DirInfoCache,
dirId msgs.InodeId,
entry msgs.IsDirectoryInfoEntry, // output will be stored in here
) (inheritedFrom msgs.InodeId, err error) {
statReq := msgs.StatDirectoryReq{
Id: dirId,
}
statResp := msgs.StatDirectoryResp{}
visited := []msgs.InodeId{}
TraverseDirectories:
for {
inheritedFrom = dirInfoCache.LookupCachedDirInfoEntry(statReq.Id, entry)
if inheritedFrom != msgs.NULL_INODE_ID {
break
}
visited = append(visited, statReq.Id)
if err := c.ShardRequest(log, statReq.Id.Shard(), &statReq, &statResp); err != nil {
return msgs.NULL_INODE_ID, err
}
for i := len(statResp.Info.Entries) - 1; i >= 0; i-- {
respEntry := &statResp.Info.Entries[i]
if entry.Tag() == respEntry.Tag {
inheritedFrom = statReq.Id
if err := bincode.Unpack(respEntry.Body, entry); err != nil {
return msgs.NULL_INODE_ID, fmt.Errorf("could not decode dir info entry for dir %v, inherited from %v, with tag %v, body %v: %v", dirId, statReq.Id, entry.Tag(), respEntry.Body, err)
}
break TraverseDirectories
}
}
if statReq.Id == msgs.ROOT_DIR_INODE_ID {
return msgs.NULL_INODE_ID, fmt.Errorf("could not find directory info entry with tag %v %+v", entry.Tag(), statResp)
}
statReq.Id = statResp.Owner
}
// since we're traversing upwards, and we want to insert the policies first,
// update the cache from root to leaf
for i := len(visited) - 1; i >= 0; i-- {
id := visited[i]
if id == inheritedFrom {
dirInfoCache.UpdatePolicy(id, entry)
}
dirInfoCache.UpdateInheritedFrom(id, entry.Tag(), inheritedFrom)
}
return inheritedFrom, nil
}
// High-level helper function to take a string path and return the inode and parent inode
func (c *Client) ResolvePathWithParent(log *log.Logger, path string) (id msgs.InodeId, creationTime msgs.TernTime, parent msgs.InodeId, err error) {
if !filepath.IsAbs(path) {
return msgs.NULL_INODE_ID, 0, msgs.NULL_INODE_ID, fmt.Errorf("expected absolute path, got '%v'", path)
}
parent = msgs.NULL_INODE_ID
id = msgs.ROOT_DIR_INODE_ID
for _, segment := range strings.Split(filepath.Clean(path), "/")[1:] {
if segment == "" {
continue
}
resp := msgs.LookupResp{}
if err := c.ShardRequest(log, id.Shard(), &msgs.LookupReq{DirId: id, Name: segment}, &resp); err != nil {
return msgs.NULL_INODE_ID, 0, msgs.NULL_INODE_ID, err
}
parent = id
id = resp.TargetId
creationTime = resp.CreationTime
}
return id, creationTime, parent, nil
}
// High-level helper function to take a string path and return the inode
func (c *Client) ResolvePath(log *log.Logger, path string) (msgs.InodeId, error) {
id, _, _, err := c.ResolvePathWithParent(log, path)
return id, err
}
func writeBlockSendArgs(block *msgs.AddSpanInitiateBlockInfo, r io.ReadSeeker, size uint32, crc msgs.Crc, extra any) *sendArgs {
return &sendArgs{
block.BlockServiceId,
block.BlockServiceAddrs,
&msgs.WriteBlockReq{
BlockId: block.BlockId,
Crc: crc,
Size: size,
Certificate: block.Certificate,
},
r,
&msgs.WriteBlockResp{},
nil,
extra,
}
}
// An asynchronous version of [WriteBlock] that is currently unused.
func (c *Client) StartWriteBlock(log *log.Logger, block *msgs.AddSpanInitiateBlockInfo, r io.ReadSeeker, size uint32, crc msgs.Crc, extra any, completion chan *blockCompletion) error {
return c.writeBlockProcessors.send(log, writeBlockSendArgs(block, r, size, crc, extra), completion)
}
func retriableBlockError(err error) bool {
return errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.EPIPE) || errors.Is(err, syscall.ECONNRESET) || errors.Is(err, io.EOF) || errors.Is(err, net.ErrClosed)
}
func (c *Client) singleBlockReq(log *log.Logger, timeouts *timing.ReqTimeouts, processor *blocksProcessors, args *sendArgs) (msgs.BlocksResponse, error) {
if timeouts == nil {
timeouts = c.blockTimeout
}
startedAt := time.Now()
for attempt := 0; ; attempt++ {
ch := make(chan *blockCompletion, 1)
err := processor.send(log, args, ch)
if err != nil {
log.Debug("failed to send block request to %v:%v %v:%v: %v", net.IP(args.addrs.Addr1.Addrs[:]), args.addrs.Addr1.Port, net.IP(args.addrs.Addr2.Addrs[:]), args.addrs.Addr2.Port, err)
return nil, err
}
resp := <-ch
err = resp.Error
if err == nil {
return resp.Resp, nil
}
if retriableBlockError(err) {
next := timeouts.Next(startedAt)
if next == 0 {
log.ErrorNoAlert("block request to %v %v:%v %v:%v failed with retriable error (attempt %v), will not retry since time is up: %v", args.blockService, net.IP(args.addrs.Addr1.Addrs[:]), args.addrs.Addr1.Port, net.IP(args.addrs.Addr2.Addrs[:]), args.addrs.Addr2.Port, attempt, err)
return nil, err
}
log.ErrorNoAlert("block request to %v %v:%v %v:%v failed with retriable error (attempt %v), will retry in %v: %v", args.blockService, net.IP(args.addrs.Addr1.Addrs[:]), args.addrs.Addr1.Port, net.IP(args.addrs.Addr2.Addrs[:]), args.addrs.Addr2.Port, attempt, next, err)
if args.reqAdditionalBody != nil {
_, err := args.reqAdditionalBody.Seek(0, io.SeekStart)
if err != nil {
log.RaiseAlertStack("", 2, "could not seek req additional body, will fail request: %v", err)
return nil, err
}
}
time.Sleep(next)
} else {
// No alert here because there are many circumstances where errors are
// expected (e.g. scrubbing), and we want the caller to handle this.
return nil, err
}
}
}
func (c *Client) WriteBlock(log *log.Logger, timeouts *timing.ReqTimeouts, block *msgs.AddSpanInitiateBlockInfo, r io.ReadSeeker, size uint32, crc msgs.Crc) (proof [8]byte, err error) {
resp, err := c.singleBlockReq(log, timeouts, &c.writeBlockProcessors, writeBlockSendArgs(block, r, size, crc, nil))
if err != nil {
return proof, err
}
return resp.(*msgs.WriteBlockResp).Proof, nil
}
func fetchBlockSendArgs(blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32, w io.ReaderFrom, extra any, crc msgs.Crc) *sendArgs {
return &sendArgs{
blockService.Id,
blockService.Addrs,
&msgs.FetchBlockWithCrcReq{
BlockId: blockId,
BlockCrc: crc,
Offset: offset,
Count: count,
},
nil,
&msgs.FetchBlockWithCrcResp{},
w,
extra,
}
}
// An asynchronous version of [FetchBlock] that is currently unused.
func (c *Client) StartFetchBlock(log *log.Logger, blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32, w io.ReaderFrom, extra any, completion chan *blockCompletion, crc msgs.Crc) error {
return c.fetchBlockProcessors.send(log, fetchBlockSendArgs(blockService, blockId, offset, count, w, extra, crc), completion)
}
// Return a buffer that was provided by [FetchBlock] to the internal pool.
func (c *Client) PutFetchedBlock(body *bytes.Buffer) {
c.fetchBlockBufs.Put(body)
}
// Retrieve a single block from a block service.
//
// The block is identified by the BlockService (which holds the id, status flags and IPs/ports) and the BlockId.
// Offset and count control the position within the block and the amount of data that is read.
// If timeouts is nil the client-level default applies, otherwise the given timeouts are used.
//
// The function returns a buffer that is allocated from an internal pool. Once you have finished with this buffer it must
// be returned via the [PutFetchedBlock] function.
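//
// A minimal usage sketch (blockService, blockId, size and crc are assumed to
// come from span metadata):
//
//	buf, err := c.FetchBlock(log, nil, blockService, blockId, 0, size, crc)
//	if err != nil {
//		return err
//	}
//	defer c.PutFetchedBlock(buf)
//	data := buf.Bytes() // size bytes, CRC already verified per page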
func (c *Client) FetchBlock(log *log.Logger, timeouts *timing.ReqTimeouts, blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32, crc msgs.Crc) (body *bytes.Buffer, err error) {
buf := c.fetchBlockBufs.Get().(*bytes.Buffer)
buf.Reset()
_, err = c.singleBlockReq(log, timeouts, &c.fetchBlockProcessors, fetchBlockSendArgs(blockService, blockId, offset, count, buf, nil, crc))
if err != nil {
return nil, err
}
return buf, nil
}
func eraseBlockSendArgs(block *msgs.RemoveSpanInitiateBlockInfo, extra any) *sendArgs {
return &sendArgs{
block.BlockServiceId,
block.BlockServiceAddrs,
&msgs.EraseBlockReq{
BlockId: block.BlockId,
Certificate: block.Certificate,
},
nil,
&msgs.EraseBlockResp{},
nil,
extra,
}
}
// An asynchronous version of [EraseBlock] that is currently unused.
func (c *Client) StartEraseBlock(log *log.Logger, block *msgs.RemoveSpanInitiateBlockInfo, extra any, completion chan *blockCompletion) error {
return c.eraseBlockProcessors.send(log, eraseBlockSendArgs(block, extra), completion)
}
func (c *Client) EraseBlock(log *log.Logger, block *msgs.RemoveSpanInitiateBlockInfo) (proof [8]byte, err error) {
resp, err := c.singleBlockReq(log, nil, &c.eraseBlockProcessors, eraseBlockSendArgs(block, nil))
if err != nil {
return proof, err
}
return resp.(*msgs.EraseBlockResp).Proof, nil
}
func (c *Client) RegistryAddress() string {
return c.registryAddress
}
func (c *Client) EraseDecommissionedBlock(block *msgs.RemoveSpanInitiateBlockInfo) (proof [8]byte, err error) {
resp, err := c.registryConn.Request(&msgs.EraseDecommissionedBlockReq{block.BlockServiceId, block.BlockId, block.Certificate})
if err != nil {
return [8]byte{}, err
}
return resp.(*msgs.EraseDecommissionedBlockResp).Proof, nil
}
func checkBlockSendArgs(blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc, extra any) *sendArgs {
return &sendArgs{
blockService.Id,
blockService.Addrs,
&msgs.CheckBlockReq{
BlockId: blockId,
Size: size,
Crc: crc,
},
nil,
&msgs.CheckBlockResp{},
nil,
extra,
}
}
// An asynchronous version of [CheckBlock] that is currently unused.
func (c *Client) StartCheckBlock(log *log.Logger, blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc, extra any, completion chan *blockCompletion) error {
return c.checkBlockProcessors.send(log, checkBlockSendArgs(blockService, blockId, size, crc, extra), completion)
}
func (c *Client) CheckBlock(log *log.Logger, blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc) error {
_, err := c.singleBlockReq(log, nil, &c.checkBlockProcessors, checkBlockSendArgs(blockService, blockId, size, crc, nil))
return err
}
func (c *Client) SetFetchBlockServices() {
var needToInitFetch = false
func() {
c.blockServicesLock.Lock()
defer c.blockServicesLock.Unlock()
needToInitFetch = !c.fetchBlockServices
c.fetchBlockServices = true
}()
if needToInitFetch {
c.refreshAddrs(c.registryConn.log)
}
}
func (c *Client) GetFailureDomainForBlockService(blockServiceId msgs.BlockServiceId) (msgs.FailureDomain, bool) {
c.blockServicesLock.RLock()
defer c.blockServicesLock.RUnlock()
if !c.fetchBlockServices {
panic("GetFailureDomainForBlockService called and flag to keep block services information not set")
}
failureDomain, ok := c.blockServiceToFailureDomain[blockServiceId]
return failureDomain, ok
}
func (c *Client) RegistryRequest(logger *log.Logger, reqBody msgs.RegistryRequest) (msgs.RegistryResponse, error) {
return c.registryConn.Request(reqBody)
}