// ternfs-XTXMarkets/go/lib/client.go
package lib
import (
"bytes"
"container/heap"
"crypto/cipher"
"encoding/binary"
"errors"
"fmt"
"io"
"math/rand"
"net"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"xtx/eggsfs/bincode"
"xtx/eggsfs/msgs"
"xtx/eggsfs/wyhash"
)
type ReqCounters struct {
Timings Timings
Attempts uint64
}
func MergeReqCounters(cs []ReqCounters) *ReqCounters {
counters := cs[0] // important to copy
for i := 1; i < len(cs); i++ {
counters.Attempts += cs[i].Attempts
counters.Timings.Merge(&cs[i].Timings)
}
return &counters
}
type ClientCounters struct {
Shard map[uint8]*[256]ReqCounters
CDC map[uint8]*ReqCounters
}
func NewClientCounters() *ClientCounters {
counters := ClientCounters{
Shard: make(map[uint8]*[256]ReqCounters),
CDC: make(map[uint8]*ReqCounters),
}
for _, k := range msgs.AllShardMessageKind {
// max = ~1min
var shards [256]ReqCounters
counters.Shard[uint8(k)] = &shards
for i := 0; i < 256; i++ {
shards[i].Timings = *NewTimings(40, time.Microsecond*10, 1.5)
}
}
for _, k := range msgs.AllCDCMessageKind {
// max = ~2min
counters.CDC[uint8(k)] = &ReqCounters{
Timings: *NewTimings(35, time.Millisecond, 1.5),
}
}
return &counters
}
func (counters *ClientCounters) Log(log *Logger) {
formatCounters := func(c *ReqCounters) {
totalCount := uint64(0)
for _, bin := range c.Timings.Histogram() {
totalCount += bin.Count
}
log.Info(" count: %v", totalCount)
if totalCount == 0 {
log.Info(" attempts: %v", c.Attempts)
} else {
log.Info(" attempts: %v (%v)", c.Attempts, float64(c.Attempts)/float64(totalCount))
}
log.Info(" total time: %v", c.Timings.TotalTime())
log.Info(" avg time: %v", c.Timings.Mean())
log.Info(" median time: %v", c.Timings.Median())
hist := bytes.NewBuffer([]byte{})
first := true
countSoFar := uint64(0)
lowerBound := time.Duration(0)
for _, bin := range c.Timings.Histogram() {
if bin.Count == 0 {
continue
}
countSoFar += bin.Count
if first {
fmt.Fprintf(hist, "%v < ", lowerBound)
} else {
fmt.Fprintf(hist, ", ")
}
first = false
fmt.Fprintf(hist, "%v (%0.2f%%) < %v", bin.Count, float64(countSoFar*100)/float64(totalCount), bin.UpperBound)
}
log.Info(" hist: %v", hist.String())
}
var shardTime time.Duration
for _, k := range msgs.AllShardMessageKind {
for i := 0; i < 256; i++ {
shardTime += counters.Shard[uint8(k)][i].Timings.TotalTime()
}
}
log.Info("Shard stats (total shard time %v):", shardTime)
for _, k := range msgs.AllShardMessageKind {
c := MergeReqCounters(counters.Shard[uint8(k)][:])
if c.Attempts == 0 {
continue
}
log.Info(" %v", k)
formatCounters(c)
}
var cdcTime time.Duration
for _, k := range msgs.AllCDCMessageKind {
cdcTime += counters.CDC[uint8(k)].Timings.TotalTime()
}
log.Info("CDC stats (total CDC time %v):", cdcTime)
for _, k := range msgs.AllCDCMessageKind {
c := counters.CDC[uint8(k)]
if c.Attempts == 0 {
continue
}
log.Info(" %v", k)
formatCounters(c)
}
}
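// A minimal usage sketch, assuming the caller owns a *Client `client` and a
// *Logger `log` (hypothetical names): attach fresh counters and dump the
// histograms periodically.
//
//	counters := NewClientCounters()
//	client.SetCounters(counters)
//	go func() {
//		for range time.Tick(time.Minute) {
//			counters.Log(log)
//		}
//	}()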
var DefaultShardTimeout = ReqTimeouts{
Initial: 100 * time.Millisecond,
Max: 2 * time.Second,
Overall: 10 * time.Second,
Growth: 1.5,
Jitter: 0.1,
rand: wyhash.Rand{State: 0},
}
var DefaultCDCTimeout = ReqTimeouts{
Initial: time.Second,
Max: 10 * time.Second,
Overall: time.Minute,
Growth: 1.5,
Jitter: 0.1,
rand: wyhash.Rand{State: 0},
}
var DefaultBlockTimeout = ReqTimeouts{
Initial: time.Second,
Max: 10 * time.Second,
Overall: 5 * time.Minute,
Growth: 1.5,
Jitter: 0.1,
rand: wyhash.Rand{State: 0},
}
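// How the defaults above play out, assuming ReqTimeouts.Next returns the next
// wait (scaled by Growth, randomized by Jitter, capped at Max) and 0 once
// Overall has elapsed since the start time it's given: DefaultShardTimeout
// waits roughly 100ms, 150ms, 225ms, ... capped at 2s between attempts and
// gives up after 10s overall; the CDC and block defaults follow the same shape
// with longer horizons. This is an inference from the field names and from how
// Next is used in singleBlockReq below.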
type metadataProcessorRequest struct {
requestId uint64
timeout time.Duration
shard int16 // -1 = cdc
req bincode.Packable
resp bincode.Unpackable
extra any
respCh chan *metadataProcessorResponse
// filled in by request processor
deadline time.Time
index int // index in the heap
}
type metadataProcessorResponse struct {
requestId uint64
resp any
err error
}
type metadataRequestsPQ []*metadataProcessorRequest
func (pq metadataRequestsPQ) Len() int { return len(pq) }
func (pq metadataRequestsPQ) Less(i, j int) bool {
return pq[i].deadline.UnixNano() < pq[j].deadline.UnixNano()
}
func (pq metadataRequestsPQ) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
pq[i].index = i
pq[j].index = j
}
func (pq *metadataRequestsPQ) Push(x any) {
n := len(*pq)
item := x.(*metadataProcessorRequest)
item.index = n
*pq = append(*pq, item)
}
func (pq *metadataRequestsPQ) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
old[n-1] = nil // avoid memory leak
item.index = -1 // for safety
*pq = old[0 : n-1]
return item
}
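// Every metadata request and response starts with a fixed 13-byte header --
// protocol (little-endian uint32), request id (little-endian uint64), kind
// (uint8) -- which is what the recurring 4+8+1 offsets below refer to.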
type rawMetadataResponse struct {
receivedAt time.Time
protocol uint32
requestId uint64
kind uint8
respLen int
buf *[]byte // the buf contains the header
}
type clientMetadata struct {
client *Client
sock *net.UDPConn
requestsById map[uint64]*metadataProcessorRequest // requests we've sent, by req id
requestsByTimeout metadataRequestsPQ // requests we've sent, by timeout (earlier first)
earlyRequests map[uint64]rawMetadataResponse // responses we've received for requests we haven't yet seen ourselves send. should be uncommon.
quitResponseProcessor chan struct{} // channel to quit the response processor, which in turn closes the socket
incoming chan *metadataProcessorRequest // channel where user requests come in
inFlight chan *metadataProcessorRequest // channel going from request processor to response processor
rawResponses chan rawMetadataResponse // channel going from the socket drainer to the response processor
responsesBufs chan *[]byte // channel to store a cache of buffers to read into
timeoutTicker *time.Ticker // ticker prompting the response processor to time out requests
}
var whichMetadataAddr int
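// init wires up three goroutines connected by channels: processRequests
// consumes `incoming`, packs and sends each request over the UDP socket, and
// hands it to `inFlight`; drainSocket reads raw datagrams into `rawResponses`;
// processResponses matches raw responses to in-flight requests (stashing early
// ones in `earlyRequests`), parses them, and expires requests on `timeoutTicker`.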
func (cm *clientMetadata) init(log *Logger, client *Client) error {
log.Debug("initiating clientMetadata")
defer log.Debug("finished initializing clientMetadata")
cm.client = client
sock, err := net.ListenPacket("udp", ":0")
if err != nil {
return err
}
cm.sock = sock.(*net.UDPConn)
// 1MiB buffer / ~100 bytes per response ~ 10k responses in the
// pipe. 100 bytes is kinda conservative.
if err := cm.sock.SetReadBuffer(1 << 20); err != nil {
cm.sock.Close()
return err
}
cm.requestsById = make(map[uint64]*metadataProcessorRequest)
cm.requestsByTimeout = make(metadataRequestsPQ, 0)
cm.earlyRequests = make(map[uint64]rawMetadataResponse)
cm.quitResponseProcessor = make(chan struct{})
cm.incoming = make(chan *metadataProcessorRequest, 10_000)
cm.inFlight = make(chan *metadataProcessorRequest, 10_000)
cm.rawResponses = make(chan rawMetadataResponse, 10_000)
cm.responsesBufs = make(chan *[]byte, 128)
// pre-fill the buffer cache
for i := 0; i < cap(cm.responsesBufs); i++ {
buf := make([]byte, clientMtu)
cm.responsesBufs <- &buf
}
cm.timeoutTicker = time.NewTicker(DefaultShardTimeout.Initial / 2)
go cm.processRequests(log)
go cm.processResponses(log)
go cm.drainSocket(log)
return nil
}
func (cm *clientMetadata) close() {
cm.timeoutTicker.Stop()
cm.quitResponseProcessor <- struct{}{}
cm.incoming <- nil
}
// terminates when cm.incoming gets nil
func (cm *clientMetadata) processRequests(log *Logger) {
buf := bytes.NewBuffer([]byte{})
for {
req := <-cm.incoming
if req == nil {
log.Debug("got nil request in request processor, winding down")
return
}
dontWait := req.resp == nil
log.Debug("sending request %T %+v req id %v to shard %v", req.req, req.req, req.requestId, req.shard)
buf.Reset()
var addrs *[2]net.UDPAddr
var kind uint8
var protocol uint32
if req.shard >= 0 { // shard
addrs = cm.client.ShardAddrs(msgs.ShardId(req.shard))
kind = uint8(req.req.(msgs.ShardRequest).ShardRequestKind())
protocol = msgs.SHARD_REQ_PROTOCOL_VERSION
} else { // CDC
addrs = cm.client.CDCAddrs()
kind = uint8(req.req.(msgs.CDCRequest).CDCRequestKind())
protocol = msgs.CDC_REQ_PROTOCOL_VERSION
}
binary.Write(buf, binary.LittleEndian, protocol)
binary.Write(buf, binary.LittleEndian, req.requestId)
binary.Write(buf, binary.LittleEndian, kind)
if err := req.req.Pack(buf); err != nil {
log.RaiseAlert("could not pack request %v to shard %v: %v", req.req, req.shard, err)
if !dontWait {
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
resp: nil,
}
}
// keep running even if the socket is totally broken to process all the requests
continue
}
addr := &addrs[whichMetadataAddr%2]
if addr.Port == 0 {
addr = &addrs[0]
}
whichMetadataAddr++
written, err := cm.sock.WriteToUDP(buf.Bytes(), addr)
if err != nil {
log.RaiseAlert("could not send request %v to shard %v addr %v: %v", req.req, req.shard, addr, err)
if !dontWait {
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
resp: nil,
}
}
// keep running even if the socket is totally broken to process all the requests
continue
}
if written != len(buf.Bytes()) {
panic(fmt.Errorf("%v != %v", written, len(buf.Bytes())))
}
if !dontWait {
req.deadline = time.Now().Add(req.timeout)
cm.inFlight <- req
}
}
}
func (cm *clientMetadata) parseResponse(log *Logger, req *metadataProcessorRequest, rawResp *rawMetadataResponse) {
// return the response buffer to the cache at the end (best effort)
defer func() {
select {
case cm.responsesBufs <- rawResp.buf:
default:
}
}()
// check protocol
if req.shard < 0 { // CDC
if rawResp.protocol != msgs.CDC_RESP_PROTOCOL_VERSION {
log.RaiseAlert("got bad cdc protocol %v for request id %v, ignoring", rawResp.protocol, req.requestId)
return
}
} else {
if rawResp.protocol != msgs.SHARD_RESP_PROTOCOL_VERSION {
log.RaiseAlert("got bad shard protocol %v for request id %v, shard %v, ignoring", rawResp.protocol, req.shard, req.requestId)
return
}
}
// remove everywhere
delete(cm.earlyRequests, req.requestId)
if _, found := cm.requestsById[req.requestId]; found {
delete(cm.requestsById, req.requestId)
heap.Remove(&cm.requestsByTimeout, req.index)
}
if rawResp.kind == msgs.ERROR {
var err error
if rawResp.respLen != 4+8+1+2 {
log.RaiseAlert("bad error response length %v, expected %v", rawResp.respLen, 4+8+1+2)
err = msgs.MALFORMED_RESPONSE
} else {
err = msgs.ErrCode(binary.LittleEndian.Uint16((*rawResp.buf)[4+8+1:]))
}
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
resp: nil,
}
} else {
// check kind
if req.shard < 0 { // CDC
expectedKind := req.req.(msgs.CDCRequest).CDCRequestKind()
if uint8(expectedKind) != rawResp.kind {
log.RaiseAlert("got bad cdc kind %v for request id %v, expected %v", msgs.CDCMessageKind(rawResp.kind), req.requestId, expectedKind)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: msgs.MALFORMED_RESPONSE,
resp: nil,
}
return
}
} else {
expectedKind := req.req.(msgs.ShardRequest).ShardRequestKind()
if uint8(expectedKind) != rawResp.kind {
log.RaiseAlert("got bad shard kind %v for request id %v, shard %v, expected %v", msgs.ShardMessageKind(rawResp.kind), req.requestId, req.shard, expectedKind)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: msgs.MALFORMED_RESPONSE,
resp: nil,
}
return
}
}
// unpack
if err := bincode.Unpack((*rawResp.buf)[4+8+1:rawResp.respLen], req.resp); err != nil {
log.RaiseAlert("could not unpack resp %T for request id %v, shard %v: %v", req.resp, req.requestId, req.shard, err)
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: err,
resp: nil,
}
return
}
log.Debug("received resp %v req id %v from shard %v", req.resp, req.requestId, req.shard)
// done
req.respCh <- &metadataProcessorResponse{
requestId: req.requestId,
err: nil,
resp: req.resp,
}
}
}
// terminates when `cm.quitResponseProcessor` gets a message
func (cm *clientMetadata) processResponses(log *Logger) {
for {
select {
case req := <-cm.inFlight:
if rawResp, found := cm.earlyRequests[req.requestId]; found {
// uncommon case: we have a response for this already.
cm.parseResponse(log, req, &rawResp)
} else {
// common case: we don't have the response yet, put it in the data structures and wait.
// if the request was there before, we remove it from the heap so that we don't have
// dupes and the deadline is right
if _, found := cm.requestsById[req.requestId]; found {
heap.Remove(&cm.requestsByTimeout, req.index)
}
cm.requestsById[req.requestId] = req
heap.Push(&cm.requestsByTimeout, req)
}
case rawResp := <-cm.rawResponses:
if req, found := cm.requestsById[rawResp.requestId]; found {
// common case, the request is already there
cm.parseResponse(log, req, &rawResp)
} else {
// uncommon case, the request is missing
cm.earlyRequests[rawResp.requestId] = rawResp
}
case now := <-cm.timeoutTicker.C:
// expire requests
for len(cm.requestsById) > 0 {
first := cm.requestsByTimeout[0]
if now.After(first.deadline) {
log.Debug("request %v %T to shard %v has timed out", first.requestId, first.req, first.shard)
heap.Pop(&cm.requestsByTimeout)
delete(cm.requestsById, first.requestId)
// consumer might very plausibly be gone by now,
// don't risk it
go func() {
first.respCh <- &metadataProcessorResponse{
requestId: first.requestId,
err: msgs.TIMEOUT,
resp: nil,
}
}()
} else {
log.Debug("first request %v %T has not passed deadline %v", first.requestId, first.req, first.deadline)
break
}
}
// expire early responses that never matched a request -- this map should always be small
for reqId, rawReq := range cm.earlyRequests {
if now.Sub(rawReq.receivedAt) > 10*time.Minute {
delete(cm.earlyRequests, reqId)
}
}
case <-cm.quitResponseProcessor:
log.Info("got quit signal, closing socket and terminating")
cm.sock.Close()
return
}
}
}
// terminates when the socket is closed
func (cm *clientMetadata) drainSocket(log *Logger) {
for {
var buf *[]byte
select {
case buf = <-cm.responsesBufs:
default:
}
if buf == nil {
log.Debug("allocating new MTU buffer")
bufv := make([]byte, clientMtu)
buf = &bufv
}
read, _, err := cm.sock.ReadFromUDP(*buf)
if err != nil {
if errors.Is(err, net.ErrClosed) {
log.Info("socket is closed, winding down")
return
} else {
log.RaiseAlert("got error when reading socket: %v", err)
}
}
if read < 4+8+1 {
log.RaiseAlert("got runt metadata message, expected at least %v bytes, got %v", 4+8+1, read)
continue
}
rawResp := rawMetadataResponse{
receivedAt: time.Now(),
respLen: read,
buf: buf,
protocol: binary.LittleEndian.Uint32(*buf),
requestId: binary.LittleEndian.Uint64((*buf)[4:]),
kind: (*buf)[4+8],
}
cm.rawResponses <- rawResp
}
}
type BlockCompletion struct {
Resp msgs.BlocksResponse
Extra any
Error error
}
type clientBlockResponse struct {
req msgs.BlocksRequest
resp msgs.BlocksResponse
extra any
// when fetching a block, the block body gets written into this
additionalBodyWriter io.ReaderFrom
// stores the error, if any
err error
// receives the completion when we're done
completionChan chan *BlockCompletion
}
func (resp *clientBlockResponse) done(log *Logger, addr1 *net.TCPAddr, addr2 *net.TCPAddr, extra any, err error) {
if resp.err == nil && err != nil {
log.InfoStack(1, "failing request %T %+v addr1=%+v addr2=%+v extra=%+v: %v", resp.req, resp.req, addr1, addr2, extra, err)
resp.err = err
}
completion := &BlockCompletion{
Resp: resp.resp,
Error: resp.err,
Extra: resp.extra,
}
resp.completionChan <- completion
}
type clientBlockRequest struct {
blockService msgs.BlockServiceId
req msgs.BlocksRequest
additionalBodyReader io.Reader // when writing a block, this is sent right after the request
resp *clientBlockResponse
}
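// Each TCP connection carries a generation number, bumped every time storeConn
// installs a new connection. Responses queued by the request processor
// remember the generation they were written on; if the response processor
// later sees a different generation (or no connection at all), the write side
// has already reconnected, so the pending response is failed with io.EOF
// rather than read off an unrelated stream.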
type blocksProcessorConn struct {
conn *net.TCPConn
generation uint64
}
type clientBlockResponseWithGeneration struct {
generation uint64
resp *clientBlockResponse
}
type blocksProcessor struct {
reqChan chan *clientBlockRequest
inFlightReqChan chan clientBlockResponseWithGeneration
addr1 net.TCPAddr
addr2 net.TCPAddr
what string
_conn *blocksProcessorConn // this must be loaded through loadConn
}
func (proc *blocksProcessor) loadConn() *blocksProcessorConn {
return (*blocksProcessorConn)(atomic.LoadPointer((*unsafe.Pointer)(unsafe.Pointer(&proc._conn))))
}
func (proc *blocksProcessor) storeConn(conn *net.TCPConn) *blocksProcessorConn {
gen := proc.loadConn().generation
newConn := &blocksProcessorConn{
conn: conn,
generation: gen + 1,
}
atomic.StorePointer((*unsafe.Pointer)(unsafe.Pointer(&proc._conn)), unsafe.Pointer(newConn))
return newConn
}
var whichBlockIp uint
func (proc *blocksProcessor) connect(log *Logger) (*net.TCPConn, error) {
var err error
whichBlockIp++
for i := whichBlockIp; i < whichBlockIp+2; i++ {
var addr *net.TCPAddr
if i&1 == 0 {
addr = &proc.addr1
} else {
addr = &proc.addr2
}
if addr.Port == 0 {
continue
}
log.Debug("trying to connect to block service %v", addr)
var sock *net.TCPConn
sock, err = net.DialTCP("tcp4", nil, addr)
if err == nil {
log.Debug("connected to block service at %v", addr)
return sock, nil
}
log.Info("could not connect to block service %v, might try next connection: %v", addr, err)
}
if err == nil {
panic(fmt.Errorf("impossible: got out without errors"))
}
return nil, err
}
// From <https://stackoverflow.com/a/58664631>, checks if a connection
// is still alive.
func connCheck(conn *net.TCPConn) error {
var sysErr error = nil
rc, err := conn.SyscallConn()
if err != nil {
return err
}
err = rc.Read(func(fd uintptr) bool {
var buf []byte = []byte{0}
n, _, err := syscall.Recvfrom(int(fd), buf, syscall.MSG_PEEK|syscall.MSG_DONTWAIT)
switch {
case n == 0 && err == nil:
sysErr = io.EOF
case err == syscall.EAGAIN || err == syscall.EWOULDBLOCK:
sysErr = nil
default:
sysErr = err
}
return true
})
if err != nil {
return err
}
return sysErr
}
func (proc *blocksProcessor) processRequests(log *Logger) {
log.Debug("%v: starting request processor for addr1=%v addr2=%v", proc.what, proc.addr1, proc.addr2)
// one iteration = one request
for {
conn := proc.loadConn()
req := <-proc.reqChan
if req == nil {
log.Debug("%v: got nil request, tearing down", proc.what)
if conn.conn != nil {
conn.conn.Close()
}
proc.inFlightReqChan <- clientBlockResponseWithGeneration{
resp: nil, // this tears down the response processor
generation: 0,
}
return
}
// if the in-flight queue is empty, check that the conn is alive, otherwise we might
// still succeed sending, but inevitably fail when reading.
if conn.conn != nil && len(proc.inFlightReqChan) == 0 {
if err := connCheck(conn.conn); err != nil {
log.Debug("connection for addr1=%+v addr2=%+v is dead: %v", proc.addr1, proc.addr2, err)
conn.conn.Close()
conn.conn = nil
}
}
// no usable connection, establish a fresh one
if conn.conn == nil {
tcpConn, err := proc.connect(log)
if err != nil { // we couldn't connect, not much to do
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
continue
}
// we did connect
conn = proc.storeConn(tcpConn)
}
log.Debug("writing block request %T %+v for addr1=%v addr2=%v", req.req, req.req, proc.addr1, proc.addr2)
if err := WriteBlocksRequest(log, conn.conn, req.blockService, req.req); err != nil {
log.Info("got error when writing block request of kind %v in %v->%v: %v", req.req.BlocksRequestKind(), conn.conn.LocalAddr(), conn.conn.RemoteAddr(), err)
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
conn.conn.Close()
conn.conn = nil
continue
}
if req.req.BlocksRequestKind() == msgs.WRITE_BLOCK {
writeReq := req.req.(*msgs.WriteBlockReq)
lr := &io.LimitedReader{
R: req.additionalBodyReader,
N: int64(writeReq.Size),
}
log.Debug("writing block body to %v->%v", conn.conn.LocalAddr(), conn.conn.RemoteAddr())
writtenBytes, err := conn.conn.ReadFrom(lr)
if err != nil || writtenBytes < int64(writeReq.Size) {
if err == nil {
err = io.EOF
}
log.Info("got error when writing block body: %v", err)
req.resp.done(log, &proc.addr1, &proc.addr2, req.resp.extra, err)
conn.conn.Close()
conn.conn = nil
continue
}
}
// we wrote it fine, proceed
proc.inFlightReqChan <- clientBlockResponseWithGeneration{
generation: conn.generation,
resp: req.resp,
}
}
}
func (proc *blocksProcessor) processResponses(log *Logger) {
log.Debug("%v: starting response processor for addr1=%v addr2=%v", proc.what, proc.addr1, proc.addr2)
// one iteration = one request
for {
resp := <-proc.inFlightReqChan
if resp.resp == nil {
return
}
conn := proc.loadConn()
connr := conn.conn
if connr == nil {
log.Info("%v: resp %T %+v has no conn, skipping", proc.what, resp.resp.resp, resp.resp.resp)
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, io.EOF)
continue
}
if conn.generation != resp.generation {
log.Info("%v: resp %T %+v has bad generation %v vs %v, skipping", proc.what, resp.resp.resp, resp.resp.resp, resp.generation, conn.generation)
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, io.EOF)
continue
}
log.Debug("reading block response %T for req %+v from %v->%v", resp.resp.resp, resp.resp.req, connr.LocalAddr(), connr.RemoteAddr())
// responsibility for cleaning up the connection always lies with the request processor
if err := ReadBlocksResponse(log, connr, resp.resp.resp); err != nil {
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, err)
continue
}
if resp.resp.resp.BlocksResponseKind() == msgs.FETCH_BLOCK {
req := resp.resp.req.(*msgs.FetchBlockReq)
lr := &io.LimitedReader{
R: connr,
N: int64(req.Count),
}
log.Debug("reading block body from %v->%v", connr.LocalAddr(), connr.RemoteAddr())
readBytes, err := resp.resp.additionalBodyWriter.ReadFrom(lr)
if err != nil || readBytes < int64(req.Count) {
if err == nil {
err = io.EOF
}
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, err)
continue
}
}
log.Debug("read block response %T %+v for req %v from %v->%v", resp.resp.resp, resp.resp.resp, resp.resp.req, connr.LocalAddr(), connr.RemoteAddr())
resp.resp.done(log, &proc.addr1, &proc.addr2, resp.resp.extra, nil)
}
}
type blocksProcessorKey struct {
blockServiceKey uint64
ip1 [4]byte
port1 uint16
ip2 [4]byte
port2 uint16
}
type blocksProcessors struct {
what string
// how many bits of the block service id to use for blockServiceKey
blockServiceBits uint8
// blocksProcessorKey -> *blocksProcessor
processors sync.Map
}
func (procs *blocksProcessors) init(what string) {
procs.what = what
}
type sendArgs struct {
blockService msgs.BlockServiceId
ip1 [4]byte
port1 uint16
ip2 [4]byte
port2 uint16
req msgs.BlocksRequest
reqAdditionalBody io.Reader
resp msgs.BlocksResponse
respAdditionalBody io.ReaderFrom
extra any
}
// This currently never fails (everything network-related happens in
// the processor loops); we keep the error return since it might fail in the future.
func (procs *blocksProcessors) send(
log *Logger,
args *sendArgs,
completionChan chan *BlockCompletion,
) error {
if args.port1 == 0 && args.port2 == 0 {
panic(fmt.Errorf("got zero ports for both addresses for block service %v: %v:%v %v:%v", args.blockService, args.ip1, args.port1, args.ip2, args.port2))
}
resp := &clientBlockResponse{
req: args.req,
resp: args.resp,
additionalBodyWriter: args.respAdditionalBody,
completionChan: completionChan,
extra: args.extra,
}
req := &clientBlockRequest{
blockService: args.blockService,
req: args.req,
additionalBodyReader: args.reqAdditionalBody,
resp: resp,
}
key := blocksProcessorKey{
ip1: args.ip1,
port1: args.port1,
ip2: args.ip2,
port2: args.port2,
}
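// Keep only the low blockServiceBits bits of the block service id: block
// services sharing those bits (and the same address pair) funnel through the
// same processor, bounding the number of processors -- and TCP connections --
// per address pair to 2^blockServiceBits (see the comment in
// NewClientDirectNoAddrs).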
key.blockServiceKey = uint64(args.blockService) & ((1 << uint64(procs.blockServiceBits)) - 1)
// likely case, we already have something. we could
// LoadOrStore directly but this saves us allocating new
// chans
if procAny, found := procs.processors.Load(key); found {
proc := procAny.(*blocksProcessor)
proc.reqChan <- req
return nil
}
// unlikely case, create new one
procAny, loaded := procs.processors.LoadOrStore(key, &blocksProcessor{
reqChan: make(chan *clientBlockRequest, 128),
inFlightReqChan: make(chan clientBlockResponseWithGeneration, 128),
addr1: net.TCPAddr{IP: net.IP(args.ip1[:]), Port: int(args.port1)},
addr2: net.TCPAddr{IP: net.IP(args.ip2[:]), Port: int(args.port2)},
what: procs.what,
_conn: &blocksProcessorConn{},
})
proc := procAny.(*blocksProcessor)
if !loaded {
// we're the first ones here, start the routines
go proc.processRequests(log)
go proc.processResponses(log)
}
proc.reqChan <- req
return nil
}
func (procs *blocksProcessors) close() {
procs.processors.Range(func(key, value any) bool {
value.(*blocksProcessor).reqChan <- nil
return true
})
}
type Client struct {
shardRawAddrs [256][2]uint64
cdcRawAddr [2]uint64
clientMetadata clientMetadata
counters *ClientCounters
cdcKey cipher.Block
writeBlockProcessors blocksProcessors
fetchBlockProcessors blocksProcessors
fetchBlockBufs sync.Pool
eraseBlockProcessors blocksProcessors
checkBlockProcessors blocksProcessors
shardTimeout *ReqTimeouts
cdcTimeout *ReqTimeouts
blockTimeout *ReqTimeouts
requestIdCounter uint64
}
func NewClient(
log *Logger,
shuckleTimeout *ReqTimeouts,
shuckleAddress string,
) (*Client, error) {
var shardIps [256][2][4]byte
var shardPorts [256][2]uint16
var cdcIps [2][4]byte
var cdcPorts [2]uint16
{
log.Info("Getting shard/CDC info from shuckle at '%v'", shuckleAddress)
resp, err := ShuckleRequest(log, shuckleTimeout, shuckleAddress, &msgs.ShardsReq{})
if err != nil {
return nil, fmt.Errorf("could not request shards from shuckle: %w", err)
}
shards := resp.(*msgs.ShardsResp)
for i, shard := range shards.Shards {
if shard.Port1 == 0 {
return nil, fmt.Errorf("shard %v not present in shuckle", i)
}
shardIps[i][0] = shard.Ip1
shardPorts[i][0] = shard.Port1
shardIps[i][1] = shard.Ip2
shardPorts[i][1] = shard.Port2
}
resp, err = ShuckleRequest(log, shuckleTimeout, shuckleAddress, &msgs.CdcReq{})
if err != nil {
return nil, fmt.Errorf("could not request CDC from shuckle: %w", err)
}
cdc := resp.(*msgs.CdcResp)
if cdc.Port1 == 0 {
return nil, fmt.Errorf("CDC not present in shuckle")
}
cdcIps[0] = cdc.Ip1
cdcPorts[0] = cdc.Port1
cdcIps[1] = cdc.Ip2
cdcPorts[1] = cdc.Port2
}
return NewClientDirect(log, &cdcIps, &cdcPorts, &shardIps, &shardPorts)
}
func (c *Client) SetCounters(counters *ClientCounters) {
c.counters = counters
}
func (c *Client) SetCDCKey(cdcKey cipher.Block) {
c.cdcKey = cdcKey
}
func (c *Client) SetShardTimeouts(t *ReqTimeouts) {
c.shardTimeout = t
}
func (c *Client) SetCDCTimeouts(t *ReqTimeouts) {
c.cdcTimeout = t
}
func (c *Client) SetBlockTimeout(t *ReqTimeouts) {
c.blockTimeout = t
}
// MTU used for large responses (file spans etc.)
var clientMtu uint16 = 1472
func SetMTU(mtu uint64) {
if mtu < msgs.DEFAULT_UDP_MTU {
panic(fmt.Errorf("mtu (%v) < DEFAULT_UDP_MTU (%v)", mtu, msgs.DEFAULT_UDP_MTU))
}
if mtu > msgs.MAX_UDP_MTU {
panic(fmt.Errorf("mtu (%v) > MAX_UDP_MTU (%v)", mtu, msgs.MAX_UDP_MTU))
}
clientMtu = uint16(mtu)
}
func NewClientDirectNoAddrs(
log *Logger,
) (c *Client, err error) {
c = &Client{
// randomize so we don't pick up responses meant for requests from previous executions
requestIdCounter: rand.Uint64(),
fetchBlockBufs: sync.Pool{
New: func() any {
return bytes.NewBuffer([]byte{})
},
},
}
c.shardTimeout = &DefaultShardTimeout
c.cdcTimeout = &DefaultCDCTimeout
c.blockTimeout = &DefaultBlockTimeout
if err := c.clientMetadata.init(log, c); err != nil {
return nil, err
}
// Ideally, for write/fetch we'd want to have one socket
// per block service. However we don't have flash yet, so
// it's hard to saturate a socket because of seek time.
//
// Currently we have 102 disks per server, 96 servers. 5
// bits splits the block services into 32 buckets, and the
// block service id is uniformly distributed.
//
// So we'll have roughly 30 connections per server for the first
// two, for a (current) maximum of 30*96*2 + 102*96*2 =
// 25k connections, which is within limits.
//
// (The exact expected number of connections per server is
// ~30.7447, I'll let you figure out why.)
c.writeBlockProcessors.init("write")
c.writeBlockProcessors.blockServiceBits = 5
c.fetchBlockProcessors.init("fetch")
c.fetchBlockProcessors.blockServiceBits = 5
c.eraseBlockProcessors.init("erase")
// we're not constrained by bandwidth here, we want to have requests
// for all block services in parallel.
c.eraseBlockProcessors.blockServiceBits = 63
// here we're also not constrained by bandwidth, but the requests
// take a long time, so have a separate channel from the erase ones.
c.checkBlockProcessors.init("erase")
c.checkBlockProcessors.blockServiceBits = 63
return c, nil
}
func NewClientDirect(
log *Logger,
cdcIps *[2][4]byte,
cdcPorts *[2]uint16,
shardIps *[256][2][4]byte,
shardPorts *[256][2]uint16,
) (c *Client, err error) {
c, err = NewClientDirectNoAddrs(log)
if err != nil {
return nil, err
}
c.UpdateAddrs(cdcIps, cdcPorts, shardIps, shardPorts)
return c, nil
}
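// Shard/CDC addresses are packed into a single uint64 so they can be swapped
// atomically: the low 32 bits hold the IPv4 address (most significant byte
// first) and bits 32-47 hold the port, matching the packing in UpdateAddrs.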
func uint64ToUDPAddr(addr uint64) *net.UDPAddr {
udpAddr := &net.UDPAddr{}
udpAddr.IP = []byte{byte(addr >> 24), byte(addr >> 16), byte(addr >> 8), byte(addr)}
udpAddr.Port = int(addr >> 32)
return udpAddr
}
func (c *Client) CDCAddrs() *[2]net.UDPAddr {
return &[2]net.UDPAddr{
*uint64ToUDPAddr(atomic.LoadUint64(&c.cdcRawAddr[0])),
*uint64ToUDPAddr(atomic.LoadUint64(&c.cdcRawAddr[1])),
}
}
func (c *Client) ShardAddrs(shid msgs.ShardId) *[2]net.UDPAddr {
return &[2]net.UDPAddr{
*uint64ToUDPAddr(atomic.LoadUint64(&c.shardRawAddrs[shid][0])),
*uint64ToUDPAddr(atomic.LoadUint64(&c.shardRawAddrs[shid][1])),
}
}
func (c *Client) UpdateAddrs(
cdcIps *[2][4]byte,
cdcPorts *[2]uint16,
shardIps *[256][2][4]byte,
shardPorts *[256][2]uint16,
) {
for i := 0; i < 2; i++ {
for j := 0; j < 256; j++ {
atomic.StoreUint64(
&c.shardRawAddrs[j][i],
uint64(shardPorts[j][i])<<32|uint64(shardIps[j][i][0])<<24|uint64(shardIps[j][i][1])<<16|uint64(shardIps[j][i][2])<<8|uint64(shardIps[j][i][3]),
)
}
atomic.StoreUint64(
&c.cdcRawAddr[i],
uint64(cdcPorts[i])<<32|uint64(cdcIps[i][0])<<24|uint64(cdcIps[i][1])<<16|uint64(cdcIps[i][2])<<8|uint64(cdcIps[i][3]),
)
}
}
func (c *Client) Close() {
c.clientMetadata.close()
c.writeBlockProcessors.close()
c.fetchBlockProcessors.close()
c.eraseBlockProcessors.close()
c.checkBlockProcessors.close()
}
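// A minimal lifecycle sketch, assuming the caller has a *Logger `log`, a
// shuckle address and ReqTimeouts for the shuckle request (hypothetical
// names); error handling elided:
//
//	client, err := NewClient(log, shuckleTimeouts, shuckleAddress)
//	if err != nil {
//		// handle error
//	}
//	defer client.Close()
//	id, err := client.ResolvePath(log, "/example/dir")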
// Not atomic between the read/write
func (c *Client) MergeDirectoryInfo(log *Logger, id msgs.InodeId, entry msgs.IsDirectoryInfoEntry) error {
packedEntry := msgs.DirectoryInfoEntry{
Body: bincode.Pack(entry),
Tag: entry.Tag(),
}
statResp := msgs.StatDirectoryResp{}
if err := c.ShardRequest(log, id.Shard(), &msgs.StatDirectoryReq{Id: id}, &statResp); err != nil {
return err
}
info := statResp.Info
found := false
for i := 0; i < len(info.Entries); i++ {
if info.Entries[i].Tag == packedEntry.Tag {
info.Entries[i] = packedEntry
found = true
break
}
}
if !found {
info.Entries = append(info.Entries, packedEntry)
}
if err := c.ShardRequest(log, id.Shard(), &msgs.SetDirectoryInfoReq{Id: id, Info: info}, &msgs.SetDirectoryInfoResp{}); err != nil {
return err
}
return nil
}
// Not atomic between the read/write
func (c *Client) RemoveDirectoryInfoEntry(log *Logger, id msgs.InodeId, tag msgs.DirectoryInfoTag) error {
statResp := msgs.StatDirectoryResp{}
if err := c.ShardRequest(log, id.Shard(), &msgs.StatDirectoryReq{Id: id}, &statResp); err != nil {
return err
}
info := statResp.Info
for i := 0; i < len(info.Entries); i++ {
if info.Entries[i].Tag == tag {
info.Entries = append(info.Entries[:i], info.Entries[i+1:]...)
break
}
}
if err := c.ShardRequest(log, id.Shard(), &msgs.SetDirectoryInfoReq{Id: id, Info: info}, &msgs.SetDirectoryInfoResp{}); err != nil {
return err
}
return nil
}
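// ResolveDirectoryInfoEntry walks from dirId up the Owner chain until it finds
// a directory info entry with the requested tag (reaching the root without
// finding one is an error), consulting dirInfoCache first and back-filling it
// for every directory visited along the way.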
func (client *Client) ResolveDirectoryInfoEntry(
log *Logger,
dirInfoCache *DirInfoCache,
dirId msgs.InodeId,
entry msgs.IsDirectoryInfoEntry, // output will be stored in here
) (inheritedFrom msgs.InodeId, err error) {
statReq := msgs.StatDirectoryReq{
Id: dirId,
}
statResp := msgs.StatDirectoryResp{}
visited := []msgs.InodeId{}
TraverseDirectories:
for {
inheritedFrom = dirInfoCache.LookupCachedDirInfoEntry(statReq.Id, entry)
if inheritedFrom != msgs.NULL_INODE_ID {
break
}
visited = append(visited, statReq.Id)
if err := client.ShardRequest(log, statReq.Id.Shard(), &statReq, &statResp); err != nil {
return msgs.NULL_INODE_ID, err
}
for i := len(statResp.Info.Entries) - 1; i >= 0; i-- {
respEntry := &statResp.Info.Entries[i]
if entry.Tag() == respEntry.Tag {
inheritedFrom = statReq.Id
if err := bincode.Unpack(respEntry.Body, entry); err != nil {
return msgs.NULL_INODE_ID, fmt.Errorf("could not decode dir info entry for dir %v, inherited from %v, with tag %v, body %v: %v", dirId, statReq.Id, entry.Tag(), respEntry.Body, err)
}
break TraverseDirectories
}
}
if statReq.Id == msgs.ROOT_DIR_INODE_ID {
return msgs.NULL_INODE_ID, fmt.Errorf("could not find directory info entry with tag %v %+v", entry.Tag(), statResp)
}
statReq.Id = statResp.Owner
}
for _, id := range visited {
dirInfoCache.UpdateCachedDirInfo(id, entry, inheritedFrom)
}
return inheritedFrom, nil
}
func (client *Client) ResolvePathWithParent(log *Logger, path string) (msgs.InodeId, msgs.InodeId, error) {
if !filepath.IsAbs(path) {
return msgs.NULL_INODE_ID, msgs.NULL_INODE_ID, fmt.Errorf("expected absolute path, got '%v'", path)
}
parent := msgs.NULL_INODE_ID
id := msgs.ROOT_DIR_INODE_ID
for _, segment := range strings.Split(filepath.Clean(path), "/")[1:] {
if segment == "" {
continue
}
resp := msgs.LookupResp{}
if err := client.ShardRequest(log, id.Shard(), &msgs.LookupReq{DirId: id, Name: segment}, &resp); err != nil {
return msgs.NULL_INODE_ID, msgs.NULL_INODE_ID, err
}
parent = id
id = resp.TargetId
}
return id, parent, nil
}
func (client *Client) ResolvePath(log *Logger, path string) (msgs.InodeId, error) {
id, _, err := client.ResolvePathWithParent(log, path)
return id, err
}
func writeBlockSendArgs(block *msgs.AddSpanInitiateBlockInfo, r io.Reader, size uint32, crc msgs.Crc, extra any) *sendArgs {
return &sendArgs{
block.BlockServiceId,
block.BlockServiceIp1,
block.BlockServicePort1,
block.BlockServiceIp2, block.BlockServicePort2,
&msgs.WriteBlockReq{
BlockId: block.BlockId,
Crc: crc,
Size: size,
Certificate: block.Certificate,
},
r,
&msgs.WriteBlockResp{},
nil,
extra,
}
}
func (client *Client) StartWriteBlock(log *Logger, block *msgs.AddSpanInitiateBlockInfo, r io.Reader, size uint32, crc msgs.Crc, extra any, completion chan *BlockCompletion) error {
return client.writeBlockProcessors.send(log, writeBlockSendArgs(block, r, size, crc, extra), completion)
}
func RetriableBlockError(err error) bool {
return errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.EPIPE) || errors.Is(err, syscall.ECONNRESET) || errors.Is(err, io.EOF)
}
func (client *Client) singleBlockReq(log *Logger, timeouts *ReqTimeouts, processor *blocksProcessors, args *sendArgs) (msgs.BlocksResponse, error) {
if timeouts == nil {
timeouts = client.blockTimeout
}
timeoutAlert := log.NewNCAlert(0)
defer log.ClearNC(timeoutAlert)
startedAt := time.Now()
for {
ch := make(chan *BlockCompletion, 1)
err := processor.send(log, args, ch)
if err != nil {
log.Debug("failed to send block request to %v:%v %v:%v: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
return nil, err
}
resp := <-ch
err = resp.Error
if err == nil {
return resp.Resp, nil
}
if RetriableBlockError(err) {
next := timeouts.Next(startedAt)
if next == 0 {
log.RaiseNCStack(timeoutAlert, ERROR, 2, "block request to %v:%v %v:%v failed with retriable error, will not retry since time is up: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
return nil, err
}
log.RaiseNCStack(timeoutAlert, ERROR, 2, "block request to %v:%v %v:%v failed with retriable error, might retry: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
time.Sleep(next)
} else {
return nil, err
}
}
}
func (client *Client) WriteBlock(log *Logger, timeouts *ReqTimeouts, block *msgs.AddSpanInitiateBlockInfo, r io.Reader, size uint32, crc msgs.Crc) (proof [8]byte, err error) {
resp, err := client.singleBlockReq(log, timeouts, &client.writeBlockProcessors, writeBlockSendArgs(block, r, size, crc, nil))
if err != nil {
return proof, err
}
return resp.(*msgs.WriteBlockResp).Proof, nil
}
func fetchBlockSendArgs(blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32, w io.ReaderFrom, extra any) *sendArgs {
return &sendArgs{
blockService.Id,
blockService.Ip1,
blockService.Port1,
blockService.Ip2,
blockService.Port2,
&msgs.FetchBlockReq{
BlockId: blockId,
Offset: offset,
Count: count,
},
nil,
&msgs.FetchBlockResp{},
w,
extra,
}
}
func (client *Client) StartFetchBlock(log *Logger, blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32, w io.ReaderFrom, extra any, completion chan *BlockCompletion) error {
return client.fetchBlockProcessors.send(log, fetchBlockSendArgs(blockService, blockId, offset, count, w, extra), completion)
}
func (c *Client) PutFetchedBlock(body *bytes.Buffer) {
c.fetchBlockBufs.Put(body)
}
func (client *Client) FetchBlock(log *Logger, timeouts *ReqTimeouts, blockService *msgs.BlockService, blockId msgs.BlockId, offset uint32, count uint32) (body *bytes.Buffer, err error) {
buf := client.fetchBlockBufs.Get().(*bytes.Buffer)
buf.Reset()
_, err = client.singleBlockReq(log, timeouts, &client.fetchBlockProcessors, fetchBlockSendArgs(blockService, blockId, offset, count, buf, nil))
if err != nil {
return nil, err
}
return buf, nil
}
func eraseBlockSendArgs(block *msgs.RemoveSpanInitiateBlockInfo, extra any) *sendArgs {
return &sendArgs{
block.BlockServiceId,
block.BlockServiceIp1,
block.BlockServicePort1,
block.BlockServiceIp2,
block.BlockServicePort2,
&msgs.EraseBlockReq{
BlockId: block.BlockId,
Certificate: block.Certificate,
},
nil,
&msgs.EraseBlockResp{},
nil,
extra,
}
}
func (client *Client) StartEraseBlock(log *Logger, block *msgs.RemoveSpanInitiateBlockInfo, extra any, completion chan *BlockCompletion) error {
return client.eraseBlockProcessors.send(log, eraseBlockSendArgs(block, extra), completion)
}
func (client *Client) EraseBlock(log *Logger, block *msgs.RemoveSpanInitiateBlockInfo) (proof [8]byte, err error) {
resp, err := client.singleBlockReq(log, nil, &client.eraseBlockProcessors, eraseBlockSendArgs(block, nil))
if err != nil {
return proof, err
}
return resp.(*msgs.EraseBlockResp).Proof, nil
}
func checkBlockSendArgs(blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc, extra any) *sendArgs {
return &sendArgs{
blockService.Id,
blockService.Ip1,
blockService.Port1,
blockService.Ip2,
blockService.Port2,
&msgs.CheckBlockReq{
BlockId: blockId,
Size: size,
Crc: crc,
},
nil,
&msgs.CheckBlockResp{},
nil,
extra,
}
}
func (client *Client) StartCheckBlock(log *Logger, blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc, extra any, completion chan *BlockCompletion) error {
return client.checkBlockProcessors.send(log, checkBlockSendArgs(blockService, blockId, size, crc, extra), completion)
}
func (client *Client) CheckBlock(log *Logger, blockService *msgs.BlockService, blockId msgs.BlockId, size uint32, crc msgs.Crc) error {
_, err := client.singleBlockReq(log, nil, &client.checkBlockProcessors, checkBlockSendArgs(blockService, blockId, size, crc, nil))
return err
}