Files
ternfs-XTXMarkets/cpp/core/Shuckle.cpp
Francesco Mazzoli 110705db8d EggsFS -> TernFS rename
Things not done because probably disruptive:

* kmod filesystem string
* sysctl/debugfs/trace
* metrics names
* xmon instance names

Some of these might be renamed too, but starting with a relatively
safe set.
2025-09-03 09:29:53 +01:00

387 lines
13 KiB
C++

#include <cstdint>
#include <fcntl.h>
#include <sstream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/time.h>
#include <array>
#include <netinet/tcp.h>
#include <unordered_map>
#include <unordered_set>
#include "Bincode.hpp"
#include "Msgs.hpp"
#include "MsgsGen.hpp"
#include "Protocol.hpp"
#include "Shuckle.hpp"
#include "Exception.hpp"
#include "Connect.hpp"
#include "Loop.hpp"
static std::string explicitGenerateErrString(const std::string& what, int err, const char* str) {
std::stringstream ss;
ss << "could not " << what << ": " << err << "/" << str;
return ss.str();
}
static std::string generateErrString(const std::string& what, int err) {
return explicitGenerateErrString(what, err, (std::string(translateErrno(err)) + "=" + safe_strerror(err)).c_str());
}
static std::pair<Sock, std::string> shuckleSock(const std::string& host, uint16_t port, Duration timeout) {
auto [sock, err] = connectToHost(host, port, timeout);
if (sock.error()) {
return {std::move(sock), err};
}
struct timeval tv;
tv.tv_sec = timeout.ns/1'000'000'000ull;
tv.tv_usec = (timeout.ns%1'000'000'000ull)/1'000;
if (setsockopt(sock.get(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
throw SYSCALL_EXCEPTION("setsockopt");
}
return {std::move(sock), ""};
}
static std::pair<int, std::string> writeShuckleRequest(int fd, const ShuckleReqContainer& req, Duration timeout) {
static_assert(std::endian::native == std::endian::little);
// Serialize
std::vector<char> buf(req.packedSize());
BincodeBuf bbuf(buf.data(), buf.size());
req.pack(bbuf);
struct pollfd pfd{.fd = fd, .events = POLLOUT};
// Write out
#define WRITE_OUT(buf, len) \
do { \
if (unlikely(Loop::poll(&pfd, 1, timeout) < 0)) { \
return {errno, generateErrString("poll socket", errno)}; \
} \
ssize_t written = write(fd, buf, len); \
if (written < 0) { \
return {errno, generateErrString("write request", errno)}; \
} \
if (written != len) { \
return {EIO, "couldn't write full request"}; \
} \
} while (false)
WRITE_OUT(&SHUCKLE_REQ_PROTOCOL_VERSION, sizeof(SHUCKLE_REQ_PROTOCOL_VERSION));
uint32_t len = bbuf.len();
WRITE_OUT(&len, sizeof(len));
WRITE_OUT(bbuf.data, bbuf.len());
#undef WRITE_OUT
return {};
}
static std::pair<int, std::string> readShuckleResponse(int fd, ShuckleRespContainer& resp, Duration timeout) {
static_assert(std::endian::native == std::endian::little);
struct pollfd pfd{.fd = fd, .events = POLLIN};
#define READ_IN(buf, count) \
do { \
ssize_t readSoFar = 0; \
while (readSoFar < count) { \
if (unlikely(Loop::poll(&pfd, 1, timeout) < 0)) { \
return {errno, generateErrString("read request", errno)}; \
} \
ssize_t r = read(fd, buf+readSoFar, count-readSoFar); \
if (r < 0) { \
return {errno, generateErrString("read response", errno)}; \
} \
if (r == 0) { \
return {EIO, "unexpected EOF"}; \
} \
readSoFar += r; \
} \
} while (0)
uint32_t protocol;
READ_IN(&protocol, sizeof(protocol));
if (protocol != SHUCKLE_RESP_PROTOCOL_VERSION) {
std::ostringstream ss;
ss << "bad shuckle protocol (expected " << SHUCKLE_RESP_PROTOCOL_VERSION << ", got " << protocol << ")";
return {EIO, ss.str()};
}
uint32_t len;
READ_IN(&len, sizeof(len));
std::vector<char> buf(len);
READ_IN(buf.data(), buf.size());
#undef READ_IN
BincodeBuf bbuf(buf.data(), buf.size());
resp.unpack(bbuf);
if (resp.kind() == ShuckleMessageKind::ERROR) {
std::stringstream ss;
ss << "got error " << resp.getError();
return {EIO, ss.str()};
}
return {};
}
std::pair<int, std::string> fetchBlockServices(const std::string& addr, uint16_t port, Duration timeout, ShardId shid, std::vector<BlockServiceDeprecatedInfo>& blockServices, std::vector<BlockServiceInfoShort>& currentBlockServices) {
blockServices.clear();
currentBlockServices.clear();
#define FAIL(err, errStr) do { blockServices.clear(); currentBlockServices.clear(); return {err, errStr}; } while (0)
auto [sock, err] = shuckleSock(addr, port, timeout);
if (sock.error()) {
return {sock.getErrno(), err};
}
// all block services
{
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setAllBlockServicesDeprecated();
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { FAIL(err, errStr); }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { FAIL(err, errStr); }
}
blockServices = respContainer.getAllBlockServicesDeprecated().blockServices.els;
}
// current block services
{
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setShardBlockServices();
req.shardId = shid;
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { FAIL(err, errStr); }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { FAIL(err, errStr); }
}
currentBlockServices = respContainer.getShardBlockServices().blockServices.els;
}
// check that all current block services are known -- there's a small race here
// the caller should just retry in these cases.
// check that all current block services are from different failure domains
// shuckle should guarantee that when sending response but verify the invariant
{
std::unordered_set<uint64_t> knownBlockServices;
std::unordered_map<uint64_t, const BlockServiceDeprecatedInfo* > bsIdToBlockService;
std::unordered_set<std::string> fdSet;
for (const auto& bs : blockServices) {
knownBlockServices.insert(bs.id.u64);
bsIdToBlockService[bs.id.u64] = &bs;
}
for (auto storageClass : {HDD_STORAGE, FLASH_STORAGE}) {
fdSet.clear();
for (BlockServiceInfoShort bs : currentBlockServices) {
if (bs.storageClass != storageClass) { continue; }
if (!knownBlockServices.contains(bs.id.u64)) {
std::stringstream ss;
ss << "got unknown block service " << bs.id << " in current block services, was probably added in the meantime, please retry";
FAIL(EIO, ss.str());
}
auto fdName = std::string((const char*)bs.failureDomain.name.data.data(), bs.failureDomain.name.data.size());
if (!fdSet.insert(fdName).second) {
std::stringstream ss;
ss << "got multiple block services in the same failure domain: " << fdName;
FAIL(EIO, ss.str());
}
}
if (fdSet.size() < 14) {
std::stringstream ss;
ss << "we need at least 14 block services per storage class but we got " << storageClass << ": " << fdSet.size();
FAIL(EIO, ss.str());
}
}
}
return {};
#undef FAIL
}
std::pair<int, std::string> registerShard(
const std::string& addr, uint16_t port, Duration timeout, ShardReplicaId shrid, uint8_t location, bool isLeader,
const AddrsInfo& addrs
) {
const auto [sock, errStr] = shuckleSock(addr, port, timeout);
if (sock.error()) {
return {sock.getErrno(), errStr};
}
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setRegisterShard();
req.shrid = shrid;
req.location = location;
req.isLeader = isLeader;
req.addrs = addrs;
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { return {err, errStr}; }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { return {err, errStr}; }
}
respContainer.getRegisterShard(); // check that the response is of the right type
return {};
}
std::pair<int, std::string> fetchShardReplicas(
const std::string& addr, uint16_t port, Duration timeout, ShardId shid, std::vector<FullShardInfo>& replicas
) {
const auto [sock, errStr] = shuckleSock(addr, port, timeout);
if (sock.error()) {
return {sock.getErrno(), errStr};
}
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setAllShards();
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { return {err, errStr}; }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { return {err, errStr}; }
}
for(auto& shard : respContainer.getAllShards().shards.els) {
if (shard.id.shardId() == shid) {
replicas.emplace_back(std::move(shard));
}
}
return {};
}
std::pair<int, std::string> registerCDCReplica(const std::string& host, uint16_t port, Duration timeout, ReplicaId replicaId, uint8_t location, bool isLeader, const AddrsInfo& addrs) {
const auto [sock, errStr] = shuckleSock(host, port, timeout);
if (sock.error()) {
return {sock.getErrno(), errStr};
}
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setRegisterCdc();
req.replica = replicaId;
req.location = location;
req.isLeader = isLeader;
req.addrs = addrs;
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { return {err, errStr}; }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { return {err, errStr}; }
}
respContainer.getRegisterCdc();
return {};
}
std::pair<int, std::string> fetchCDCReplicas(
const std::string& addr, uint16_t port, Duration timeout, std::array<AddrsInfo, 5>& replicas
) {
const auto [sock, errStr] = shuckleSock(addr, port, timeout);
if (sock.error()) {
return {sock.getErrno(), errStr};
}
ShuckleReqContainer reqContainer;
auto& req = reqContainer.setCdcReplicasDEPRECATED();
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { return {err, errStr}; }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { return {err, errStr}; }
}
if (respContainer.getCdcReplicasDEPRECATED().replicas.els.size() != replicas.size()) {
throw TERN_EXCEPTION("expecting %s replicas, got %s", replicas.size(), respContainer.getCdcReplicasDEPRECATED().replicas.els.size());
}
for (int i = 0; i < replicas.size(); i++) {
replicas[i] = respContainer.getCdcReplicasDEPRECATED().replicas.els[i];
}
return {};
}
std::pair<int, std::string> fetchLocalShards(const std::string& host, uint16_t port, Duration timeout, std::array<ShardInfo, 256>& shards) {
const auto [sock, errStr] = shuckleSock(host, port, timeout);
if (sock.error()) {
return {sock.getErrno(), errStr};
}
ShuckleReqContainer reqContainer;
reqContainer.setLocalShards();
{
const auto [err, errStr] = writeShuckleRequest(sock.get(), reqContainer, timeout);
if (err) { return {err, errStr}; }
}
ShuckleRespContainer respContainer;
{
const auto [err, errStr] = readShuckleResponse(sock.get(), respContainer, timeout);
if (err) { return {err, errStr}; }
}
if (respContainer.getLocalShards().shards.els.size() != shards.size()) {
throw TERN_EXCEPTION("expecting %s shards, got %s", shards.size(), respContainer.getLocalShards().shards.els.size());
}
for (int i = 0; i < shards.size(); i++) {
shards[i] = respContainer.getLocalShards().shards.els[i];
}
return {};
}
bool parseShuckleAddress(const std::string& fullShuckleAddress, std::string& shuckleHost, uint16_t& shucklePort) {
// split host:port
auto colon = fullShuckleAddress.find(":");
if (colon == fullShuckleAddress.size()) {
std::cerr << "Could not find colon" << std::endl;
return false;
}
// parse port
if (fullShuckleAddress.size() - colon > 6) {
return false; // port too long
}
uint32_t port = 0;
for (int i = colon + 1; i < fullShuckleAddress.size(); i++) {
char ch = fullShuckleAddress[i];
if ((i == colon + 1 && ch == '0') || ch < '0' || ch > '9') {
return false;
}
port = port*10 + (ch-'0');
}
if (port == 0 || port > 65535) {
return false;
}
shucklePort = port;
shuckleHost = {fullShuckleAddress.begin(), fullShuckleAddress.begin()+colon};
return true;
}