mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-04-28 13:49:27 -05:00
shard/cdc: cleanup logsdb options, hostmon name match service name
This commit is contained in:
+51
-91
@@ -224,7 +224,7 @@ private:
|
||||
InFlightShardRequests _inFlightShardReqs;
|
||||
|
||||
const bool _dontDoReplication;
|
||||
std::unique_ptr<LogsDB> _logsDB;
|
||||
LogsDB _logsDB;
|
||||
std::vector<LogsDBRequest> _logsDBRequests;
|
||||
std::vector<LogsDBResponse> _logsDBResponses;
|
||||
std::vector<LogsDBRequest *> _logsDBOutRequests;
|
||||
@@ -237,25 +237,23 @@ public:
|
||||
Loop(logger, xmon, "req_server"),
|
||||
_shared(shared),
|
||||
_seenShards(false),
|
||||
_currentLogIndex(_shared.db.lastAppliedLogEntry()),
|
||||
// important to not catch stray requests from previous executions
|
||||
_shardRequestIdCounter(wyhash64_rand()),
|
||||
_shardTimeout(options.shardTimeout),
|
||||
_receiver({.maxMsgSize = MAX_UDP_MTU}),
|
||||
_cdcSender({.maxMsgSize = MAX_UDP_MTU}),
|
||||
_dontDoReplication(options.dontDoReplication)
|
||||
_dontDoReplication(options.dontDoReplication),
|
||||
_logsDB(_env,_shared.sharedDb, options.replicaId, _currentLogIndex, options.dontDoReplication, options.forceLeader, !options.forceLeader, options.forcedLastReleased)
|
||||
{
|
||||
_currentLogIndex = _shared.db.lastAppliedLogEntry();
|
||||
expandKey(CDCKey, _expandedCDCKey);
|
||||
|
||||
if (options.writeToLogsDB) {
|
||||
_logsDB.reset(new LogsDB(_env,_shared.sharedDb, options.replicaId, _currentLogIndex, options.dontDoReplication, options.forceLeader, !options.forceLeader, options.forcedLastReleased));
|
||||
if (options.forceLeader) {
|
||||
// In case of force leader it immediately detects and promotes us to leader
|
||||
_logsDB->processIncomingMessages(_logsDBRequests, _logsDBResponses);
|
||||
_shared.isLeader.store(_logsDB->isLeader(), std::memory_order_relaxed);
|
||||
_logsDBLogIndex = _logsDB->getLastReleased();
|
||||
} else {
|
||||
_shared.isLeader.store(options.replicaId == 0, std::memory_order_relaxed);
|
||||
_logsDB.processIncomingMessages(_logsDBRequests, _logsDBResponses);
|
||||
}
|
||||
_shared.isLeader.store(_logsDB.isLeader(), std::memory_order_relaxed);
|
||||
_logsDBLogIndex = _logsDB.getLastReleased();
|
||||
LOG_INFO(_env, "Waiting for shard info to be filled in");
|
||||
}
|
||||
|
||||
@@ -302,7 +300,7 @@ public:
|
||||
_recordCDCShardRespError(requestId, *resp, EggsError::TIMEOUT);
|
||||
}
|
||||
}
|
||||
auto timeout = _logsDB ? _logsDB->getNextTimeout() : ((_inFlightShardReqs.size() > 0) ? _shardTimeout : -1);
|
||||
auto timeout = _logsDB.getNextTimeout();
|
||||
// we need to process bootstrap entry
|
||||
if (unlikely(entries.size())) {
|
||||
timeout = 0;
|
||||
@@ -336,51 +334,45 @@ public:
|
||||
_inFlightLogEntries[entry.logIdx()] = std::move(entry);
|
||||
}
|
||||
}
|
||||
if (_logsDB) {
|
||||
if (_logsDB->isLeader()) {
|
||||
auto err = _logsDB->appendEntries(entries);
|
||||
ALWAYS_ASSERT(err == NO_ERROR);
|
||||
// we need to drop information about entries which might have been dropped due to append window being full
|
||||
if (unlikely(entries.size() && entries.back().idx == 0)) {
|
||||
uint64_t lastGood = 0;
|
||||
size_t i = 0;
|
||||
for (; i < entries.size(); ++i) {
|
||||
if (lastGood >= entries[i].idx.u64) {
|
||||
break;
|
||||
}
|
||||
lastGood = entries[i].idx.u64;
|
||||
}
|
||||
_logsDBLogIndex.u64 = lastGood;
|
||||
for (;i < entries.size(); ++i) {
|
||||
ALWAYS_ASSERT(entries[i].idx == 0);
|
||||
++lastGood;
|
||||
_logEntryIdxToReqInfos.erase(lastGood);
|
||||
_inFlightLogEntries.erase(lastGood);
|
||||
|
||||
if (_logsDB.isLeader()) {
|
||||
auto err = _logsDB.appendEntries(entries);
|
||||
ALWAYS_ASSERT(err == NO_ERROR);
|
||||
// we need to drop information about entries which might have been dropped due to append window being full
|
||||
if (unlikely(entries.size() && entries.back().idx == 0)) {
|
||||
uint64_t lastGood = 0;
|
||||
size_t i = 0;
|
||||
for (; i < entries.size(); ++i) {
|
||||
if (lastGood >= entries[i].idx.u64) {
|
||||
break;
|
||||
}
|
||||
lastGood = entries[i].idx.u64;
|
||||
}
|
||||
_logsDBLogIndex.u64 = lastGood;
|
||||
for (;i < entries.size(); ++i) {
|
||||
ALWAYS_ASSERT(entries[i].idx == 0);
|
||||
++lastGood;
|
||||
_logEntryIdxToReqInfos.erase(lastGood);
|
||||
_inFlightLogEntries.erase(lastGood);
|
||||
}
|
||||
entries.clear();
|
||||
}
|
||||
// Log if not active is not chatty but its messages are higher priority as they make us progress state under high load.
|
||||
// We want to have priority when sending out
|
||||
_logsDB->getOutgoingMessages(_logsDBOutRequests, _logsDBOutResponses);
|
||||
|
||||
for (auto& response : _logsDBOutResponses) {
|
||||
_packLogsDBResponse(response);
|
||||
}
|
||||
|
||||
for (auto request : _logsDBOutRequests) {
|
||||
_packLogsDBRequest(*request);
|
||||
}
|
||||
if (_dontDoReplication) {
|
||||
_logsDB->processIncomingMessages(_logsDBRequests, _logsDBResponses);
|
||||
}
|
||||
_logsDB->readEntries(entries);
|
||||
} else {
|
||||
auto currentLogIndex = _currentLogIndex;
|
||||
for (auto& entry: entries) {
|
||||
entry.idx.u64 = ++currentLogIndex;
|
||||
}
|
||||
entries.clear();
|
||||
}
|
||||
// Log if not active is not chatty but its messages are higher priority as they make us progress state under high load.
|
||||
// We want to have priority when sending out
|
||||
_logsDB.getOutgoingMessages(_logsDBOutRequests, _logsDBOutResponses);
|
||||
|
||||
for (auto& response : _logsDBOutResponses) {
|
||||
_packLogsDBResponse(response);
|
||||
}
|
||||
|
||||
for (auto request : _logsDBOutRequests) {
|
||||
_packLogsDBRequest(*request);
|
||||
}
|
||||
if (_dontDoReplication) {
|
||||
_logsDB.processIncomingMessages(_logsDBRequests, _logsDBResponses);
|
||||
}
|
||||
_logsDB.readEntries(entries);
|
||||
|
||||
// Apply replicated log entries
|
||||
for(auto& logEntry : entries) {
|
||||
@@ -423,9 +415,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (_logsDB) {
|
||||
_logsDB->flush(true);
|
||||
}
|
||||
_logsDB.flush(true);
|
||||
|
||||
_shardSender.sendMessages(_env, _shared.socks[SHARD_SOCK]);
|
||||
_cdcSender.sendMessages(_env, _shared.socks[CDC_SOCK]);
|
||||
@@ -564,9 +554,6 @@ private:
|
||||
}
|
||||
|
||||
void _processLogMessages() {
|
||||
if (!_logsDB) {
|
||||
return;
|
||||
}
|
||||
std::vector<LogsDBRequest> requests;
|
||||
std::vector<LogsDBResponse> responses;
|
||||
auto& requestMessages = _channel.protocolMessages(LOG_REQ_PROTOCOL_VERSION);
|
||||
@@ -651,7 +638,7 @@ private:
|
||||
}
|
||||
LOG_DEBUG(_env, "Received response %s with requests id %s from replica id %s", resp.header.kind, resp.header.requestId, resp.replicaId);
|
||||
}
|
||||
_logsDB->processIncomingMessages(requests, responses);
|
||||
_logsDB.processIncomingMessages(requests, responses);
|
||||
}
|
||||
|
||||
void _processCDCMessages() {
|
||||
@@ -856,30 +843,6 @@ private:
|
||||
});
|
||||
}
|
||||
|
||||
void _packResp(int sock, struct sockaddr_in* dest, const char* data, size_t len) {
|
||||
// We need to handle EAGAIN/EPERM when trying to send. Here we take a ...
|
||||
// lazy approach and just loop with a delay. This seems to happen when
|
||||
// we restart everything while under load, it's not great to block here
|
||||
// but it's probably OK to do so in those cases. We should also automatically
|
||||
// clear the alert when done with this.
|
||||
XmonNCAlert alert(1_sec);
|
||||
for (;;) {
|
||||
if (likely(sendto(sock, data, len, 0, (struct sockaddr*)&dest, sizeof(dest)) == len)) {
|
||||
break;
|
||||
}
|
||||
int err = errno;
|
||||
// Note that we get EPERM on `sendto` when nf drops packets.
|
||||
if (likely(err == EAGAIN || err == EPERM)) {
|
||||
_env.updateAlert(alert, "we got %s/%s=%s when trying to send shard message, will wait and retry", err, translateErrno(err), safe_strerror(err));
|
||||
(100_ms).sleepRetry();
|
||||
} else {
|
||||
_env.clearAlert(alert);
|
||||
throw EXPLICIT_SYSCALL_EXCEPTION(err, "sendto");
|
||||
}
|
||||
}
|
||||
_env.clearAlert(alert);
|
||||
}
|
||||
|
||||
uint64_t _advanceLogIndex() {
|
||||
return ++_currentLogIndex;
|
||||
}
|
||||
@@ -1172,13 +1135,10 @@ void runCDC(const std::string& dbDir, CDCOptions& options) {
|
||||
LOG_INFO(env, " shucklePort = %s", options.shucklePort);
|
||||
LOG_INFO(env, " cdcAddrs = %s", options.cdcAddrs);
|
||||
LOG_INFO(env, " syslog = %s", (int)options.syslog);
|
||||
if (options.writeToLogsDB) {
|
||||
LOG_INFO(env, "Using LogsDB with options:");
|
||||
LOG_INFO(env, " dontDoReplication = '%s'", (int)options.dontDoReplication);
|
||||
LOG_INFO(env, " forceLeader = '%s'", (int)options.forceLeader);
|
||||
LOG_INFO(env, " avoidBeingLeader = '%s'", (int)options.avoidBeingLeader);
|
||||
LOG_INFO(env, " forcedLastReleased = '%s'", options.forcedLastReleased);
|
||||
}
|
||||
LOG_INFO(env, "Using LogsDB with options:");
|
||||
LOG_INFO(env, " dontDoReplication = '%s'", (int)options.dontDoReplication);
|
||||
LOG_INFO(env, " forceLeader = '%s'", (int)options.forceLeader);
|
||||
LOG_INFO(env, " forcedLastReleased = '%s'", options.forcedLastReleased);
|
||||
|
||||
std::vector<std::unique_ptr<LoopThread>> threads;
|
||||
|
||||
@@ -1187,7 +1147,7 @@ void runCDC(const std::string& dbDir, CDCOptions& options) {
|
||||
XmonConfig config;
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << "eggscdc" << options.replicaId;
|
||||
ss << "eggscdc_" << options.replicaId;
|
||||
config.appInstance = ss.str();
|
||||
}
|
||||
config.appType = XmonAppType::CRITICAL;
|
||||
|
||||
@@ -24,10 +24,8 @@ struct CDCOptions {
|
||||
ReplicaId replicaId = 0;
|
||||
|
||||
// LogsDB settings
|
||||
bool writeToLogsDB = false;
|
||||
bool dontDoReplication = false;
|
||||
bool forceLeader = false;
|
||||
bool avoidBeingLeader = true;
|
||||
LogIdx forcedLastReleased = 0;
|
||||
};
|
||||
|
||||
|
||||
+8
-9
@@ -29,8 +29,8 @@ void usage(const char* binary) {
|
||||
fprintf(stderr, " Enable Xmon alerts.\n");
|
||||
fprintf(stderr, " -metrics\n");
|
||||
fprintf(stderr, " Enable metrics.\n");
|
||||
fprintf(stderr, " -use-logsdb LEADER|LEADER_NO_FOLLOWERS|FOLLOWER|NONE\n");
|
||||
fprintf(stderr, " Specify in which mode to use LogsDB, as LEADER|LEADER_NO_FOLLOWERS|FOLLOWER or don't use. Default is don't use.\n");
|
||||
fprintf(stderr, " -use-logsdb LEADER|LEADER_NO_FOLLOWERS|FOLLOWER\n");
|
||||
fprintf(stderr, " Specify in which mode to use LogsDB, as LEADER|LEADER_NO_FOLLOWERS|FOLLOWER. Default is FOLLOWER.\n");
|
||||
fprintf(stderr, " -force-last-released LogIdx\n");
|
||||
fprintf(stderr, " Force forward last released. Used for manual leader election. Can not be combined with starting in any LEADER mode\n");
|
||||
}
|
||||
@@ -185,16 +185,10 @@ int main(int argc, char** argv) {
|
||||
std::string logsDBMode = getNextArg();
|
||||
if (logsDBMode == "LEADER") {
|
||||
options.forceLeader = true;
|
||||
options.avoidBeingLeader = false;
|
||||
options.writeToLogsDB = true;
|
||||
} else if (logsDBMode == "FOLLOWER") {
|
||||
options.writeToLogsDB = true;
|
||||
} else if (logsDBMode == "LEADER_NO_FOLLOWERS") {
|
||||
options.forceLeader = true;
|
||||
options.avoidBeingLeader = false;
|
||||
options.writeToLogsDB = true;
|
||||
options.dontDoReplication = true;
|
||||
} else if (logsDBMode == "NONE") {
|
||||
} else if (logsDBMode == "FOLLOWER") {
|
||||
} else {
|
||||
fprintf(stderr, "Invalid logsDB mode %s", logsDBMode.c_str());
|
||||
dieWithUsage();
|
||||
@@ -214,6 +208,11 @@ int main(int argc, char** argv) {
|
||||
args.emplace_back("0");
|
||||
}
|
||||
|
||||
if (options.forceLeader && options.forcedLastReleased != 0) {
|
||||
fprintf(stderr, "You can not forward release point on a LEADER replica.");
|
||||
dieWithUsage();
|
||||
}
|
||||
|
||||
#ifndef EGGS_DEBUG
|
||||
if (options.logLevel <= LogLevel::LOG_TRACE) {
|
||||
die("Cannot use log level trace trace for non-debug builds (it won't work).");
|
||||
|
||||
+2
-2
@@ -1157,8 +1157,8 @@ void runShard(ShardReplicaId shrid, const std::string& dbDir, ShardOptions& opti
|
||||
XmonConfig config;
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << shrid;
|
||||
config.appInstance = "eggsshard" + ss.str();
|
||||
ss << "eggsshard" << shrid.shardId() << "_" << shrid.replicaId();
|
||||
config.appInstance = ss.str();
|
||||
}
|
||||
config.prod = options.xmonProd;
|
||||
config.appType = XmonAppType::CRITICAL;
|
||||
|
||||
Reference in New Issue
Block a user