mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-05-11 23:02:27 -05:00
Wait forever, rather than having timeouts
The goal here is to not have constant wakeups due to timeouts. Do not attempt to clean things up nicely before termination -- just terminate instead. We can set up a proper termination system in the future; I first want to see if this makes a difference. Also, change xmon to use pipes for communication, so that it can wait without timers as well. Also, `write` directly for logging, so that we know the logs will make it to the file after the logging call returns (since we now do not have the chance to flush them afterwards).
This commit is contained in:
committed by
Francesco Mazzoli
parent
5d53959e18
commit
38f3d54ecd
+59
-59
@@ -23,7 +23,6 @@
|
||||
#include "Msgs.hpp"
|
||||
#include "Shard.hpp"
|
||||
#include "Time.hpp"
|
||||
#include "Undertaker.hpp"
|
||||
#include "CDCDB.hpp"
|
||||
#include "Crypto.hpp"
|
||||
#include "CDCKey.hpp"
|
||||
@@ -32,6 +31,7 @@
|
||||
#include "wyhash.h"
|
||||
#include "Xmon.hpp"
|
||||
#include "Timings.hpp"
|
||||
#include "PeriodicLoop.hpp"
|
||||
#include "Loop.hpp"
|
||||
#include "ErrorCount.hpp"
|
||||
|
||||
@@ -228,9 +228,6 @@ public:
|
||||
{
|
||||
_currentLogIndex = _shared.db.lastAppliedLogEntry();
|
||||
expandKey(CDCKey, _expandedCDCKey);
|
||||
}
|
||||
|
||||
virtual void init() override {
|
||||
LOG_INFO(_env, "Waiting for shard info to be filled in");
|
||||
}
|
||||
|
||||
@@ -265,8 +262,8 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// 10ms timeout for prompt termination and for shard resps timeouts
|
||||
int ret = poll(_socks.data(), _socks.size(), 10);
|
||||
LOG_DEBUG(_env, "Blocking to wait for readable sockets");
|
||||
int ret = poll(_socks.data(), _socks.size(), -1);
|
||||
if (ret < 0) {
|
||||
throw SYSCALL_EXCEPTION("poll");
|
||||
}
|
||||
@@ -275,13 +272,13 @@ public:
|
||||
// first), then the CDC reqs ones.
|
||||
for (int i = 1; i < _socks.size(); i += 2) {
|
||||
const auto& sock = _socks[i];
|
||||
if (sock.revents & POLLIN) {
|
||||
if (sock.revents & (POLLIN|POLLHUP|POLLERR)) {
|
||||
_drainShardSock(sock.fd);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < _socks.size(); i += 2) {
|
||||
const auto& sock = _socks[i];
|
||||
if (sock.revents & POLLIN) {
|
||||
if (sock.revents & (POLLIN|POLLHUP|POLLERR)) {
|
||||
_drainCDCSock(sock.fd);
|
||||
}
|
||||
}
|
||||
@@ -309,10 +306,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void finish() override {
|
||||
_shared.db.close();
|
||||
}
|
||||
|
||||
private:
|
||||
void _updateInFlightTxns() {
|
||||
_shared.inFlightTxnsWindow[_inFlightTxnsWindowCursor%_shared.inFlightTxnsWindow.size()].store(_inFlightTxns.size());
|
||||
@@ -333,7 +326,7 @@ private:
|
||||
}
|
||||
}
|
||||
if (badShard) {
|
||||
sleepFor(10_ms);
|
||||
(10_ms).sleep();
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -679,7 +672,7 @@ private:
|
||||
// Note that we get EPERM on `sendto` when nf drops packets.
|
||||
if (likely(err == EAGAIN || err == EPERM)) {
|
||||
_env.updateAlert(alert, "we got %s/%s=%s when trying to send shard message, will wait and retry", err, translateErrno(err), safe_strerror(err));
|
||||
sleepFor(100_ms);
|
||||
(100_ms).sleepRetry();
|
||||
} else {
|
||||
_env.clearAlert(alert);
|
||||
throw EXPLICIT_SYSCALL_EXCEPTION(err, "sendto");
|
||||
@@ -708,9 +701,7 @@ public:
|
||||
_shuckleHost(options.shuckleHost),
|
||||
_shucklePort(options.shucklePort),
|
||||
_alert(10_sec)
|
||||
{}
|
||||
|
||||
virtual void init() override {
|
||||
{
|
||||
_env.updateAlert(_alert, "Waiting to get shards");
|
||||
}
|
||||
|
||||
@@ -824,9 +815,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void finish() override {
|
||||
periodicStep();
|
||||
}
|
||||
// TODO restore this when we can
|
||||
// virtual void finish() override {
|
||||
// LOG_INFO(_env, "inserting stats one last time");
|
||||
// periodicStep();
|
||||
// }
|
||||
};
|
||||
|
||||
struct CDCMetricsInserter : PeriodicLoop {
|
||||
@@ -837,7 +830,7 @@ private:
|
||||
std::unordered_map<std::string, uint64_t> _rocksDBStats;
|
||||
public:
|
||||
CDCMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, CDCShared& shared):
|
||||
PeriodicLoop(logger, xmon, "metrics_inserter", {1_sec, 1.0, 1_mins, 0.1}),
|
||||
PeriodicLoop(logger, xmon, "metrics", {1_sec, 1.0, 1_mins, 0.1}),
|
||||
_shared(shared),
|
||||
_alert(10_sec)
|
||||
{}
|
||||
@@ -905,63 +898,70 @@ public:
|
||||
|
||||
|
||||
void runCDC(const std::string& dbDir, const CDCOptions& options) {
|
||||
auto undertaker = Undertaker::acquireUndertaker();
|
||||
|
||||
std::ostream* logOut = &std::cout;
|
||||
std::ofstream fileOut;
|
||||
int logOutFd = STDOUT_FILENO;
|
||||
if (!options.logFile.empty()) {
|
||||
fileOut = std::ofstream(options.logFile, std::ios::out | std::ios::app);
|
||||
if (!fileOut.is_open()) {
|
||||
throw EGGS_EXCEPTION("Could not open log file `%s'\n", options.logFile);
|
||||
logOutFd = open(options.logFile.c_str(), O_WRONLY|O_CREAT|O_APPEND, 0644);
|
||||
if (logOutFd < 0) {
|
||||
throw SYSCALL_EXCEPTION("open");
|
||||
}
|
||||
logOut = &fileOut;
|
||||
}
|
||||
Logger logger(options.logLevel, *logOut, options.syslog, true);
|
||||
Logger logger(options.logLevel, logOutFd, options.syslog, true);
|
||||
|
||||
std::shared_ptr<XmonAgent> xmon;
|
||||
if (options.xmon) {
|
||||
xmon = std::make_shared<XmonAgent>();
|
||||
}
|
||||
|
||||
{
|
||||
Env env(logger, xmon, "startup");
|
||||
LOG_INFO(env, "Running CDC with options:");
|
||||
LOG_INFO(env, " level = %s", options.logLevel);
|
||||
LOG_INFO(env, " logFile = '%s'", options.logFile);
|
||||
LOG_INFO(env, " port = %s", options.port);
|
||||
LOG_INFO(env, " shuckleHost = '%s'", options.shuckleHost);
|
||||
LOG_INFO(env, " shucklePort = %s", options.shucklePort);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
LOG_INFO(env, " port%s = %s", i+1, options.ipPorts[0].port);
|
||||
{
|
||||
char ip[INET_ADDRSTRLEN];
|
||||
uint32_t ipN = options.ipPorts[i].ip;
|
||||
LOG_INFO(env, " ownIp%s = %s", i+1, inet_ntop(AF_INET, &ipN, ip, INET_ADDRSTRLEN));
|
||||
}
|
||||
Env env(logger, xmon, "startup");
|
||||
LOG_INFO(env, "Running CDC with options:");
|
||||
LOG_INFO(env, " level = %s", options.logLevel);
|
||||
LOG_INFO(env, " logFile = '%s'", options.logFile);
|
||||
LOG_INFO(env, " port = %s", options.port);
|
||||
LOG_INFO(env, " shuckleHost = '%s'", options.shuckleHost);
|
||||
LOG_INFO(env, " shucklePort = %s", options.shucklePort);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
LOG_INFO(env, " port%s = %s", i+1, options.ipPorts[0].port);
|
||||
{
|
||||
char ip[INET_ADDRSTRLEN];
|
||||
uint32_t ipN = options.ipPorts[i].ip;
|
||||
LOG_INFO(env, " ownIp%s = %s", i+1, inet_ntop(AF_INET, &ipN, ip, INET_ADDRSTRLEN));
|
||||
}
|
||||
LOG_INFO(env, " syslog = %s", (int)options.syslog);
|
||||
}
|
||||
LOG_INFO(env, " syslog = %s", (int)options.syslog);
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
// xmon first, so that by the time it shuts down it'll have all the leftover requests
|
||||
if (xmon) {
|
||||
XmonConfig config;
|
||||
config.appInstance = "eggscdc";
|
||||
config.appType = "restech_eggsfs.critical";
|
||||
config.prod = options.xmonProd;
|
||||
Xmon::spawn(*undertaker, std::make_unique<Xmon>(logger, xmon, config));
|
||||
threads.emplace_back([&logger, xmon, &options]() mutable {
|
||||
XmonConfig config;
|
||||
config.appInstance = "eggscdc";
|
||||
config.appType = "restech_eggsfs.critical";
|
||||
config.prod = options.xmonProd;
|
||||
Xmon(logger, xmon, config).run();
|
||||
});
|
||||
}
|
||||
|
||||
CDCDB db(logger, xmon, dbDir);
|
||||
auto shared = std::make_unique<CDCShared>(db);
|
||||
CDCShared shared(db);
|
||||
|
||||
Loop::spawn(*undertaker, std::make_unique<CDCServer>(logger, xmon, options, *shared));
|
||||
|
||||
Loop::spawn(*undertaker, std::make_unique<CDCShardUpdater>(logger, xmon, options, *shared));
|
||||
Loop::spawn(*undertaker, std::make_unique<CDCRegisterer>(logger, xmon, options, *shared));
|
||||
Loop::spawn(*undertaker, std::make_unique<CDCStatsInserter>(logger, xmon, options, *shared));
|
||||
threads.emplace_back([&logger, xmon, &options, &shared]() mutable {
|
||||
CDCServer(logger, xmon, options, shared).run();
|
||||
});
|
||||
threads.emplace_back([&logger, xmon, &options, &shared]() mutable {
|
||||
CDCShardUpdater(logger, xmon, options, shared).run();
|
||||
});
|
||||
threads.emplace_back([&logger, xmon, &options, &shared]() mutable {
|
||||
CDCRegisterer(logger, xmon, options, shared).run();
|
||||
});
|
||||
threads.emplace_back([&logger, xmon, &options, &shared]() mutable {
|
||||
CDCStatsInserter(logger, xmon, options, shared).run();
|
||||
});
|
||||
if (options.metrics) {
|
||||
Loop::spawn(*undertaker, std::make_unique<CDCMetricsInserter>(logger, xmon, *shared));
|
||||
threads.emplace_back([&logger, xmon, &shared]() mutable {
|
||||
CDCMetricsInserter(logger, xmon, shared).run();
|
||||
});
|
||||
}
|
||||
|
||||
undertaker->reap();
|
||||
for (;;) { sleep(60*60*24); }
|
||||
}
|
||||
|
||||
+14
-16
@@ -1370,20 +1370,22 @@ struct CDCDBImpl {
|
||||
_initDb();
|
||||
}
|
||||
|
||||
void close() {
|
||||
~CDCDBImpl() {
|
||||
LOG_INFO(_env, "destroying column families and closing database");
|
||||
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_defaultCf));
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_reqQueueCfLegacy));
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_parentCf));
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_enqueuedCf));
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_executingCf));
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->DestroyColumnFamilyHandle(_dirsToTxnsCf));
|
||||
const auto gentleRocksDBChecked = [this](const std::string& what, rocksdb::Status status) {
|
||||
if (!status.ok()) {
|
||||
LOG_INFO(_env, "Could not %s: %s", what, status.ToString());
|
||||
}
|
||||
};
|
||||
gentleRocksDBChecked("destroy default CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_defaultCf));
|
||||
gentleRocksDBChecked("destroy req queue CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_reqQueueCfLegacy));
|
||||
gentleRocksDBChecked("destroy parent CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_parentCf));
|
||||
gentleRocksDBChecked("destroy enqueued CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_enqueuedCf));
|
||||
gentleRocksDBChecked("destroy executing CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_executingCf));
|
||||
gentleRocksDBChecked("destroy dirs to txns CF", _dbDontUseDirectly->DestroyColumnFamilyHandle(_dirsToTxnsCf));
|
||||
|
||||
ROCKS_DB_CHECKED(_dbDontUseDirectly->Close());
|
||||
}
|
||||
|
||||
~CDCDBImpl() {
|
||||
gentleRocksDBChecked("close DB", _dbDontUseDirectly->Close());
|
||||
delete _dbDontUseDirectly;
|
||||
}
|
||||
|
||||
@@ -1528,7 +1530,7 @@ struct CDCDBImpl {
|
||||
}
|
||||
if (likely(status.IsTryAgain())) {
|
||||
_env.updateAlert(alert, "got try again in CDC transaction, will sleep for a second and try again");
|
||||
sleepFor(1_sec);
|
||||
(1_sec).sleepRetry();
|
||||
continue;
|
||||
}
|
||||
// We don't expect any other kind of error. The docs state:
|
||||
@@ -1916,10 +1918,6 @@ CDCDB::CDCDB(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const std::string
|
||||
_impl = new CDCDBImpl(logger, xmon, path);
|
||||
}
|
||||
|
||||
void CDCDB::close() {
|
||||
((CDCDBImpl*)_impl)->close();
|
||||
}
|
||||
|
||||
CDCDB::~CDCDB() {
|
||||
delete ((CDCDBImpl*)_impl);
|
||||
}
|
||||
|
||||
@@ -85,7 +85,6 @@ public:
|
||||
|
||||
CDCDB(Logger& env, std::shared_ptr<XmonAgent>& xmon, const std::string& path);
|
||||
~CDCDB();
|
||||
void close();
|
||||
|
||||
// Unlike with ShardDB, we don't have an explicit log preparation step here,
|
||||
// because at least for now logs are simply either CDC requests, or shard
|
||||
|
||||
@@ -6,27 +6,6 @@
|
||||
#include <unistd.h>
|
||||
#include <iomanip>
|
||||
|
||||
// Throwing in static initialization is nasty, and there is no useful stacktrace
|
||||
// Also use direct syscalls to write the error as iostream might not be initialized
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=28954
|
||||
// Writes `err` followed by a newline to stderr using raw `write` calls (no
// iostream involved), then terminates the process with `_exit(1)`.
// This function never returns.
void dieWithError(const char *err) {
    size_t remaining = strlen(err);
    while (remaining > 0) {
        // `write` returns a signed count; keep it signed so the failure check
        // does not depend on -1 wrapping around in an unsigned type.
        const ssize_t written = write(STDERR_FILENO, err, remaining);
        if (written < 0) {
            if (errno == EINTR) {
                continue; // interrupted, retry the same chunk
            }
            _exit(1); // Can't write remaining error but nothing else can be done..
        }
        // A short write is fine: advance past what went out and keep going.
        remaining -= static_cast<size_t>(written);
        err += written;
    }
    // Best effort trailing newline; the result is deliberately not acted upon
    // since we are exiting right away either way.
    [[maybe_unused]] const ssize_t newlineRes = write(STDERR_FILENO, "\n", 1);
    _exit(1);
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, struct sockaddr_in& addr) {
|
||||
char buf[INET_ADDRSTRLEN];
|
||||
inet_ntop(AF_INET, &addr.sin_addr, buf, sizeof(buf));
|
||||
|
||||
@@ -6,9 +6,6 @@
|
||||
// Static init for globals
|
||||
///////////////////////////////////
|
||||
|
||||
// Safer alternative to throwing when in static initialization or signal handler
|
||||
void dieWithError(const char *err);
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Compiler hints
|
||||
////////////////////////////////////////////
|
||||
|
||||
+1
-1
@@ -76,4 +76,4 @@ void tearDownLoggerSignalHandler(void* logger) {
|
||||
if (sigaction(SIGUSR2, &act, NULL) < 0) {
|
||||
throw SYSCALL_EXCEPTION("sigaction");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+22
-19
@@ -28,13 +28,13 @@ struct Logger {
|
||||
private:
|
||||
LogLevel _logLevel;
|
||||
LogLevel _savedLogLevel;
|
||||
std::ostream& _out;
|
||||
int _fd;
|
||||
std::mutex _mutex;
|
||||
bool _syslog;
|
||||
bool _usr2ToDebug;
|
||||
public:
|
||||
Logger(LogLevel logLevel, std::ostream& out, bool syslog, bool usr2ToDebug):
|
||||
_logLevel(logLevel), _savedLogLevel(logLevel), _out(out), _syslog(syslog), _usr2ToDebug(usr2ToDebug)
|
||||
Logger(LogLevel logLevel, int fd, bool syslog, bool usr2ToDebug):
|
||||
_logLevel(logLevel), _savedLogLevel(logLevel), _fd(fd), _syslog(syslog), _usr2ToDebug(usr2ToDebug)
|
||||
{
|
||||
if (usr2ToDebug) {
|
||||
installLoggerSignalHandler(this);
|
||||
@@ -49,10 +49,10 @@ public:
|
||||
|
||||
template<typename ...Args>
|
||||
void _log(LogLevel level, const std::string& prefix, const char* fmt, Args&&... args) {
|
||||
std::stringstream ss;
|
||||
format_pack(ss, fmt, args...);
|
||||
std::stringstream formatSs;
|
||||
std::stringstream outSs;
|
||||
format_pack(formatSs, fmt, args...);
|
||||
std::string line;
|
||||
std::scoped_lock lock(_mutex);
|
||||
if (likely(_syslog)) {
|
||||
int syslogLevel;
|
||||
switch (level) {
|
||||
@@ -66,13 +66,25 @@ public:
|
||||
default:
|
||||
syslogLevel = 3; break; // should be throw?
|
||||
}
|
||||
while (std::getline(ss, line)) {
|
||||
_out << "<" << syslogLevel << ">" << prefix << ": " << line << std::endl;
|
||||
while (std::getline(formatSs, line)) {
|
||||
outSs << "<" << syslogLevel << ">" << prefix << ": " << line << std::endl;
|
||||
}
|
||||
} else {
|
||||
auto t = eggsNow();
|
||||
while (std::getline(ss, line)) {
|
||||
_out << t << " " << prefix << " [" << level << "] " << line << std::endl;
|
||||
while (std::getline(formatSs, line)) {
|
||||
outSs << t << " " << prefix << " [" << level << "] " << line << std::endl;
|
||||
}
|
||||
}
|
||||
{
    // Hold the lock for the whole message so that output produced by
    // concurrent callers is never interleaved.
    std::scoped_lock lock(_mutex);
    std::string_view v = outSs.view();
    // `write` may accept fewer bytes than requested. Drop what was written
    // from the front of the view and continue with the rest, rather than
    // re-sending the buffer from the start (which would duplicate output and
    // could push the remaining count below zero, looping forever).
    while (!v.empty()) {
        ssize_t res = write(_fd, v.data(), v.size());
        if (res < 0) {
            throw SYSCALL_EXCEPTION("write");
        }
        v.remove_prefix(static_cast<size_t>(res));
    }
}
|
||||
}
|
||||
@@ -81,11 +93,6 @@ public:
|
||||
return level >= _logLevel;
|
||||
}
|
||||
|
||||
void flush() {
|
||||
std::scoped_lock lock(_mutex);
|
||||
_out.flush();
|
||||
}
|
||||
|
||||
void toggleDebug();
|
||||
};
|
||||
|
||||
@@ -135,10 +142,6 @@ public:
|
||||
bool _shouldLog(LogLevel level) {
|
||||
return _logger._shouldLog(level);
|
||||
}
|
||||
|
||||
void flush() {
|
||||
_logger.flush();
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EGGS_DEBUG
|
||||
|
||||
@@ -43,7 +43,6 @@ private:
|
||||
std::string _msg;
|
||||
};
|
||||
|
||||
|
||||
class FatalException : public AbstractException {
|
||||
public:
|
||||
template <typename ... Args>
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
#include "Loop.hpp"
|
||||
|
||||
void* runLoop(void* server) {
|
||||
((Loop*)server)->run();
|
||||
return nullptr;
|
||||
}
|
||||
+15
-89
@@ -1,106 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include "Undertaker.hpp"
|
||||
#include "Stopper.hpp"
|
||||
#include <pthread.h>
|
||||
|
||||
#include "Env.hpp"
|
||||
#include "wyhash.h"
|
||||
#include "Exception.hpp"
|
||||
|
||||
void* runLoop(void*);
|
||||
|
||||
struct Loop : Undertaker::Reapable {
|
||||
struct Loop {
|
||||
protected:
|
||||
Env _env;
|
||||
Stopper _stopper;
|
||||
|
||||
private:
|
||||
std::string _name;
|
||||
|
||||
public:
|
||||
Loop(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const std::string& name) : _env(logger, xmon, name), _name(name) {}
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
|
||||
virtual ~Loop() = default;
|
||||
|
||||
virtual void terminate() override {
|
||||
_env.flush();
|
||||
_stopper.stop();
|
||||
// Sets up the loop's Env and names the *calling* thread after the loop, so
// the object is expected to be constructed on the thread that will run it.
// Throws if the thread name cannot be set. On Linux the name is limited to 16
// bytes including the terminator; longer names are rejected with ERANGE.
Loop(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const std::string& name) : _env(logger, xmon, name), _name(name) {
    // pthread_setname_np reports failure through its return value, not errno,
    // hence the explicit error number passed to the exception.
    int ret = pthread_setname_np(pthread_self(), name.c_str());
    if (ret != 0) {
        throw EXPLICIT_SYSCALL_EXCEPTION(ret, "pthread_setname_np %s", name);
    }
}
|
||||
|
||||
virtual void onAbort() override {
|
||||
_env.flush();
|
||||
const std::string& name() const {
|
||||
return _name;
|
||||
}
|
||||
|
||||
virtual void init() {};
|
||||
virtual void step() = 0;
|
||||
virtual void finish() {};
|
||||
|
||||
virtual void run() {
|
||||
init();
|
||||
for (;;) {
|
||||
if (_stopper.shouldStop()) {
|
||||
LOG_INFO(_env, "got told to stop, stopping");
|
||||
finish();
|
||||
_stopper.stopDone();
|
||||
return;
|
||||
}
|
||||
step();
|
||||
}
|
||||
}
|
||||
|
||||
static void spawn(Undertaker& undertaker, std::unique_ptr<Loop> loop) {
|
||||
std::string name = loop->name();
|
||||
pthread_t tid;
|
||||
if (pthread_create(&tid, nullptr, &runLoop, &*loop) != 0) {
|
||||
throw SYSCALL_EXCEPTION("pthread_create");
|
||||
}
|
||||
undertaker.checkin(std::move(loop), tid, name);
|
||||
void run() {
|
||||
for (;;) { step(); }
|
||||
}
|
||||
};
|
||||
|
||||
struct PeriodicLoopConfig {
|
||||
Duration quantum = 10_ms;
|
||||
Duration failureInterval; // waiting between failures
|
||||
double failureIntervalJitter = 1.0;
|
||||
Duration successInterval; // waiting between successes
|
||||
double successIntervalJitter = 1.0;
|
||||
|
||||
PeriodicLoopConfig(Duration failureInterval_, Duration successInterval_) : failureInterval(failureInterval_), successInterval(successInterval_) {}
|
||||
PeriodicLoopConfig(Duration failureInterval_, double failureIntervalJitter_, Duration successInterval_, double successIntervalJitter_) :
|
||||
failureInterval(failureInterval_), failureIntervalJitter(failureIntervalJitter_), successInterval(successInterval_), successIntervalJitter(successIntervalJitter_)
|
||||
{}
|
||||
};
|
||||
|
||||
struct PeriodicLoop : Loop {
|
||||
private:
|
||||
PeriodicLoopConfig _config;
|
||||
uint64_t _rand;
|
||||
EggsTime _nextStepAt;
|
||||
public:
|
||||
PeriodicLoop(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const std::string& name, const PeriodicLoopConfig& config) :
|
||||
Loop(logger, xmon, name),
|
||||
_config(config),
|
||||
_rand(eggsNow().ns),
|
||||
_nextStepAt(0)
|
||||
{}
|
||||
|
||||
// true = success, false = failure
|
||||
virtual bool periodicStep() = 0;
|
||||
|
||||
virtual void step() override {
|
||||
EggsTime t = eggsNow();
|
||||
if (t < _nextStepAt) {
|
||||
sleepFor(_config.quantum);
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_DEBUG(_env, "we're past %s, running periodic step", _nextStepAt);
|
||||
|
||||
bool success = periodicStep();
|
||||
if (success) {
|
||||
_nextStepAt = t + _config.successInterval + Duration((double)_config.successInterval.ns * (_config.successIntervalJitter * wyhash64_double(&_rand)));
|
||||
LOG_DEBUG(_env, "periodic step succeeded, next step at %s", _nextStepAt);
|
||||
} else {
|
||||
_nextStepAt = t + _config.failureInterval + Duration((double)_config.failureInterval.ns * (_config.failureIntervalJitter * wyhash64_double(&_rand)));
|
||||
LOG_DEBUG(_env, "periodic step failed, next step at %s", _nextStepAt);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,51 @@
|
||||
#pragma once
|
||||
|
||||
#include "Time.hpp"
|
||||
#include "Env.hpp"
|
||||
#include "Loop.hpp"
|
||||
#include "wyhash.h"
|
||||
|
||||
struct PeriodicLoopConfig {
|
||||
Duration failureInterval; // waiting between failures
|
||||
double failureIntervalJitter = 1.0;
|
||||
Duration successInterval; // waiting between successes
|
||||
double successIntervalJitter = 1.0;
|
||||
|
||||
PeriodicLoopConfig(Duration failureInterval_, Duration successInterval_) : failureInterval(failureInterval_), successInterval(successInterval_) {}
|
||||
PeriodicLoopConfig(Duration failureInterval_, double failureIntervalJitter_, Duration successInterval_, double successIntervalJitter_) :
|
||||
failureInterval(failureInterval_), failureIntervalJitter(failureIntervalJitter_), successInterval(successInterval_), successIntervalJitter(successIntervalJitter_)
|
||||
{}
|
||||
};
|
||||
|
||||
struct PeriodicLoop : Loop {
|
||||
private:
|
||||
PeriodicLoopConfig _config;
|
||||
uint64_t _rand;
|
||||
bool _lastSucceded;
|
||||
|
||||
public:
|
||||
PeriodicLoop(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const std::string& name, const PeriodicLoopConfig& config) :
|
||||
Loop(logger, xmon, name),
|
||||
_config(config),
|
||||
_rand(eggsNow().ns),
|
||||
_lastSucceded(false)
|
||||
{}
|
||||
|
||||
// true = success, false = failure
|
||||
virtual bool periodicStep() = 0;
|
||||
|
||||
// We sleep first to immediately introduce a jitter.
|
||||
virtual void step() override {
|
||||
auto t = eggsNow();
|
||||
Duration pause;
|
||||
if (_lastSucceded) {
|
||||
pause = _config.successInterval + Duration((double)_config.successInterval.ns * (_config.successIntervalJitter * wyhash64_double(&_rand)));
|
||||
LOG_DEBUG(_env, "periodic step succeeded, next step at %s", t + pause);
|
||||
} else {
|
||||
pause = _config.failureInterval + Duration((double)_config.failureInterval.ns * (_config.failureIntervalJitter * wyhash64_double(&_rand)));
|
||||
LOG_DEBUG(_env, "periodic step failed, next step at %s", t + pause);
|
||||
}
|
||||
pause.sleepRetry();
|
||||
_lastSucceded = periodicStep();
|
||||
}
|
||||
};
|
||||
+27
-2
@@ -1,5 +1,4 @@
|
||||
// #include <bits/types/struct_timespec.h>
|
||||
#include <cstdio>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
@@ -25,6 +24,32 @@ std::ostream& operator<<(std::ostream& out, Duration d) {
|
||||
return out;
|
||||
}
|
||||
|
||||
void Duration::sleepRetry() const {
|
||||
struct timespec ts = timespec();
|
||||
for (;;) {
|
||||
int ret = nanosleep(&ts, &ts);
|
||||
if (likely(ret == 0)) {
|
||||
return;
|
||||
}
|
||||
if (likely(errno == EINTR)) {
|
||||
continue;
|
||||
}
|
||||
throw SYSCALL_EXCEPTION("nanosleep");
|
||||
}
|
||||
}
|
||||
|
||||
// Sleeps once, without retrying. Returns a zero duration when the sleep ran
// to completion, or the time nanosleep reported as still remaining when it
// was interrupted (EINTR). Throws on any other nanosleep failure.
Duration Duration::sleep() const {
    struct timespec left = timespec();
    if (nanosleep(&left, &left) != 0) {
        if (likely(errno == EINTR)) {
            return Duration(left);
        }
        throw SYSCALL_EXCEPTION("nanosleep");
    }
    return Duration();
}
|
||||
|
||||
__attribute__((constructor))
|
||||
static void checkClockRes() {
|
||||
struct timespec ts;
|
||||
|
||||
+23
-5
@@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <time.h>
|
||||
|
||||
#include "Common.hpp"
|
||||
#include "Bincode.hpp"
|
||||
|
||||
@@ -8,6 +10,7 @@ struct Duration {
|
||||
|
||||
constexpr Duration(): ns(0) {}
|
||||
constexpr Duration(int64_t ns_): ns(ns_) {}
|
||||
constexpr Duration(const struct timespec& ts): ns(ts.tv_sec*1'000'000'000ull + ts.tv_nsec) {}
|
||||
|
||||
bool operator==(Duration rhs) const {
|
||||
return ns == rhs.ns;
|
||||
@@ -29,13 +32,30 @@ struct Duration {
|
||||
return ns <= rhs.ns;
|
||||
}
|
||||
|
||||
Duration operator+(Duration d) {
|
||||
Duration operator+(Duration d) const {
|
||||
return Duration(ns + d.ns);
|
||||
}
|
||||
|
||||
Duration operator-(Duration d) {
|
||||
Duration operator*(int64_t x) const {
|
||||
return Duration(ns * x);
|
||||
}
|
||||
|
||||
Duration operator-(Duration d) const {
|
||||
return ns - d.ns;
|
||||
}
|
||||
|
||||
// Splits the nanosecond count into whole seconds and leftover nanoseconds.
// NOTE(review): the divisor is unsigned, so this looks like it is only
// meaningful for non-negative durations -- confirm against callers.
struct timespec timespec() const {
    constexpr auto nsPerSec = 1'000'000'000ull;
    struct timespec out;
    out.tv_nsec = ns % nsPerSec;
    out.tv_sec = ns / nsPerSec;
    return out;
}
|
||||
|
||||
// sleeps, returns a non-zero duration if we were interrupted
|
||||
Duration sleep() const;
|
||||
|
||||
// sleeps, retrying if we get EINTR
|
||||
void sleepRetry() const;
|
||||
};
|
||||
|
||||
constexpr Duration operator "" _ns (unsigned long long t) { return Duration(t); }
|
||||
@@ -97,6 +117,4 @@ struct EggsTime {
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, EggsTime t);
|
||||
|
||||
EggsTime eggsNow();
|
||||
|
||||
void sleepFor(Duration dt);
|
||||
EggsTime eggsNow();
|
||||
@@ -1,105 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <pthread.h>
|
||||
#include <memory>
|
||||
#include <signal.h>
|
||||
#include <list>
|
||||
#include <mutex>
|
||||
|
||||
#include "SBRMUnix.hpp"
|
||||
|
||||
#if 0
|
||||
...
|
||||
;::::;
|
||||
;::::; :;
|
||||
;:::::/ :;
|
||||
;:::::; ;.
|
||||
,:::::/ ; OOO\
|
||||
::::::; ; OOOOO\
|
||||
;:::::; ; OOOOOOOO
|
||||
,;::::::; ;/ / OOOOOOO
|
||||
;:::::::::`. ,,,;. / / DOOOOOO
|
||||
./;:::::::::::::::::;, / / DOOOO
|
||||
,::::::;::::::;;;;::::;, / / DOOO
|
||||
;`::::::`/::::::;;;::::: ,#/ / DOOO
|
||||
:`:::::::`;::::::;;::: ;::# / DOOO
|
||||
::`:::::::`;:::::::: ;::::# / DOO
|
||||
`:`:::::::`;:::::: ;::::::#/ DOO
|
||||
:::`:::::::`;; ;:::::::::## OO
|
||||
::::`:::::::`;::::::::;:::# OO
|
||||
`:::::`::::::::::::;/`:;::# O
|
||||
`:::::`::::::::;/ / / `:#
|
||||
#endif
|
||||
|
||||
/**
|
||||
* The undertaker provides signal management, reaps offending AppThreads and tidies up the mess.
|
||||
*
|
||||
* Essentially two ways of using this:
|
||||
* 1. Outside of the App framework:
|
||||
* just call Undertaker::configureErrorHandlers() at the beginning of main() and be done.
|
||||
* 2. Within the App framework:
|
||||
* instanciate an Undertaker, register Reapable AppThreads and spin them out; then call reap().
|
||||
*
|
||||
* If a critical/terminate signal is delivered to the undertaker it will:
|
||||
* 1. call cleanup() on the offending Reapable
|
||||
* 2. destroy all other registered Reapables
|
||||
* 3. exit/abort
|
||||
*/
|
||||
|
||||
class Undertaker {
|
||||
public:
|
||||
/**
|
||||
* Something (like an AppThread) that can be harvested
|
||||
* NOTE: the contract on these methods is that they _MUST_ be thread safe
|
||||
*/
|
||||
class Reapable {
|
||||
public:
|
||||
virtual ~Reapable();
|
||||
|
||||
// ask for a clean termination
|
||||
virtual void terminate() = 0;
|
||||
// last chance to run abort handlers before going down hard
|
||||
virtual void onAbort() = 0;
|
||||
};
|
||||
|
||||
// Two methods of setting up the undertaker - can only call one once per application
|
||||
static Undertaker * acquireUndertaker();
|
||||
static void configureErrorHandlers();
|
||||
|
||||
// checkin with the undertaker: register a Reapable instance running on this thread. NOTE: cleanup order is in reverse to checkin order
|
||||
void checkin(std::unique_ptr<Reapable> r, pthread_t threadId, const std::string& threadName);
|
||||
// block until either a thread exits or a signal is received: does not return if a critical signal is received
|
||||
void reap(size_t reapAfter = 1);
|
||||
|
||||
// callbacks for signals handlers
|
||||
|
||||
// run abort handlers for all threads
|
||||
void onAbort();
|
||||
// try to terminate normally at own convenience
|
||||
void onExit();
|
||||
|
||||
static bool isTerminating();
|
||||
|
||||
static void handleCriticalSignal(int signum, siginfo_t *info, void *ucontext);
|
||||
|
||||
private:
|
||||
Undertaker();
|
||||
~Undertaker();
|
||||
|
||||
struct ReapableThread {
|
||||
ReapableThread(pthread_t tid, const std::string& name, std::unique_ptr<Reapable> r);
|
||||
ReapableThread();
|
||||
|
||||
pthread_t _tid;
|
||||
bool _joined = false;
|
||||
std::string _name;
|
||||
std::unique_ptr<Reapable> _reapable;
|
||||
};
|
||||
typedef std::list<ReapableThread> Condemned;
|
||||
|
||||
FDHolder _evtfd;
|
||||
std::mutex _lock;
|
||||
Condemned _condemned;
|
||||
pthread_t _reaperTid;
|
||||
};
|
||||
|
||||
+244
-165
@@ -1,8 +1,12 @@
|
||||
#include <time.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <bit>
|
||||
#include <sys/socket.h>
|
||||
#include <unordered_set>
|
||||
#include <sys/timerfd.h>
|
||||
|
||||
#include "Env.hpp"
|
||||
#include "Exception.hpp"
|
||||
#include "Time.hpp"
|
||||
#include "Xmon.hpp"
|
||||
@@ -30,7 +34,7 @@ Xmon::Xmon(
|
||||
std::shared_ptr<XmonAgent>& agent,
|
||||
const XmonConfig& config
|
||||
) :
|
||||
_env(logger, agent, "xmon"),
|
||||
Loop(logger, agent, "xmon"),
|
||||
_agent(agent),
|
||||
_appType(config.appType),
|
||||
_xmonHost(config.prod ? "REDACTED" : "REDACTED"),
|
||||
@@ -49,11 +53,56 @@ Xmon::Xmon(
|
||||
_hostname = _hostname.substr(0, _hostname.find("."));
|
||||
}
|
||||
_appInstance = config.appInstance + "@" + _hostname;
|
||||
|
||||
// xmon socket
|
||||
_fds[SOCK_FD].fd = -1;
|
||||
_fds[SOCK_FD].events = POLLIN;
|
||||
// xmon agent read end
|
||||
_fds[PIPE_FD].fd = agent->readFd();
|
||||
_fds[PIPE_FD].events = POLLIN;
|
||||
// timer fd
|
||||
_fds[TIMER_FD].fd = timerfd_create(CLOCK_MONOTONIC, 0);
|
||||
if (_fds[TIMER_FD].fd < 0) {
|
||||
throw SYSCALL_EXCEPTION("timerfd_create");
|
||||
}
|
||||
_fds[TIMER_FD].events = POLLIN;
|
||||
|
||||
// arm initial timer
|
||||
{
|
||||
auto now = eggsNow();
|
||||
_timerExpiresAt = -1;
|
||||
_ensureTimer(now, now);
|
||||
}
|
||||
}
|
||||
|
||||
const uint8_t HEARTBEAT_INTERVAL_SECS = 5; // 5 seconds
|
||||
// Closes the descriptors opened by this object: the xmon socket (only if one
// is currently open) and the timer fd. The PIPE_FD entry is not closed here;
// it is obtained from the agent's readFd().
// Failures are reported on stderr rather than thrown, since this is a destructor.
Xmon::~Xmon() {
    // Index with SOCK_FD for both the check and the close so the two can
    // never refer to different slots.
    if (_fds[SOCK_FD].fd >= 0) {
        if (close(_fds[SOCK_FD].fd) < 0) {
            std::cerr << "Could not close xmon socket (" << errno << ")" << std::endl;
        }
    }
    if (close(_fds[TIMER_FD].fd) < 0) {
        std::cerr << "Could not close timer fd (" << errno << ")" << std::endl;
    }
}
|
||||
|
||||
void Xmon::packLogon(XmonBuf& buf) {
|
||||
// Makes sure the timer fd fires no later than `t`. `now` is the current time
// and is used to turn the absolute deadline into a relative timeout.
// Throws if timerfd_settime fails.
void Xmon::_ensureTimer(EggsTime now, EggsTime t) {
    // The timer is already due at or before `t`: leave it alone.
    if (_timerExpiresAt <= t) {
        return;
    }
    // Never arm with zero: an all-zero it_value disarms a timerfd instead of
    // firing it, so deadlines that are already past are clamped to 1ns.
    Duration d = std::max<Duration>(1, t - now);
    LOG_DEBUG(_env, "arming timer in %s", d);
    // Zero-initialised, so it_interval stays zero and the timer is one-shot.
    struct itimerspec spec = {};
    spec.it_value = d.timespec();
    if (timerfd_settime(_fds[TIMER_FD].fd, 0, &spec, nullptr) < 0) {
        throw SYSCALL_EXCEPTION("timerfd_settime");
    }
    _timerExpiresAt = t;
}
|
||||
|
||||
const Duration HEARTBEAT_INTERVAL = 10_sec;
|
||||
const uint8_t HEARTBEAT_INTERVAL_SECS = HEARTBEAT_INTERVAL.ns/1'000'000'000ull;
|
||||
|
||||
void Xmon::_packLogon(XmonBuf& buf) {
|
||||
// <https://REDACTED>
|
||||
// Name Type Description
|
||||
// Magic int16 always 'T'
|
||||
@@ -90,7 +139,7 @@ void Xmon::packLogon(XmonBuf& buf) {
|
||||
buf.packString(details); // details
|
||||
}
|
||||
|
||||
void Xmon::packUpdate(XmonBuf& buf) {
|
||||
void Xmon::_packUpdate(XmonBuf& buf) {
|
||||
buf.reset();
|
||||
// Message type int32 0x1
|
||||
// (unused) int32 must be 0
|
||||
@@ -106,7 +155,7 @@ void Xmon::packUpdate(XmonBuf& buf) {
|
||||
buf.packScalar<XmonMood>(XmonMood::Happy);
|
||||
}
|
||||
|
||||
void Xmon::packRequest(XmonBuf& buf, const XmonRequest& req) {
|
||||
void Xmon::_packRequest(XmonBuf& buf, const XmonRequest& req) {
|
||||
buf.reset();
|
||||
buf.packScalar(req.msgType);
|
||||
buf.packScalar(req.alertId);
|
||||
@@ -142,11 +191,11 @@ bool XmonBuf::readIn(int fd, size_t sz, std::string& errString) {
|
||||
if (r < 0 && errno == EAGAIN && readSoFar == 0) {
|
||||
return false;
|
||||
}
|
||||
if (r < 0) {
|
||||
if (unlikely(r < 0)) {
|
||||
errString = generateErrString("read in xmon buf", errno);
|
||||
return false;
|
||||
}
|
||||
if (r == 0) {
|
||||
if (unlikely(r == 0)) {
|
||||
errString = "unexpected EOF";
|
||||
return false;
|
||||
}
|
||||
@@ -163,188 +212,218 @@ struct QuietAlert {
|
||||
std::string message;
|
||||
};
|
||||
|
||||
void Xmon::run() {
|
||||
XmonBuf buf;
|
||||
int sock = -1;
|
||||
std::deque<XmonRequest> requests;
|
||||
std::unordered_set<int64_t> binnableAlerts;
|
||||
std::unordered_map<int64_t, QuietAlert> quietAlerts;
|
||||
|
||||
EggsTime Xmon::_stepNextWakeup() {
|
||||
std::string errString;
|
||||
EggsTime nextWakeup = -1;
|
||||
|
||||
#define CHECK_ERR_STRING(__what) \
|
||||
if (errString.size()) { \
|
||||
LOG_INFO(_env, "could not %s: %s, will reconnect", __what, errString); \
|
||||
sleepFor(1_sec); \
|
||||
goto reconnect; \
|
||||
if (close(_fds[SOCK_FD].fd) < 0) { \
|
||||
throw SYSCALL_EXCEPTION("close"); \
|
||||
} \
|
||||
_fds[SOCK_FD].fd = -1; \
|
||||
(1_sec).sleep(); \
|
||||
return nextWakeup; \
|
||||
}
|
||||
|
||||
reconnect:
|
||||
errString.clear();
|
||||
sock = connectToHost(_xmonHost, _xmonPort, errString);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_INFO(_env, "connected to xmon %s:%s", _xmonHost, _xmonPort);
|
||||
|
||||
// 100ms timeout for prompt termination/processing
|
||||
{
|
||||
struct timeval tv;
|
||||
tv.tv_sec = 0;
|
||||
tv.tv_usec = 100'000;
|
||||
if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,&tv,sizeof(tv)) < 0) {
|
||||
throw SYSCALL_EXCEPTION("setsockopt");
|
||||
// Not connected, connect
|
||||
if (_fds[SOCK_FD].fd < 0) {
|
||||
_fds[SOCK_FD].fd = connectToHost(_xmonHost, _xmonPort, errString);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_INFO(_env, "connected to xmon %s:%s", _xmonHost, _xmonPort);
|
||||
|
||||
// non blocking receive
|
||||
if (fcntl(_fds[SOCK_FD].fd, F_SETFL, O_NONBLOCK) < 0) {
|
||||
throw SYSCALL_EXCEPTION("fcntl");
|
||||
}
|
||||
|
||||
// Send logon message
|
||||
_packLogon(_buf);
|
||||
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_INFO(_env, "sent logon to xmon, appType=%s appInstance=%s", _appType, _appInstance);
|
||||
|
||||
_gotHeartbeatAt = 0;
|
||||
nextWakeup = std::min(nextWakeup, eggsNow() + HEARTBEAT_INTERVAL*2);
|
||||
}
|
||||
|
||||
if (poll(_fds, NUM_FDS, -1) < 0) {
|
||||
throw SYSCALL_EXCEPTION("poll");
|
||||
}
|
||||
|
||||
auto now = eggsNow();
|
||||
|
||||
if (_fds[SOCK_FD].revents & (POLLIN|POLLHUP|POLLERR)) {
|
||||
LOG_DEBUG(_env, "got event in sock fd");
|
||||
for (;;) {
|
||||
// Receive everything
|
||||
errString.clear();
|
||||
bool success = _buf.readIn(_fds[SOCK_FD].fd, 4, errString);
|
||||
CHECK_ERR_STRING("read message type");
|
||||
if (!success) { return nextWakeup; }
|
||||
int32_t msgType = _buf.unpackScalar<int32_t>();
|
||||
switch (msgType) {
|
||||
case 0x0: {
|
||||
if (_gotHeartbeatAt == 0) {
|
||||
LOG_INFO(_env, "got first xmon heartbeat, will start sending requests");
|
||||
} else {
|
||||
LOG_DEBUG(_env, "got xmon heartbeat");
|
||||
}
|
||||
_gotHeartbeatAt = now;
|
||||
nextWakeup = std::min(nextWakeup, now + HEARTBEAT_INTERVAL*2);
|
||||
_packUpdate(_buf);
|
||||
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
||||
CHECK_ERR_STRING("send heartbeat");
|
||||
break; }
|
||||
case 0x1: {
|
||||
bool success = false;
|
||||
do {
|
||||
errString.clear();
|
||||
success = _buf.readIn(_fds[SOCK_FD].fd, 8, errString);
|
||||
CHECK_ERR_STRING("reading alert binned id");
|
||||
} while (!success);
|
||||
int64_t alertId = _buf.unpackScalar<int64_t>();
|
||||
LOG_INFO(_env, "got alert %s binned from UI", alertId);
|
||||
_binnableAlerts.erase(alertId);
|
||||
break; }
|
||||
default:
|
||||
throw EGGS_EXCEPTION("unknown message type %s", msgType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send logon message
|
||||
packLogon(buf);
|
||||
errString = buf.writeOut(sock);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_INFO(_env, "sent logon to xmon, appType=%s appInstance=%s", _appType, _appInstance);
|
||||
if (_fds[TIMER_FD].revents & POLLIN) {
|
||||
LOG_DEBUG(_env, "got event in timer FD");
|
||||
// drain timer
|
||||
{
|
||||
char buf[8];
|
||||
if (read(_fds[TIMER_FD].fd, buf, sizeof(buf)) != sizeof(buf)) {
|
||||
throw SYSCALL_EXCEPTION("read");
|
||||
}
|
||||
}
|
||||
_timerExpiresAt = -1; // it just fired
|
||||
|
||||
// Start recv loop
|
||||
EggsTime gotHeartbeatAt = 0;
|
||||
for (;;) {
|
||||
// try to send all requests before shutting down
|
||||
bool shutDown = _stopper.shouldStop();
|
||||
|
||||
// shut down if too much time has passed
|
||||
if (gotHeartbeatAt > 0 && (eggsNow() - gotHeartbeatAt > (uint64_t)HEARTBEAT_INTERVAL_SECS * 2 * 1'000'000'000ull)) {
|
||||
LOG_INFO(_env, "heartbeat deadline has passed, will reconnect");
|
||||
gotHeartbeatAt = 0;
|
||||
goto reconnect;
|
||||
// check if we're past the deadline
|
||||
if (_gotHeartbeatAt > 0) {
|
||||
nextWakeup = std::min(nextWakeup, _gotHeartbeatAt + HEARTBEAT_INTERVAL*2);
|
||||
LOG_DEBUG(_env, "nextWakeup=%s", nextWakeup);
|
||||
if ((now-_gotHeartbeatAt > HEARTBEAT_INTERVAL*2)) {
|
||||
LOG_INFO(_env, "heartbeat deadline has passed, will reconnect");
|
||||
if (close(_fds[SOCK_FD].fd) < 0) {
|
||||
throw SYSCALL_EXCEPTION("close");
|
||||
}
|
||||
_fds[SOCK_FD].fd = -1;
|
||||
return nextWakeup;
|
||||
}
|
||||
}
|
||||
|
||||
// send all requests
|
||||
if (gotHeartbeatAt > 0) {
|
||||
auto now = eggsNow();
|
||||
// unquiet alerts that are due
|
||||
for (auto it = quietAlerts.begin(); it != quietAlerts.end();) {
|
||||
if (now >= it->second.quietUntil) {
|
||||
requests.emplace_back(XmonRequest{
|
||||
.msgType = XmonRequestType::CREATE,
|
||||
.alertId = it->first,
|
||||
.quietPeriod = 0,
|
||||
.binnable = false,
|
||||
.message = std::move(it->second.message),
|
||||
});
|
||||
it = quietAlerts.erase(it);
|
||||
} else {
|
||||
it++;
|
||||
}
|
||||
// unquiet alerts that are due
|
||||
for (auto it = _quietAlerts.begin(); it != _quietAlerts.end();) {
|
||||
if (now >= it->second.quietUntil) {
|
||||
nextWakeup = std::min(nextWakeup, it->second.quietUntil);
|
||||
_queuedRequests.emplace_back(XmonRequest{
|
||||
.msgType = XmonRequestType::CREATE,
|
||||
.alertId = it->first,
|
||||
.quietPeriod = 0,
|
||||
.binnable = false,
|
||||
.message = std::move(it->second.message),
|
||||
});
|
||||
it = _quietAlerts.erase(it);
|
||||
} else {
|
||||
it++;
|
||||
}
|
||||
// get all requests
|
||||
_agent->getRequests(requests);
|
||||
while (!requests.empty()) {
|
||||
const auto& req = requests.front();
|
||||
LOG_INFO(_env, "got req of type %s alertId=%s", (int)XmonRequestType::CREATE, req.alertId);
|
||||
int64_t alertIdToInsert = -1;
|
||||
if (req.msgType == XmonRequestType::CREATE) {
|
||||
if (req.quietPeriod > 0) {
|
||||
ALWAYS_ASSERT(!req.binnable, "got alert with quietPeriod=%s, but it is binnable", req.quietPeriod);
|
||||
LOG_INFO(_env, "got non-binnable alertId=%s message=%s quietPeriod=%s, will wait", req.alertId, req.message, req.quietPeriod);
|
||||
quietAlerts[req.alertId] = QuietAlert{
|
||||
.quietUntil = now + req.quietPeriod,
|
||||
.message = std::move(req.message),
|
||||
};
|
||||
goto skip_request;
|
||||
} else if (req.binnable) {
|
||||
alertIdToInsert = req.alertId;
|
||||
if (binnableAlerts.size() > MAX_BINNABLE_ALERTS) {
|
||||
LOG_ERROR(_env, "not creating alert, aid=%s binnable=%s message=%s, we're full", req.alertId, req.binnable, req.message);
|
||||
if (binnableAlerts.count(XmonAgent::TOO_MANY_ALERTS_ALERT_ID) == 0) {
|
||||
XmonRequest req{
|
||||
.msgType = XmonRequestType::CREATE,
|
||||
.alertId = XmonAgent::TOO_MANY_ALERTS_ALERT_ID,
|
||||
.quietPeriod = 0,
|
||||
.binnable = true,
|
||||
.message = "too many alerts, alerts dropped",
|
||||
};
|
||||
packRequest(buf, req);
|
||||
alertIdToInsert = XmonAgent::TOO_MANY_ALERTS_ALERT_ID;
|
||||
goto write_request;
|
||||
} else {
|
||||
goto skip_request;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (_fds[PIPE_FD].revents & (POLLIN|POLLHUP|POLLERR)) {
|
||||
LOG_DEBUG(_env, "got event in pipe FD");
|
||||
// drain pipe
|
||||
for (;;) {
|
||||
XmonRequest req;
|
||||
if (!req.read(_fds[PIPE_FD].fd)) { break; }
|
||||
_queuedRequests.emplace_back(std::move(req));
|
||||
}
|
||||
}
|
||||
|
||||
// Write out all requests, if socket is alive anyway
|
||||
if (_gotHeartbeatAt > 0) {
|
||||
while (!_queuedRequests.empty()) {
|
||||
const auto& req = _queuedRequests.front();
|
||||
LOG_INFO(_env, "got req of type %s alertId=%s", (int)XmonRequestType::CREATE, req.alertId);
|
||||
int64_t alertIdToInsert = -1;
|
||||
if (req.msgType == XmonRequestType::CREATE) {
|
||||
if (req.quietPeriod > 0) {
|
||||
ALWAYS_ASSERT(!req.binnable, "got alert with quietPeriod=%s, but it is binnable", req.quietPeriod);
|
||||
LOG_INFO(_env, "got non-binnable alertId=%s message=%s quietPeriod=%s, will wait", req.alertId, req.message, req.quietPeriod);
|
||||
EggsTime quietUntil = now + req.quietPeriod;
|
||||
nextWakeup = std::min(nextWakeup, quietUntil);
|
||||
_quietAlerts[req.alertId] = QuietAlert{
|
||||
.quietUntil = quietUntil,
|
||||
.message = std::move(req.message),
|
||||
};
|
||||
goto skip_request;
|
||||
} else if (req.binnable) {
|
||||
alertIdToInsert = req.alertId;
|
||||
if (_binnableAlerts.size() > MAX_BINNABLE_ALERTS) {
|
||||
LOG_ERROR(_env, "not creating alert, aid=%s binnable=%s message=%s, we're full", req.alertId, req.binnable, req.message);
|
||||
if (_binnableAlerts.count(XmonAgent::TOO_MANY_ALERTS_ALERT_ID) == 0) {
|
||||
XmonRequest req{
|
||||
.msgType = XmonRequestType::CREATE,
|
||||
.alertId = XmonAgent::TOO_MANY_ALERTS_ALERT_ID,
|
||||
.quietPeriod = 0,
|
||||
.binnable = true,
|
||||
.message = "too many alerts, alerts dropped",
|
||||
};
|
||||
_packRequest(_buf, req);
|
||||
alertIdToInsert = XmonAgent::TOO_MANY_ALERTS_ALERT_ID;
|
||||
goto write_request;
|
||||
} else {
|
||||
goto skip_request;
|
||||
}
|
||||
}
|
||||
LOG_INFO(_env, "creating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
||||
} else if (req.msgType == XmonRequestType::UPDATE) {
|
||||
ALWAYS_ASSERT(!req.binnable);
|
||||
auto quiet = quietAlerts.find(req.alertId);
|
||||
if (quiet != quietAlerts.end()) {
|
||||
LOG_INFO(_env, "skipping update alertId=%s message=%s since it's still quiet", req.alertId, req.message);
|
||||
quiet->second.message = std::move(req.message);
|
||||
goto skip_request;
|
||||
}
|
||||
LOG_INFO(_env, "updating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
||||
} else if (req.msgType == XmonRequestType::CLEAR) {
|
||||
ALWAYS_ASSERT(!req.binnable);
|
||||
auto quiet = quietAlerts.find(req.alertId);
|
||||
if (quiet != quietAlerts.end()) {
|
||||
LOG_INFO(_env, "skipping clear alertId=%s since it's still quiet", req.alertId);
|
||||
quietAlerts.erase(quiet);
|
||||
goto skip_request;
|
||||
}
|
||||
LOG_INFO(_env, "clearing alert, aid=%s", req.alertId);
|
||||
} else {
|
||||
ALWAYS_ASSERT(false, "bad req type %s", (int)req.msgType);
|
||||
}
|
||||
packRequest(buf, req);
|
||||
write_request:
|
||||
errString = buf.writeOut(sock);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_DEBUG(_env, "sent request to xmon");
|
||||
if (alertIdToInsert >= 0) {
|
||||
binnableAlerts.insert(alertIdToInsert);
|
||||
LOG_INFO(_env, "creating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
||||
} else if (req.msgType == XmonRequestType::UPDATE) {
|
||||
ALWAYS_ASSERT(!req.binnable);
|
||||
auto quiet = _quietAlerts.find(req.alertId);
|
||||
if (quiet != _quietAlerts.end()) {
|
||||
LOG_INFO(_env, "skipping update alertId=%s message=%s since it's still quiet", req.alertId, req.message);
|
||||
quiet->second.message = std::move(req.message);
|
||||
goto skip_request;
|
||||
}
|
||||
skip_request:
|
||||
requests.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we're done
|
||||
if (shutDown) {
|
||||
LOG_DEBUG(_env, "got told to stop, stopping");
|
||||
close(sock);
|
||||
_stopper.stopDone();
|
||||
return;
|
||||
}
|
||||
|
||||
errString.clear();
|
||||
bool success = buf.readIn(sock, 4, errString);
|
||||
CHECK_ERR_STRING("read message type");
|
||||
if (!success) { continue; }
|
||||
int32_t msgType = buf.unpackScalar<int32_t>();
|
||||
switch (msgType) {
|
||||
case 0x0: {
|
||||
if (gotHeartbeatAt == 0) {
|
||||
LOG_INFO(_env, "got first xmon heartbeat, will start sending requests");
|
||||
LOG_INFO(_env, "updating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
||||
} else if (req.msgType == XmonRequestType::CLEAR) {
|
||||
ALWAYS_ASSERT(!req.binnable);
|
||||
auto quiet = _quietAlerts.find(req.alertId);
|
||||
if (quiet != _quietAlerts.end()) {
|
||||
LOG_INFO(_env, "skipping clear alertId=%s since it's still quiet", req.alertId);
|
||||
_quietAlerts.erase(quiet);
|
||||
goto skip_request;
|
||||
}
|
||||
LOG_INFO(_env, "clearing alert, aid=%s", req.alertId);
|
||||
} else {
|
||||
LOG_DEBUG(_env, "got xmon heartbeat");
|
||||
ALWAYS_ASSERT(false, "bad req type %s", (int)req.msgType);
|
||||
}
|
||||
gotHeartbeatAt = eggsNow();
|
||||
packUpdate(buf);
|
||||
errString = buf.writeOut(sock);
|
||||
CHECK_ERR_STRING("send heartbeat");
|
||||
break; }
|
||||
case 0x1: {
|
||||
bool success = false;
|
||||
do {
|
||||
errString.clear();
|
||||
success = buf.readIn(sock, 8, errString);
|
||||
CHECK_ERR_STRING("reading alert binned id");
|
||||
} while (!success);
|
||||
int64_t alertId = buf.unpackScalar<int64_t>();
|
||||
LOG_INFO(_env, "got alert %s binned from UI", alertId);
|
||||
binnableAlerts.erase(alertId);
|
||||
break; }
|
||||
default:
|
||||
throw EGGS_EXCEPTION("unknown message type %s", msgType);
|
||||
_packRequest(_buf, req);
|
||||
write_request:
|
||||
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
||||
CHECK_ERR_STRING(errString);
|
||||
LOG_DEBUG(_env, "sent request to xmon");
|
||||
if (alertIdToInsert >= 0) {
|
||||
_binnableAlerts.insert(alertIdToInsert);
|
||||
}
|
||||
skip_request:
|
||||
_queuedRequests.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
return nextWakeup;
|
||||
}
|
||||
|
||||
void* runXmon(void* server) {
|
||||
((Xmon*)server)->run();
|
||||
return nullptr;
|
||||
void Xmon::step() {
|
||||
EggsTime nextTimer = _stepNextWakeup();
|
||||
_ensureTimer(eggsNow(), nextTimer);
|
||||
}
|
||||
+35
-28
@@ -6,11 +6,13 @@
|
||||
#include <unordered_map>
|
||||
#include <semaphore>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
#include <poll.h>
|
||||
|
||||
#include "Common.hpp"
|
||||
#include "Env.hpp"
|
||||
#include "XmonAgent.hpp"
|
||||
#include "Undertaker.hpp"
|
||||
#include "Loop.hpp"
|
||||
#include "Stopper.hpp"
|
||||
|
||||
struct XmonConfig {
|
||||
@@ -98,11 +100,8 @@ struct XmonBuf {
|
||||
bool readIn(int fd, size_t sz, std::string& errString);
|
||||
};
|
||||
|
||||
void* runXmon(void*);
|
||||
|
||||
struct Xmon : Undertaker::Reapable {
|
||||
struct Xmon : Loop {
|
||||
private:
|
||||
Env _env;
|
||||
Stopper _stopper;
|
||||
std::shared_ptr<XmonAgent> _agent;
|
||||
std::string _hostname;
|
||||
@@ -111,34 +110,42 @@ private:
|
||||
std::string _xmonHost;
|
||||
uint16_t _xmonPort;
|
||||
|
||||
void packLogon(XmonBuf& buf);
|
||||
void packUpdate(XmonBuf& buf);
|
||||
void packRequest(XmonBuf& buf, const XmonRequest& req);
|
||||
// xmon socket, xmon agent pipe read end, timer fd
|
||||
static constexpr int SOCK_FD = 0;
|
||||
static constexpr int PIPE_FD = 1;
|
||||
static constexpr int TIMER_FD = 2;
|
||||
static constexpr int NUM_FDS = 3;
|
||||
struct pollfd _fds[NUM_FDS];
|
||||
// requests we got from the pipe
|
||||
std::deque<XmonRequest> _queuedRequests;
|
||||
// active binnable alerts
|
||||
std::unordered_set<int64_t> _binnableAlerts;
|
||||
// quiet alerts we're waiting to send out
|
||||
struct QuietAlert {
|
||||
EggsTime quietUntil;
|
||||
std::string message;
|
||||
};
|
||||
std::unordered_map<int64_t, QuietAlert> _quietAlerts;
|
||||
// last heartbeat from xmon
|
||||
EggsTime _gotHeartbeatAt;
|
||||
// what the timer fd expiration is currently set to
|
||||
EggsTime _timerExpiresAt;
|
||||
|
||||
XmonBuf _buf;
|
||||
|
||||
void _packLogon(XmonBuf& buf);
|
||||
void _packUpdate(XmonBuf& buf);
|
||||
void _packRequest(XmonBuf& buf, const XmonRequest& req);
|
||||
void _ensureTimer(EggsTime now, EggsTime t);
|
||||
|
||||
EggsTime _stepNextWakeup();
|
||||
public:
|
||||
Xmon(
|
||||
Logger& logger,
|
||||
std::shared_ptr<XmonAgent>& agent,
|
||||
const XmonConfig& config
|
||||
);
|
||||
~Xmon();
|
||||
|
||||
virtual ~Xmon() = default;
|
||||
|
||||
virtual void terminate() override {
|
||||
_env.flush();
|
||||
_stopper.stop();
|
||||
}
|
||||
|
||||
virtual void onAbort() override {
|
||||
_env.flush();
|
||||
}
|
||||
|
||||
void run();
|
||||
|
||||
static void spawn(Undertaker& undertaker, std::unique_ptr<Xmon> xmon) {
|
||||
pthread_t tid;
|
||||
if (pthread_create(&tid, nullptr, &runXmon, &*xmon) != 0) {
|
||||
throw SYSCALL_EXCEPTION("pthread_create");
|
||||
}
|
||||
undertaker.checkin(std::move(xmon), tid, "xmon");
|
||||
}
|
||||
virtual void step() override;
|
||||
};
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
#include "XmonAgent.hpp"
|
||||
#include "Assert.hpp"
|
||||
#include "Exception.hpp"
|
||||
|
||||
struct XmonRequestHeader {
|
||||
XmonRequestType msgType;
|
||||
int64_t alertId;
|
||||
Duration quietPeriod;
|
||||
bool binnable;
|
||||
uint16_t messageLen;
|
||||
};
|
||||
|
||||
void XmonRequest::write(int fd) const {
|
||||
ALWAYS_ASSERT(message.size() < 1<<16);
|
||||
XmonRequestHeader header = {
|
||||
.msgType = msgType,
|
||||
.alertId = alertId,
|
||||
.quietPeriod = quietPeriod,
|
||||
.binnable = binnable,
|
||||
.messageLen = (uint16_t)message.size(),
|
||||
};
|
||||
static thread_local char buf[PIPE_BUF];
|
||||
ALWAYS_ASSERT(sizeof(header) + header.messageLen < PIPE_BUF);
|
||||
memcpy(buf, &header, sizeof(header));
|
||||
memcpy(buf+sizeof(header), message.data(), header.messageLen);
|
||||
{
|
||||
// pipe writes of < PIPE_BUF are guaranteed to be atomic, see pipe(7)
|
||||
int written = ::write(fd, buf, sizeof(header)+header.messageLen);
|
||||
if (written != sizeof(header)+header.messageLen) {
|
||||
throw SYSCALL_EXCEPTION("write");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool XmonRequest::read(int fd) {
|
||||
static thread_local char buf[PIPE_BUF];
|
||||
XmonRequestHeader header;
|
||||
int read = ::read(fd, buf, sizeof(header));
|
||||
if (read < 0 && errno == EAGAIN) { return false; }
|
||||
if (read != sizeof(header)) {
|
||||
throw SYSCALL_EXCEPTION("read");
|
||||
}
|
||||
memcpy(&header, buf, sizeof(header));
|
||||
msgType = header.msgType;
|
||||
alertId = header.alertId;
|
||||
quietPeriod = header.quietPeriod;
|
||||
binnable = header.binnable;
|
||||
read = ::read(fd, buf, header.messageLen);
|
||||
if (read != header.messageLen) {
|
||||
throw SYSCALL_EXCEPTION("read");
|
||||
}
|
||||
buf[header.messageLen] = '\0';
|
||||
message = std::string(buf);
|
||||
return true;
|
||||
}
|
||||
+38
-13
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdint.h>
|
||||
#include <mutex>
|
||||
#include <deque>
|
||||
@@ -7,6 +8,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "Common.hpp"
|
||||
#include "Exception.hpp"
|
||||
#include "Time.hpp"
|
||||
|
||||
enum struct XmonRequestType : int32_t {
|
||||
@@ -21,6 +23,13 @@ struct XmonRequest {
|
||||
Duration quietPeriod;
|
||||
bool binnable;
|
||||
std::string message;
|
||||
|
||||
// multiple writers are fine.
|
||||
void write(int fd) const;
|
||||
|
||||
// Only one reader at a time, please. Return false
|
||||
// if we got EAGAIN immediately.
|
||||
bool read(int fd);
|
||||
};
|
||||
|
||||
struct XmonNCAlert {
|
||||
@@ -33,13 +42,15 @@ struct XmonNCAlert {
|
||||
|
||||
struct XmonAgent {
|
||||
private:
|
||||
std::mutex _mu;
|
||||
std::deque<XmonRequest> _requests;
|
||||
int _pipe[2];
|
||||
std::atomic<int64_t> _alertId;
|
||||
|
||||
void _addRequest(XmonRequest&& req) {
|
||||
std::lock_guard<std::mutex> lock(_mu);
|
||||
_requests.emplace_back(req);
|
||||
int _writeFd() const {
|
||||
return _pipe[1];
|
||||
}
|
||||
|
||||
void _sendRequest(const XmonRequest& req) {
|
||||
req.write(_writeFd());
|
||||
}
|
||||
|
||||
int64_t _createAlert(bool binnable, Duration quietPeriod, const std::string& message) {
|
||||
@@ -50,14 +61,30 @@ private:
|
||||
req.binnable = binnable;
|
||||
req.message = message;
|
||||
req.quietPeriod = quietPeriod;
|
||||
_addRequest(std::move(req));
|
||||
_sendRequest(req);
|
||||
return aid;
|
||||
}
|
||||
|
||||
public:
|
||||
static constexpr int64_t TOO_MANY_ALERTS_ALERT_ID = 0;
|
||||
|
||||
XmonAgent() : _alertId(1) {}
|
||||
XmonAgent() : _alertId(1) {
|
||||
if (pipe(_pipe) < 0) {
|
||||
throw SYSCALL_EXCEPTION("pipe");
|
||||
}
|
||||
if (fcntl(readFd(), F_SETFL, O_NONBLOCK) < 0) {
|
||||
throw SYSCALL_EXCEPTION("fcntl");
|
||||
}
|
||||
}
|
||||
|
||||
~XmonAgent() {
|
||||
if (close(_pipe[0]) < 0) {
|
||||
std::cerr << "Could not close read end of pipe pipe (" << errno << ")" << std::endl;
|
||||
}
|
||||
if (close(_pipe[1]) < 0) {
|
||||
std::cerr << "Could not close write end of pipe pipe (" << errno << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void raiseAlert(const std::string& message) {
|
||||
_createAlert(true, 0, message);
|
||||
@@ -73,7 +100,7 @@ public:
|
||||
req.binnable = false;
|
||||
req.message = message;
|
||||
req.quietPeriod = 0;
|
||||
_addRequest(std::move(req));
|
||||
_sendRequest(req);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,13 +112,11 @@ public:
|
||||
req.binnable = false;
|
||||
req.message = {};
|
||||
req.quietPeriod = 0;
|
||||
_addRequest(std::move(req));
|
||||
_sendRequest(req);
|
||||
aid.alertId = -1;
|
||||
}
|
||||
|
||||
void getRequests(std::deque<XmonRequest>& reqs) {
|
||||
std::lock_guard<std::mutex> lock(_mu);
|
||||
std::move(std::begin(_requests), std::end(_requests), std::back_inserter(reqs));
|
||||
_requests.clear();
|
||||
int readFd() const {
|
||||
return _pipe[0];
|
||||
}
|
||||
};
|
||||
|
||||
+60
-64
@@ -8,6 +8,7 @@
|
||||
#include <thread>
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "Assert.hpp"
|
||||
#include "Bincode.hpp"
|
||||
@@ -21,14 +22,14 @@
|
||||
#include "CDCKey.hpp"
|
||||
#include "Shuckle.hpp"
|
||||
#include "Time.hpp"
|
||||
#include "Undertaker.hpp"
|
||||
#include "Time.hpp"
|
||||
#include "wyhash.h"
|
||||
#include "Xmon.hpp"
|
||||
#include "Timings.hpp"
|
||||
#include "ErrorCount.hpp"
|
||||
#include "Loop.hpp"
|
||||
#include "PeriodicLoop.hpp"
|
||||
#include "Metrics.hpp"
|
||||
#include "Loop.hpp"
|
||||
|
||||
// Data needed to synchronize between the different threads
|
||||
struct ShardShared {
|
||||
@@ -106,6 +107,7 @@ struct ShardServer : Loop {
|
||||
private:
|
||||
// init data
|
||||
ShardShared& _shared;
|
||||
bool _initialized;
|
||||
ShardId _shid;
|
||||
int _ipPortIx;
|
||||
uint32_t _ownIp;
|
||||
@@ -130,6 +132,7 @@ public:
|
||||
ShardServer(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardId shid, const ShardOptions& options, int ipPortIx, ShardShared& shared) :
|
||||
Loop(logger, xmon, "server_" + std::to_string(ipPortIx+1)),
|
||||
_shared(shared),
|
||||
_initialized(false),
|
||||
_shid(shid),
|
||||
_ipPortIx(ipPortIx),
|
||||
_ownIp(options.ipPorts[ipPortIx].ip),
|
||||
@@ -150,7 +153,7 @@ public:
|
||||
convertProb("outgoing", options.simulateOutgoingPacketDrop, _outgoingPacketDropProbability);
|
||||
}
|
||||
|
||||
virtual void init() override {
|
||||
void init() {
|
||||
expandKey(CDCKey, _expandedCDCKey);
|
||||
|
||||
_sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
|
||||
@@ -171,15 +174,6 @@ public:
|
||||
throw SYSCALL_EXCEPTION("getsockname");
|
||||
}
|
||||
}
|
||||
// 10ms timeout for prompt termination
|
||||
{
|
||||
struct timeval tv;
|
||||
tv.tv_sec = 0;
|
||||
tv.tv_usec = 10'000;
|
||||
if (setsockopt(_sock, SOL_SOCKET, SO_RCVTIMEO,&tv,sizeof(tv)) < 0) {
|
||||
throw SYSCALL_EXCEPTION("setsockopt");
|
||||
}
|
||||
}
|
||||
|
||||
if (_ipPortIx == 0) {
|
||||
_shared.ip1.store(ntohl(_serverAddr.sin_addr.s_addr));
|
||||
@@ -198,13 +192,17 @@ public:
|
||||
}
|
||||
|
||||
virtual void step() override {
|
||||
if (!_initialized) {
|
||||
init();
|
||||
_initialized = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Read one request
|
||||
LOG_DEBUG(_env, "Blocking to read 1 request in socket %s", _ipPortIx);
|
||||
memset(&_clientAddr, 0, sizeof(_clientAddr));
|
||||
socklen_t addrLen = sizeof(_clientAddr);
|
||||
int read = recvfrom(_sock, _recvBuf.data(), _recvBuf.size(), 0, (struct sockaddr*)&_clientAddr, &addrLen);
|
||||
if (read < 0 && (errno == EAGAIN || errno == EINTR)) {
|
||||
return;
|
||||
}
|
||||
if (read < 0) {
|
||||
throw SYSCALL_EXCEPTION("recvfrom");
|
||||
}
|
||||
@@ -327,13 +325,6 @@ public:
|
||||
}
|
||||
LOG_DEBUG(_env, "sent response %s to %s", _respContainer->kind(), _clientAddr);
|
||||
}
|
||||
|
||||
// If we're terminating gracefully we're the last ones, close the db nicely
|
||||
void virtual finish() override {
|
||||
if (_ipPortIx == 0) {
|
||||
_shared.db.close();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ShardRegisterer : PeriodicLoop {
|
||||
@@ -355,7 +346,7 @@ public:
|
||||
_hasSecondIp(options.ipPorts[1].port != 0)
|
||||
{}
|
||||
|
||||
virtual void init() {
|
||||
void init() {
|
||||
_env.updateAlert(_alert, "Waiting to register ourselves for the first time");
|
||||
}
|
||||
|
||||
@@ -392,14 +383,12 @@ private:
|
||||
ShardLogEntry _logEntry;
|
||||
public:
|
||||
ShardBlockServiceUpdater(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardId shid, const ShardOptions& options, ShardShared& shared):
|
||||
PeriodicLoop(logger, xmon, "block_service_updater", {1_sec, 1_mins}),
|
||||
PeriodicLoop(logger, xmon, "bs_updater", {1_sec, 1_mins}),
|
||||
_shared(shared),
|
||||
_shid(shid),
|
||||
_shuckleHost(options.shuckleHost),
|
||||
_shucklePort(options.shucklePort)
|
||||
{}
|
||||
|
||||
virtual void init() override {
|
||||
{
|
||||
_env.updateAlert(_alert, "Waiting to fetch block services for the first time");
|
||||
}
|
||||
|
||||
@@ -441,7 +430,7 @@ private:
|
||||
std::vector<Stat> _stats;
|
||||
public:
|
||||
ShardStatsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardId shid, const ShardOptions& options, ShardShared& shared):
|
||||
PeriodicLoop(logger, xmon, "stats_inserter", {1_mins, 1_hours}),
|
||||
PeriodicLoop(logger, xmon, "stats", {1_mins, 1_hours}),
|
||||
_shared(shared),
|
||||
_shid(shid),
|
||||
_shuckleHost(options.shuckleHost),
|
||||
@@ -471,15 +460,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void finish() override {
|
||||
periodicStep();
|
||||
for (ShardMessageKind kind : allShardMessageKind) {
|
||||
const auto& timings = _shared.timings[(int)kind];
|
||||
uint64_t count = timings.count();
|
||||
if (count == 0) { continue; }
|
||||
LOG_INFO(_env, "%s: count=%s mean=%s p50=%s p90=%s p99=%s", kind, count, timings.mean(), timings.percentile(0.5), timings.percentile(0.9), timings.percentile(0.99));
|
||||
}
|
||||
}
|
||||
// TODO restore this when we have the functionality to do so
|
||||
// virtual void finish() override {
|
||||
// LOG_INFO(_env, "insert stats one last time");
|
||||
// periodicStep();
|
||||
// }
|
||||
};
|
||||
|
||||
struct ShardMetricsInserter : PeriodicLoop {
|
||||
@@ -491,7 +476,7 @@ private:
|
||||
std::unordered_map<std::string, uint64_t> _rocksDBStats;
|
||||
public:
|
||||
ShardMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardId shid, ShardShared& shared):
|
||||
PeriodicLoop(logger, xmon, "metrics_inserter", {1_sec, 1.0, 1_mins, 0.1}),
|
||||
PeriodicLoop(logger, xmon, "metrics", {1_sec, 1.0, 1_mins, 0.1}),
|
||||
_shared(shared),
|
||||
_shid(shid)
|
||||
{}
|
||||
@@ -548,18 +533,14 @@ public:
|
||||
};
|
||||
|
||||
void runShard(ShardId shid, const std::string& dbDir, const ShardOptions& options) {
|
||||
auto undertaker = Undertaker::acquireUndertaker();
|
||||
|
||||
std::ostream* logOut = &std::cout;
|
||||
std::ofstream fileOut;
|
||||
int logOutFd = STDOUT_FILENO;
|
||||
if (!options.logFile.empty()) {
|
||||
fileOut = std::ofstream(options.logFile, std::ios::out | std::ios::app);
|
||||
if (!fileOut.is_open()) {
|
||||
throw EGGS_EXCEPTION("Could not open log file `%s'\n", options.logFile);
|
||||
logOutFd = open(options.logFile.c_str(), O_WRONLY|O_CREAT|O_APPEND, 0644);
|
||||
if (logOutFd < 0) {
|
||||
throw SYSCALL_EXCEPTION("open");
|
||||
}
|
||||
logOut = &fileOut;
|
||||
}
|
||||
Logger logger(options.logLevel, *logOut, options.syslog, true);
|
||||
Logger logger(options.logLevel, logOutFd, options.syslog, true);
|
||||
|
||||
std::shared_ptr<XmonAgent> xmon;
|
||||
if (options.xmon) {
|
||||
@@ -587,17 +568,20 @@ void runShard(ShardId shid, const std::string& dbDir, const ShardOptions& option
|
||||
LOG_INFO(env, " syslog = %s", (int)options.syslog);
|
||||
}
|
||||
|
||||
// xmon first, so that by the time it shuts down it'll have all the leftover requests
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
if (xmon) {
|
||||
XmonConfig config;
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << std::setw(3) << std::setfill('0') << shid;
|
||||
config.appInstance = "eggsshard" + ss.str();
|
||||
}
|
||||
config.prod = options.xmonProd;
|
||||
config.appType = "restech_eggsfs.critical";
|
||||
Xmon::spawn(*undertaker, std::make_unique<Xmon>(logger, xmon, config));
|
||||
threads.emplace_back([&logger, xmon, shid, &options]() mutable {
|
||||
XmonConfig config;
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << std::setw(3) << std::setfill('0') << shid;
|
||||
config.appInstance = "eggsshard" + ss.str();
|
||||
}
|
||||
config.prod = options.xmonProd;
|
||||
config.appType = "restech_eggsfs.critical";
|
||||
Xmon(logger, xmon, config).run();
|
||||
});
|
||||
}
|
||||
|
||||
XmonNCAlert dbInitAlert;
|
||||
@@ -607,17 +591,29 @@ void runShard(ShardId shid, const std::string& dbDir, const ShardOptions& option
|
||||
|
||||
ShardShared shared(db);
|
||||
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardServer>(logger, xmon, shid, options, 0, shared));
|
||||
threads.emplace_back([&logger, xmon, shid, &options, &shared]() mutable {
|
||||
ShardServer(logger, xmon, shid, options, 0, shared).run();
|
||||
});
|
||||
if (options.ipPorts[1].ip != 0) {
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardServer>(logger, xmon, shid, options, 1, shared));
|
||||
threads.emplace_back([&logger, xmon, shid, &options, &shared]() mutable {
|
||||
ShardServer(logger, xmon, shid, options, 1, shared).run();
|
||||
});
|
||||
}
|
||||
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardRegisterer>(logger, xmon, shid, options, shared));
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardBlockServiceUpdater>(logger, xmon, shid, options, shared));
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardStatsInserter>(logger, xmon, shid, options, shared));
|
||||
threads.emplace_back([&logger, xmon, shid, &options, &shared]() mutable {
|
||||
ShardRegisterer(logger, xmon, shid, options, shared).run();
|
||||
});
|
||||
threads.emplace_back([&logger, xmon, shid, &options, &shared]() mutable {
|
||||
ShardBlockServiceUpdater(logger, xmon, shid, options, shared).run();
|
||||
});
|
||||
threads.emplace_back([&logger, xmon, shid, &options, &shared]() mutable {
|
||||
ShardStatsInserter(logger, xmon, shid, options, shared).run();
|
||||
});
|
||||
if (options.metrics) {
|
||||
Loop::spawn(*undertaker, std::make_unique<ShardMetricsInserter>(logger, xmon, shid, shared));
|
||||
threads.emplace_back([&logger, xmon, shid, &shared]() mutable {
|
||||
ShardMetricsInserter(logger, xmon, shid, shared).run();
|
||||
});
|
||||
}
|
||||
|
||||
undertaker->reap();
|
||||
for (;;) { sleep(60*60*24); }
|
||||
}
|
||||
+15
-15
@@ -246,22 +246,26 @@ struct ShardDBImpl {
|
||||
_initDb();
|
||||
}
|
||||
|
||||
void close() {
|
||||
~ShardDBImpl() {
|
||||
|
||||
LOG_INFO(_env, "destroying column families and closing database");
|
||||
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_defaultCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_filesCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_spansCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_transientCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_directoriesCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_edgesCf));
|
||||
ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_blockServicesToFilesCf));
|
||||
ROCKS_DB_CHECKED(_db->Close());
|
||||
const auto gentleRocksDBChecked = [this](const std::string& what, rocksdb::Status status) {
|
||||
if (!status.ok()) {
|
||||
LOG_INFO(_env, "Could not %s: %s", what, status.ToString());
|
||||
}
|
||||
};
|
||||
gentleRocksDBChecked("destroy default CF", _db->DestroyColumnFamilyHandle(_defaultCf));
|
||||
gentleRocksDBChecked("destroy files CF", _db->DestroyColumnFamilyHandle(_filesCf));
|
||||
gentleRocksDBChecked("destroy spans CF", _db->DestroyColumnFamilyHandle(_spansCf));
|
||||
gentleRocksDBChecked("destroy transient CF", _db->DestroyColumnFamilyHandle(_transientCf));
|
||||
gentleRocksDBChecked("destroy directories CF", _db->DestroyColumnFamilyHandle(_directoriesCf));
|
||||
gentleRocksDBChecked("destroy edges CF", _db->DestroyColumnFamilyHandle(_edgesCf));
|
||||
gentleRocksDBChecked("destroy block services CF", _db->DestroyColumnFamilyHandle(_blockServicesToFilesCf));
|
||||
gentleRocksDBChecked("close DB", _db->Close());
|
||||
|
||||
LOG_INFO(_env, "database closed");
|
||||
}
|
||||
|
||||
~ShardDBImpl() {
|
||||
delete _db;
|
||||
}
|
||||
|
||||
@@ -3816,10 +3820,6 @@ ShardDB::ShardDB(Logger& logger, std::shared_ptr<XmonAgent>& agent, ShardId shid
|
||||
_impl = new ShardDBImpl(logger, agent, shid, deadlineInterval, path);
|
||||
}
|
||||
|
||||
void ShardDB::close() {
|
||||
((ShardDBImpl*)_impl)->close();
|
||||
}
|
||||
|
||||
ShardDB::~ShardDB() {
|
||||
delete (ShardDBImpl*)_impl;
|
||||
_impl = nullptr;
|
||||
|
||||
@@ -39,9 +39,6 @@ public:
|
||||
ShardDB(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardId shid, Duration deadlineInterval, const std::string& path);
|
||||
~ShardDB();
|
||||
|
||||
// Stuff which might throw, and therefore not well suited to destructor.
|
||||
void close();
|
||||
|
||||
// Performs a read-only request, responding immediately. If an error is returned,
|
||||
// the contents of `resp` should be ignored.
|
||||
EggsError read(const ShardReqContainer& req, ShardRespContainer& resp);
|
||||
|
||||
+2
-12
@@ -5,6 +5,7 @@
|
||||
#include <rocksdb/db.h>
|
||||
#include <sstream>
|
||||
#include <random>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "Common.hpp"
|
||||
#include "Bincode.hpp"
|
||||
@@ -14,7 +15,6 @@
|
||||
#include "Crypto.hpp"
|
||||
#include "RocksDBUtils.hpp"
|
||||
#include "Time.hpp"
|
||||
#include "Undertaker.hpp"
|
||||
#include "CDCKey.hpp"
|
||||
#include "wyhash.h"
|
||||
#include "Timings.hpp"
|
||||
@@ -29,10 +29,6 @@ REGISTER_EXCEPTION_TRANSLATOR(AbstractException& ex) {
|
||||
return doctest::String(ss.str().c_str());
|
||||
}
|
||||
|
||||
__attribute__((constructor))
|
||||
void configureErrorHandlersBeforeTests() {
|
||||
Undertaker::configureErrorHandlers();
|
||||
}
|
||||
|
||||
template<typename A>
|
||||
static void bincodeTestScalar() {
|
||||
@@ -389,7 +385,7 @@ struct TempShardDB {
|
||||
ShardId shid;
|
||||
std::unique_ptr<ShardDB> db;
|
||||
|
||||
TempShardDB(LogLevel level, ShardId shid_): logger(level, std::cerr, false, false), shid(shid_) {
|
||||
TempShardDB(LogLevel level, ShardId shid_): logger(level, STDERR_FILENO, false, false), shid(shid_) {
|
||||
dbDir = std::string("temp-shard-db.XXXXXX");
|
||||
if (mkdtemp(dbDir.data()) == nullptr) {
|
||||
throw SYSCALL_EXCEPTION("mkdtemp");
|
||||
@@ -400,7 +396,6 @@ struct TempShardDB {
|
||||
|
||||
// useful to test recovery
|
||||
void restart() {
|
||||
db->close();
|
||||
std::shared_ptr<XmonAgent> xmon;
|
||||
db = std::make_unique<ShardDB>(logger, xmon, shid, DEFAULT_DEADLINE_INTERVAL, dbDir);
|
||||
}
|
||||
@@ -410,11 +405,6 @@ struct TempShardDB {
|
||||
if (std::filesystem::remove_all(std::filesystem::path(dbDir), err) < 0) {
|
||||
std::cerr << "Could not remove " << dbDir << ": " << err << std::endl;
|
||||
}
|
||||
try {
|
||||
db->close();
|
||||
} catch (...) {
|
||||
std::cerr << "Could not close Shard DB" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<ShardDB>& operator->() {
|
||||
|
||||
Reference in New Issue
Block a user