mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2025-12-30 23:39:46 -06:00
This means that they'll be interrupted at shutdown, rather than holding everything up when shuckle is overloaded. We also detect idle connection or slow transmitting data.
515 lines
18 KiB
C++
515 lines
18 KiB
C++
#include <limits>
|
|
#include <time.h>
|
|
#include <fcntl.h>
|
|
#include <unistd.h>
|
|
#include <bit>
|
|
#include <sys/socket.h>
|
|
#include <unordered_set>
|
|
#include <sys/timerfd.h>
|
|
|
|
#include "Common.hpp"
|
|
#include "Env.hpp"
|
|
#include "Exception.hpp"
|
|
#include "Time.hpp"
|
|
#include "Xmon.hpp"
|
|
#include "Connect.hpp"
|
|
#include "XmonAgent.hpp"
|
|
#include "strerror.h"
|
|
|
|
static const std::vector<XmonAppType> appTypes = {XmonAppType::NEVER, XmonAppType::DAYTIME, XmonAppType::CRITICAL};
|
|
|
|
static const char* appTypeString(XmonAppType appType) {
|
|
switch (appType) {
|
|
case XmonAppType::NEVER:
|
|
return "restech_eggsfs.never";
|
|
case XmonAppType::DAYTIME:
|
|
return "restech_eggsfs.daytime";
|
|
case XmonAppType::CRITICAL:
|
|
return "restech_eggsfs.critical";
|
|
default:
|
|
throw EGGS_EXCEPTION("Bad xmon app type %s", (int)appType);
|
|
}
|
|
}
|
|
|
|
enum struct XmonMood : int32_t {
|
|
Happy = 0,
|
|
Perturbed = 1,
|
|
Upset = 2,
|
|
};
|
|
|
|
static std::string explicitGenerateErrString(const std::string& what, int err, const char* str) {
|
|
std::stringstream ss;
|
|
ss << "could not " << what << ": " << err << "/" << str;
|
|
return ss.str();
|
|
}
|
|
|
|
static std::string generateErrString(const std::string& what, int err) {
|
|
return explicitGenerateErrString(what, err, (std::string(translateErrno(err)) + "=" + safe_strerror(err)).c_str());
|
|
}
|
|
|
|
Xmon::Xmon(
|
|
Logger& logger,
|
|
std::shared_ptr<XmonAgent>& agent,
|
|
const XmonConfig& config
|
|
) :
|
|
Loop(logger, agent, "xmon"),
|
|
_agent(agent),
|
|
_appInstance(config.appInstance),
|
|
_parent(config.appType),
|
|
_xmonHost(config.prod ? "REDACTED" : "REDACTED"),
|
|
_xmonPort(5004)
|
|
{
|
|
if (_appInstance.empty()) {
|
|
throw EGGS_EXCEPTION("empty app name");
|
|
}
|
|
{
|
|
char buf[HOST_NAME_MAX];
|
|
int res = gethostname(buf, HOST_NAME_MAX);
|
|
if (res != 0) {
|
|
throw SYSCALL_EXCEPTION("gethostname");
|
|
}
|
|
_hostname = buf;
|
|
_hostname = _hostname.substr(0, _hostname.find("."));
|
|
}
|
|
_appInstance = _appInstance + "@" + _hostname;
|
|
|
|
{
|
|
bool found = false;
|
|
for (const auto& appType: appTypes) {
|
|
if (appType == _parent) {
|
|
found = true;
|
|
continue;
|
|
}
|
|
_children.emplace_back(appType);
|
|
}
|
|
ALWAYS_ASSERT(found, "Did not find app type %s", (uint8_t)_parent);
|
|
}
|
|
|
|
// xmon socket
|
|
_fds[SOCK_FD].fd = -1;
|
|
_fds[SOCK_FD].events = POLLIN;
|
|
// xmon agent read end
|
|
_fds[PIPE_FD].fd = agent->readFd();
|
|
_fds[PIPE_FD].events = POLLIN;
|
|
// timer fd
|
|
_fds[TIMER_FD].fd = timerfd_create(CLOCK_MONOTONIC, 0);
|
|
if (_fds[TIMER_FD].fd < 0) {
|
|
throw SYSCALL_EXCEPTION("timerfd_create");
|
|
}
|
|
_fds[TIMER_FD].events = POLLIN;
|
|
|
|
// arm initial timer
|
|
{
|
|
auto now = eggsNow();
|
|
_timerExpiresAt = std::numeric_limits<uint64_t>::max();
|
|
_ensureTimer(now, now);
|
|
}
|
|
}
|
|
|
|
Xmon::~Xmon() {
|
|
if (_fds[SOCK_FD].fd >= 0) {
|
|
if (close(_fds[0].fd) < 0) {
|
|
std::cerr << "Could not close xmon socket (" << errno << ")" << std::endl;
|
|
}
|
|
}
|
|
if (close(_fds[TIMER_FD].fd) < 0) {
|
|
std::cerr << "Could not close timer fd (" << errno << ")" << std::endl;
|
|
}
|
|
}
|
|
|
|
void Xmon::_ensureTimer(EggsTime now, EggsTime t) {
|
|
if (_timerExpiresAt <= t) { return; }
|
|
Duration d = std::max<Duration>(1, t - now);
|
|
LOG_DEBUG(_env, "arming timer in %s", d);
|
|
struct itimerspec utmr;
|
|
memset(&utmr, 0, sizeof(utmr));
|
|
utmr.it_value = d.timespec();
|
|
if (timerfd_settime(_fds[TIMER_FD].fd, 0, &utmr, nullptr) < 0) {
|
|
throw SYSCALL_EXCEPTION("timerfd_settime");
|
|
}
|
|
_timerExpiresAt = t;
|
|
}
|
|
|
|
const Duration HEARTBEAT_INTERVAL = 10_sec;
|
|
const uint8_t HEARTBEAT_INTERVAL_SECS = HEARTBEAT_INTERVAL.ns/1'000'000'000ull;
|
|
|
|
void Xmon::_packLogon(XmonBuf& buf) {
|
|
buf.reset();
|
|
|
|
// <https://REDACTED>
|
|
// Name Type Description
|
|
// Magic int16 always 'T'
|
|
// Version int32 version number (latest version is 4)
|
|
// Message type int32 0x0
|
|
// (unused) int32 must be 0
|
|
// Hostname string your hostname
|
|
// Heartbeat interval byte in seconds, 0 for default (5s))
|
|
// (unused) byte must be 0
|
|
// (unused) int16 must be 0
|
|
// App type string defines rota and schedule
|
|
// App inst string arbitrary identifier
|
|
// Heap size int64 (deprecated)
|
|
// Num threads int32 (deprecated)
|
|
// Num errors int32 (deprecated)
|
|
// Mood int32 current mood
|
|
// Details JSON string (unused)
|
|
buf.packScalar<int16_t>('T'); // magic
|
|
buf.packScalar<int32_t>(4); // version
|
|
buf.packScalar<int32_t>(0x0); // message type
|
|
buf.packScalar<int32_t>(0x0); // unused
|
|
buf.packString(_hostname); // hostname
|
|
buf.packScalar<uint8_t>(HEARTBEAT_INTERVAL_SECS); // default interval
|
|
buf.packScalar<uint8_t>(0); // unused
|
|
buf.packScalar<int16_t>(0); // unused
|
|
buf.packString(appTypeString(_parent)); // app type
|
|
buf.packString(_appInstance); // app instance
|
|
buf.packScalar<int64_t>(0); // heap size
|
|
buf.packScalar<int32_t>(0); // num threads
|
|
buf.packScalar<int32_t>(0); // num errors
|
|
buf.packScalar<XmonMood>(XmonMood::Happy); // mood
|
|
std::string details;
|
|
buf.packString(details); // details
|
|
|
|
for (const auto& child: _children) {
|
|
buf.packScalar<int32_t>(0x100); // msg type
|
|
buf.packScalar<int32_t>(0x0); // must be 0
|
|
buf.packString(_hostname); // hostname
|
|
buf.packScalar<uint8_t>(0); // unused
|
|
buf.packScalar<uint8_t>(0); // unused
|
|
buf.packScalar<int16_t>(0); // unused
|
|
buf.packString(appTypeString(child)); // app type
|
|
buf.packString(_appInstance); // app instance
|
|
buf.packScalar<int64_t>(0); // heap size
|
|
buf.packScalar<int32_t>(0); // num threads
|
|
buf.packScalar<int32_t>(0); // num errors
|
|
buf.packScalar<XmonMood>(XmonMood::Happy); // mood
|
|
std::string details;
|
|
buf.packString(details); // details
|
|
}
|
|
}
|
|
|
|
void Xmon::_packUpdate(XmonBuf& buf) {
|
|
buf.reset();
|
|
// Message type int32 0x1
|
|
// (unused) int32 must be 0
|
|
// Heap size int64 (deprecated)
|
|
// Num threads int32 (deprecated)
|
|
// Num errors int32 (deprecated)
|
|
// Mood int32 updated mood
|
|
buf.packScalar<int32_t>(0x1);
|
|
buf.packScalar<int32_t>(0);
|
|
buf.packScalar<int64_t>(0);
|
|
buf.packScalar<int32_t>(0);
|
|
buf.packScalar<int32_t>(0);
|
|
buf.packScalar<XmonMood>(XmonMood::Happy);
|
|
}
|
|
|
|
void Xmon::_packRequest(XmonBuf& buf, const XmonRequest& req) {
|
|
buf.reset();
|
|
|
|
bool hasMessage = req.msgType == XmonRequestType::CREATE || req.msgType == XmonRequestType::UPDATE;
|
|
|
|
if (_parent == req.appType) {
|
|
buf.packScalar(req.msgType);
|
|
buf.packScalar(req.alertId);
|
|
if (hasMessage) {
|
|
buf.packScalar(req.binnable);
|
|
buf.packString(req.message);
|
|
}
|
|
} else {
|
|
buf.packScalar<int32_t>((int32_t)req.msgType | 0x100);
|
|
buf.packString(appTypeString(req.appType));
|
|
buf.packString(_appInstance);
|
|
buf.packString(std::to_string(req.alertId));
|
|
if (hasMessage) {
|
|
buf.packScalar((int32_t)req.binnable);
|
|
buf.packString(req.message);
|
|
std::string json; // unused
|
|
buf.packString(json);
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string XmonBuf::writeOut(int fd) {
|
|
const char* curr = buf + cur;
|
|
const char* end = buf + len;
|
|
while (curr < end) {
|
|
ssize_t written = write(fd, curr, end-curr);
|
|
if (written < 0) {
|
|
return generateErrString("writing out xmon buf", errno);
|
|
}
|
|
if (written == 0) {
|
|
return "unexpected EOF";
|
|
}
|
|
curr += written;
|
|
}
|
|
return {};
|
|
}
|
|
|
|
void XmonBuf::readIn(int fd, size_t sz, std::string& errString) {
|
|
errString.clear();
|
|
reset();
|
|
ensureUnpackSizeOrPanic(sz);
|
|
size_t readSoFar = 0;
|
|
while (readSoFar < sz) {
|
|
ssize_t r = read(fd, buf+cur+readSoFar, sz-readSoFar);
|
|
if (unlikely(r < 0)) {
|
|
errString = generateErrString("read in xmon buf", errno);
|
|
return;
|
|
}
|
|
if (unlikely(r == 0)) {
|
|
errString = "unexpected EOF";
|
|
return;
|
|
}
|
|
readSoFar += r;
|
|
}
|
|
len += sz;
|
|
}
|
|
|
|
constexpr int MAX_BINNABLE_ALERTS = 20;
|
|
|
|
EggsTime Xmon::_stepNextWakeup() {
|
|
std::string errString;
|
|
EggsTime nextWakeup = std::numeric_limits<uint64_t>::max();
|
|
|
|
#define CHECK_ERR_STRING(__what) \
|
|
if (errString.size()) { \
|
|
LOG_INFO(_env, "could not %s: %s, will reconnect", __what, errString); \
|
|
if (_fds[SOCK_FD].fd >= 0 && close(_fds[SOCK_FD].fd) < 0) { \
|
|
LOG_INFO(_env, "could not close xmon socket after error: %s (%s)", errno, safe_strerror(errno)); \
|
|
} \
|
|
_fds[SOCK_FD].fd = -1; \
|
|
(1_sec).sleep(); \
|
|
return nextWakeup; \
|
|
}
|
|
|
|
errString.clear();
|
|
|
|
// Not connected, connect
|
|
if (_fds[SOCK_FD].fd < 0) {
|
|
{
|
|
auto [sock, err] = connectToHost(_xmonHost, _xmonPort);
|
|
_fds[SOCK_FD].fd = sock.release();
|
|
errString = err;
|
|
}
|
|
CHECK_ERR_STRING(errString);
|
|
LOG_INFO(_env, "connected to xmon %s:%s", _xmonHost, _xmonPort);
|
|
|
|
// Send logon message(s)
|
|
_packLogon(_buf);
|
|
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
|
CHECK_ERR_STRING(errString);
|
|
LOG_INFO(_env, "sent logon to xmon, appType=%s appInstance=%s", appTypeString(_parent), _appInstance);
|
|
|
|
_gotHeartbeatAt = 0;
|
|
nextWakeup = std::min(nextWakeup, eggsNow() + HEARTBEAT_INTERVAL*2);
|
|
}
|
|
|
|
if (poll(_fds, NUM_FDS, -1) < 0) {
|
|
if (errno == EINTR) { return nextWakeup; }
|
|
throw SYSCALL_EXCEPTION("poll");
|
|
}
|
|
|
|
auto now = eggsNow();
|
|
|
|
if (_fds[SOCK_FD].revents & (POLLIN|POLLHUP|POLLERR)) {
|
|
LOG_DEBUG(_env, "got event in sock fd");
|
|
// Receive everything
|
|
_buf.readIn(_fds[SOCK_FD].fd, 4, errString);
|
|
CHECK_ERR_STRING("read message type");
|
|
int32_t msgType = _buf.unpackScalar<int32_t>();
|
|
switch (msgType) {
|
|
case 0x0: {
|
|
if (_gotHeartbeatAt == 0) {
|
|
LOG_INFO(_env, "got first xmon heartbeat, will start sending requests");
|
|
} else {
|
|
LOG_DEBUG(_env, "got xmon heartbeat");
|
|
}
|
|
_gotHeartbeatAt = now;
|
|
nextWakeup = std::min(nextWakeup, now + HEARTBEAT_INTERVAL*2);
|
|
_packUpdate(_buf);
|
|
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
|
CHECK_ERR_STRING("send heartbeat");
|
|
break; }
|
|
case 0x1: {
|
|
_buf.readIn(_fds[SOCK_FD].fd, 8, errString);
|
|
CHECK_ERR_STRING("reading alert binned id");
|
|
int64_t alertId = _buf.unpackScalar<int64_t>();
|
|
LOG_INFO(_env, "got alert %s binned from UI", alertId);
|
|
_binnableAlerts.erase(alertId);
|
|
break; }
|
|
case 0x101: {
|
|
// we don't care about which child this is, alert ids are unique anyway
|
|
// also this really ain't the best code
|
|
_buf.readIn(_fds[SOCK_FD].fd, 2, errString);
|
|
CHECK_ERR_STRING("reading child app type length");
|
|
{
|
|
uint16_t appTypeLen = _buf.unpackScalar<uint16_t>();
|
|
_buf.readIn(_fds[SOCK_FD].fd, appTypeLen, errString);
|
|
CHECK_ERR_STRING("reading child app type");
|
|
}
|
|
_buf.readIn(_fds[SOCK_FD].fd, 2, errString);
|
|
CHECK_ERR_STRING("reading child app instance length");
|
|
{
|
|
uint16_t appInstanceLen = _buf.unpackScalar<uint16_t>();
|
|
_buf.readIn(_fds[SOCK_FD].fd, appInstanceLen, errString);
|
|
CHECK_ERR_STRING("reading child app instance");
|
|
}
|
|
_buf.readIn(_fds[SOCK_FD].fd, 2, errString);
|
|
CHECK_ERR_STRING("reading alert id length");
|
|
int64_t alertId;
|
|
{
|
|
uint16_t alertIdLen = _buf.unpackScalar<uint16_t>();
|
|
_buf.readIn(_fds[SOCK_FD].fd, alertIdLen, errString);
|
|
CHECK_ERR_STRING("reading alert id");
|
|
size_t idx;
|
|
alertId = std::stoll(std::string(_buf.buf, alertIdLen), &idx);
|
|
if (idx != alertIdLen) {
|
|
errString = "could not parse alert id";
|
|
CHECK_ERR_STRING("parsing alert id");
|
|
}
|
|
}
|
|
LOG_INFO(_env, "got alert %s binned from UI", alertId);
|
|
_binnableAlerts.erase(alertId);
|
|
break; }
|
|
default:
|
|
throw EGGS_EXCEPTION("unknown message type %s", msgType);
|
|
}
|
|
}
|
|
|
|
if (_fds[TIMER_FD].revents & POLLIN) {
|
|
LOG_DEBUG(_env, "got event in timer FD");
|
|
// drain timer
|
|
{
|
|
char buf[8];
|
|
if (read(_fds[TIMER_FD].fd, buf, sizeof(buf)) != sizeof(buf)) {
|
|
throw SYSCALL_EXCEPTION("read");
|
|
}
|
|
}
|
|
_timerExpiresAt = std::numeric_limits<uint64_t>::max(); // it just fired
|
|
|
|
// check if we're past the deadline
|
|
if (_gotHeartbeatAt > 0) {
|
|
nextWakeup = std::min(nextWakeup, _gotHeartbeatAt + HEARTBEAT_INTERVAL*2);
|
|
LOG_DEBUG(_env, "nextWakeup=%s", nextWakeup);
|
|
if ((now-_gotHeartbeatAt > HEARTBEAT_INTERVAL*2)) {
|
|
LOG_INFO(_env, "heartbeat deadline has passed, will reconnect");
|
|
if (close(_fds[SOCK_FD].fd) < 0) {
|
|
throw SYSCALL_EXCEPTION("close");
|
|
}
|
|
_fds[SOCK_FD].fd = -1;
|
|
return nextWakeup;
|
|
}
|
|
}
|
|
|
|
// unquiet alerts that are due
|
|
for (auto it = _quietAlerts.begin(); it != _quietAlerts.end();) {
|
|
if (now >= it->second.quietUntil) {
|
|
nextWakeup = std::min(nextWakeup, it->second.quietUntil);
|
|
_queuedRequests.emplace_back(XmonRequest{
|
|
.alertId = it->first,
|
|
.quietPeriod = 0,
|
|
.message = std::move(it->second.message),
|
|
.msgType = XmonRequestType::CREATE,
|
|
.appType = it->second.appType,
|
|
.binnable = false,
|
|
});
|
|
it = _quietAlerts.erase(it);
|
|
} else {
|
|
it++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (_fds[PIPE_FD].revents & (POLLIN|POLLHUP|POLLERR)) {
|
|
LOG_DEBUG(_env, "got event in pipe FD");
|
|
XmonRequest req;
|
|
req.read(_fds[PIPE_FD].fd);
|
|
if (req.appType == XmonAppType::DEFAULT) {
|
|
req.appType = _parent;
|
|
}
|
|
_queuedRequests.emplace_back(std::move(req));
|
|
}
|
|
|
|
// Write out all requests, if socket is alive anyway
|
|
if (_gotHeartbeatAt > 0) {
|
|
while (!_queuedRequests.empty()) {
|
|
const auto& req = _queuedRequests.front();
|
|
LOG_INFO(_env, "got req of type %s alertId=%s", (int)XmonRequestType::CREATE, req.alertId);
|
|
int64_t alertIdToInsert = -1;
|
|
if (req.msgType == XmonRequestType::CREATE) {
|
|
if (req.quietPeriod > 0) {
|
|
ALWAYS_ASSERT(!req.binnable, "got alert with quietPeriod=%s, but it is binnable", req.quietPeriod);
|
|
LOG_INFO(_env, "got non-binnable alertId=%s message=%s quietPeriod=%s, will wait", req.alertId, req.message, req.quietPeriod);
|
|
EggsTime quietUntil = now + req.quietPeriod;
|
|
nextWakeup = std::min(nextWakeup, quietUntil);
|
|
_quietAlerts[req.alertId] = QuietAlert{
|
|
.quietUntil = quietUntil,
|
|
.appType = req.appType,
|
|
.message = std::move(req.message),
|
|
};
|
|
goto skip_request;
|
|
} else if (req.binnable) {
|
|
alertIdToInsert = req.alertId;
|
|
if (_binnableAlerts.size() > MAX_BINNABLE_ALERTS) {
|
|
LOG_ERROR(_env, "not creating alert, aid=%s binnable=%s message=%s, we're full", req.alertId, req.binnable, req.message);
|
|
if (_binnableAlerts.count(XmonAgent::TOO_MANY_ALERTS_ALERT_ID) == 0) {
|
|
XmonRequest req{
|
|
.alertId = XmonAgent::TOO_MANY_ALERTS_ALERT_ID,
|
|
.quietPeriod = 0,
|
|
.message = "too many alerts, alerts dropped",
|
|
.msgType = XmonRequestType::CREATE,
|
|
.appType = _parent,
|
|
.binnable = true,
|
|
};
|
|
_packRequest(_buf, req);
|
|
alertIdToInsert = XmonAgent::TOO_MANY_ALERTS_ALERT_ID;
|
|
goto write_request;
|
|
} else {
|
|
goto skip_request;
|
|
}
|
|
}
|
|
}
|
|
LOG_INFO(_env, "creating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
|
} else if (req.msgType == XmonRequestType::UPDATE) {
|
|
ALWAYS_ASSERT(!req.binnable);
|
|
auto quiet = _quietAlerts.find(req.alertId);
|
|
if (quiet != _quietAlerts.end()) {
|
|
LOG_INFO(_env, "skipping update alertId=%s message=%s since it's still quiet", req.alertId, req.message);
|
|
quiet->second.message = std::move(req.message);
|
|
goto skip_request;
|
|
}
|
|
LOG_INFO(_env, "updating alert, aid=%s binnable=%s message=%s", req.alertId, req.binnable, req.message);
|
|
} else if (req.msgType == XmonRequestType::CLEAR) {
|
|
ALWAYS_ASSERT(!req.binnable);
|
|
auto quiet = _quietAlerts.find(req.alertId);
|
|
if (quiet != _quietAlerts.end()) {
|
|
LOG_INFO(_env, "skipping clear alertId=%s since it's still quiet", req.alertId);
|
|
_quietAlerts.erase(quiet);
|
|
goto skip_request;
|
|
}
|
|
LOG_INFO(_env, "clearing alert, aid=%s", req.alertId);
|
|
} else {
|
|
throw EGGS_EXCEPTION("bad req type %s", (int)req.msgType);
|
|
}
|
|
_packRequest(_buf, req);
|
|
write_request:
|
|
errString = _buf.writeOut(_fds[SOCK_FD].fd);
|
|
CHECK_ERR_STRING(errString);
|
|
LOG_DEBUG(_env, "sent request to xmon");
|
|
if (alertIdToInsert >= 0) {
|
|
_binnableAlerts.insert(alertIdToInsert);
|
|
}
|
|
skip_request:
|
|
_queuedRequests.pop_front();
|
|
}
|
|
}
|
|
|
|
return nextWakeup;
|
|
}
|
|
|
|
void Xmon::step() {
|
|
EggsTime nextTimer = _stepNextWakeup();
|
|
_ensureTimer(eggsNow(), nextTimer);
|
|
}
|