Kill all references to internal services

This commit is contained in:
Francesco Mazzoli
2025-09-03 10:12:25 +00:00
parent 110705db8d
commit 4096e73818
28 changed files with 366 additions and 181 deletions

17
cpp/Dockerfile-alpine Normal file
View File

@@ -0,0 +1,17 @@
# The image we use to build the static "alpine" binaries
# that we deploy.
# We are staying on 3.18 and patching in go1.22 instead of moving to 3.20
# due to issues with compiling the go-sqlite3 lib on 3.20.
# We intend to drop the alpine build entirely, see https://internal-repo/issues/336.
FROM alpine:3.18
# Note: there is deliberately no `RUN set -eu` here. Every RUN starts a fresh
# shell, so options set in one step never reach the next one; a failing RUN
# already aborts the build on its own.
RUN apk update
RUN apk add ca-certificates
RUN /usr/sbin/update-ca-certificates
RUN apk add --no-cache bash perl coreutils python3 musl gcc g++ clang lld make cmake ninja mandoc linux-headers patch wget
# Explicitly install go outside of apk since the default version is 1.18.
# Download, verify, extract and clean up in a single step chained with `&&`:
# a failure at any stage (in particular a failed extraction) fails the build
# instead of being masked by the exit status of the trailing `rm`, and the
# tarball does not linger in an intermediate layer.
# The checksum line uses the canonical "<hash>  <file>" (two spaces) format.
RUN wget https://go.dev/dl/go1.22.3.linux-amd64.tar.gz \
    && echo "8920ea521bad8f6b7bc377b4824982e011c19af27df88a815e3586ea895f1b36  go1.22.3.linux-amd64.tar.gz" | sha256sum --check \
    && tar -C /usr/local -xzf go1.22.3.linux-amd64.tar.gz \
    && rm go1.22.3.linux-amd64.tar.gz
ENV PATH="${PATH}:/usr/local/go/bin"
# Checked by cpp/build.py to detect that it is already running inside the build container.
ENV IN_TERN_BUILD_CONTAINER=Y

14
cpp/Dockerfile-ubuntu Normal file
View File

@@ -0,0 +1,14 @@
# The image we use to build the "ubuntu" flavours of the binaries.
FROM ubuntu:22.04
# Note: there is deliberately no `RUN set -eu` here. Every RUN starts a fresh
# shell, so options set in one step never reach the next one; a failing RUN
# already aborts the build on its own.
# See <https://stackoverflow.com/questions/71941032/why-i-cannot-run-apt-update-inside-a-fresh-ubuntu22-04>
RUN rm -f /etc/apt/apt.conf.d/docker-clean
# `apt-get update` and `apt-get install` live in the same layer so that a
# cached package index is never combined with a modified package list, and
# `apt-get clean` runs in that same layer so it actually reduces its size.
RUN apt-get update \
    && apt-get install -y python3 gcc g++ clang lld make cmake ninja-build mandoc build-essential git fuse valgrind llvm ca-certificates wget \
    && apt-get clean
RUN /usr/sbin/update-ca-certificates
# Explicitly install go outside of apt since the default version is 1.18.
# Download, verify, extract and clean up in a single step chained with `&&`:
# a failure at any stage (in particular a failed extraction) fails the build
# instead of being masked by the exit status of the trailing `rm`, and the
# tarball does not linger in an intermediate layer.
# The checksum line uses the canonical "<hash>  <file>" (two spaces) format.
RUN wget https://go.dev/dl/go1.22.3.linux-amd64.tar.gz \
    && echo "8920ea521bad8f6b7bc377b4824982e011c19af27df88a815e3586ea895f1b36  go1.22.3.linux-amd64.tar.gz" | sha256sum --check \
    && tar -C /usr/local -xzf go1.22.3.linux-amd64.tar.gz \
    && rm go1.22.3.linux-amd64.tar.gz
ENV PATH="${PATH}:/usr/local/go/bin"
# Checked by cpp/build.py to detect that it is already running inside the build container.
ENV IN_TERN_BUILD_CONTAINER=Y

View File

@@ -19,17 +19,17 @@ repo_dir = cpp_dir.parent
build_dir = cpp_dir / 'build' / build_type
build_dir.mkdir(parents=True, exist_ok=True)
if build_type in ('ubuntu', 'ubuntudebug', 'ubuntusanitized', 'ubuntuvalgrind', 'alpine', 'alpinedebug') and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
if build_type in ('ubuntu', 'ubuntudebug', 'ubuntusanitized', 'ubuntuvalgrind', 'alpine', 'alpinedebug') and 'IN_TERN_BUILD_CONTAINER' not in os.environ:
if build_type.startswith('alpine'):
container = 'REDACTED'
container = 'ghcr.io/xtxmarkets/ternfs-alpine-build:2025-09-03'
else:
container = 'REDACTED'
container = 'ghcr.io/xtxmarkets/ternfs-ubuntu-build:2025-09-03'
# See <https://groups.google.com/g/seastar-dev/c/r7W-Kqzy9O4>
# for motivation for `--security-opt seccomp=unconfined`,
# the `--pids-limit -1` is not something I hit but it seems
# like a good idea.
subprocess.run(
['docker', 'run', '--network', 'host', '--pids-limit', '-1', '--security-opt', 'seccomp=unconfined', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', '-u', f'{os.getuid()}:{os.getgid()}', container, '/eggsfs/cpp/build.py', build_type] + sys.argv[2:],
['docker', 'run', '--network', 'host', '-e', 'http_proxy', '-e', 'https_proxy', '-e', 'no_proxy', '--pids-limit', '-1', '--security-opt', 'seccomp=unconfined', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/ternfs', '-u', f'{os.getuid()}:{os.getgid()}', container, '/ternfs/cpp/build.py', build_type] + sys.argv[2:],
check=True,
)
else:

View File

@@ -1066,6 +1066,7 @@ static void logsDBstatsToMetrics(struct MetricsBuilder& metricsBuilder, const Lo
struct CDCMetricsInserter : PeriodicLoop {
private:
InfluxDB _influxDB;
CDCShared& _shared;
ReplicaId _replicaId;
XmonNCAlert _sendMetricsAlert;
@@ -1073,8 +1074,9 @@ private:
std::unordered_map<std::string, uint64_t> _rocksDBStats;
XmonNCAlert _updateSizeAlert;
public:
CDCMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, CDCShared& shared, ReplicaId replicaId):
CDCMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const InfluxDB& influxDB, CDCShared& shared, ReplicaId replicaId):
PeriodicLoop(logger, xmon, "metrics", {1_sec, 1.0, 1_mins, 0.1}),
_influxDB(influxDB),
_shared(shared),
_replicaId(replicaId),
_sendMetricsAlert(XmonAppType::DAYTIME, 1_mins),
@@ -1143,7 +1145,7 @@ public:
}
}
logsDBstatsToMetrics(_metricsBuilder, _shared.logsDB.getStats(), _replicaId, now);
std::string err = sendMetrics(10_sec, _metricsBuilder.payload());
std::string err = sendMetrics(_influxDB, 10_sec, _metricsBuilder.payload());
_metricsBuilder.reset();
if (err.empty()) {
LOG_INFO(_env, "Sent metrics to influxdb");
@@ -1168,7 +1170,7 @@ void runCDC(CDCOptions& options) {
Logger logger(options.logLevel, logOutFd, options.syslog, true);
std::shared_ptr<XmonAgent> xmon;
if (options.xmon) {
if (!options.xmonAddr.empty()) {
xmon = std::make_shared<XmonAgent>();
}
@@ -1197,7 +1199,7 @@ void runCDC(CDCOptions& options) {
config.appInstance = ss.str();
}
config.appType = XmonAppType::CRITICAL;
config.prod = options.xmonProd;
config.addr = options.xmonAddr;
threads.emplace_back(LoopThread::Spawn(std::make_unique<Xmon>(logger, xmon, config)));
}
@@ -1227,8 +1229,8 @@ void runCDC(CDCOptions& options) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<CDCShardUpdater>(logger, xmon, options, shared)));
threads.emplace_back(LoopThread::Spawn(std::make_unique<CDCServer>(logger, xmon, options, shared)));
threads.emplace_back(LoopThread::Spawn(std::make_unique<CDCRegisterer>(logger, xmon, options, shared)));
if (options.metrics) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<CDCMetricsInserter>(logger, xmon, shared, options.replicaId)));
if (options.influxDB) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<CDCMetricsInserter>(logger, xmon, *options.influxDB, shared, options.replicaId)));
}
LoopThread::waitUntilStopped(threads);

View File

@@ -1,10 +1,13 @@
#pragma once
#include <cstdint>
#include <optional>
#include "Env.hpp"
#include "Msgs.hpp"
#include "Shard.hpp"
#include "Time.hpp"
#include <cstdint>
#include "Metrics.hpp"
struct CDCOptions {
LogLevel logLevel = LogLevel::LOG_INFO;
@@ -17,9 +20,8 @@ struct CDCOptions {
AddrsInfo cdcToShardAddress = {};
bool syslog = false;
Duration shardTimeout = 100_ms;
bool xmon = false;
bool xmonProd = false;
bool metrics = false;
std::string xmonAddr;
std::optional<InfluxDB> influxDB;
ReplicaId replicaId;
uint8_t location;

View File

@@ -23,9 +23,11 @@ void usage(const char* binary) {
fprintf(stderr, " If not provided, stdout.\n");
fprintf(stderr, " -shard-timeout-ms milliseconds\n");
fprintf(stderr, " How much to wait for shard responses. Right now this is a simple loop.\n");
fprintf(stderr, " -xmon qa|prod\n");
fprintf(stderr, " -xmon host:port\n");
fprintf(stderr, " Enable Xmon alerts.\n");
fprintf(stderr, " -metrics\n");
fprintf(stderr, " -influx-db-origin\n");
fprintf(stderr, " -influx-db-org\n");
fprintf(stderr, " -influx-db-bucket\n");
fprintf(stderr, " Enable metrics.\n");
fprintf(stderr, " -logsdb-leader\n");
fprintf(stderr, " Allow replica to become leader. Default is false\n");
@@ -102,6 +104,9 @@ int main(int argc, char** argv) {
CDCOptions options;
std::vector<std::string> args;
std::string shuckleAddress;
std::string influxDBOrigin;
std::string influxDBOrg;
std::string influxDBBucket;
uint8_t numAddressesFound = 0;
for (int i = 1; i < argc; i++) {
const auto getNextArg = [argc, &argv, &dieWithUsage, &i]() {
@@ -156,18 +161,13 @@ int main(int argc, char** argv) {
}
options.shardTimeout = Duration(ms * 1'000'000);
} else if (arg == "-xmon") {
options.xmon = true;
std::string xmonEnv = getNextArg();
if (xmonEnv == "qa") {
options.xmonProd = false;
} else if (xmonEnv == "prod") {
options.xmonProd = true;
} else {
fprintf(stderr, "Invalid xmon env %s", xmonEnv.c_str());
dieWithUsage();
}
} else if (arg == "-metrics") {
options.metrics = true;
options.xmonAddr = getNextArg();
} else if (arg == "-influx-db-origin") {
influxDBOrigin = getNextArg();
} else if (arg == "-influx-db-org") {
influxDBOrg = getNextArg();
} else if (arg == "-influx-db-bucket") {
influxDBBucket = getNextArg();
} else if (arg == "-logsdb-leader") {
options.avoidBeingLeader = false;
} else if (arg == "-logsdb-no-replication") {
@@ -177,6 +177,18 @@ int main(int argc, char** argv) {
}
}
// The three -influx-db-* flags only make sense together: reject any
// combination where some are set and others are not.
if (influxDBOrigin.empty() != influxDBOrg.empty() || influxDBOrigin.empty() != influxDBBucket.empty()) {
    fprintf(stderr, "Either all or none of the -influx-db flags must be provided\n");
    dieWithUsage();
}
// After the check above either all three values are set or none is, so
// testing the origin alone is enough. Only populate `options.influxDB`
// (which is what turns the metrics thread on) when the flags were actually
// given; leaving it empty keeps metrics disabled.
if (!influxDBOrigin.empty()) {
    options.influxDB = InfluxDB{
        .origin = influxDBOrigin,
        .org = influxDBOrg,
        .bucket = influxDBBucket,
    };
}
if (args.size() < 2 || args.size() > 3) {
fprintf(stderr, "Expecting two or three positional argument (DIRECTORY REPLICA_ID [LOCATION_ID]), got %ld.\n", args.size());
dieWithUsage();

View File

@@ -6,30 +6,6 @@
#define CPPHTTPLIB_USE_POLL
#include "httplib.h"
#if 0
curl -vvv --request POST \
"http://REDACTED?org=restech&bucket=metrics&precision=ns" \
--header "Content-Type: text/plain; charset=utf-8" \
--header "Accept: application/json" \
--data-binary 'eggsfs_fmazzol_test,sensor_id=TLM0201 temperature=73.97038159354763,humidity=35.23103248356096,co=0.48445310567793615 1690998953988422912'
curl -G 'http://REDACTED?pretty=true' --data-urlencode "db=metrics" --data-urlencode "q=SELECT \"value\" FROM \"disk_ecn_hostmon_disk_write_bytes\""
var TeamRestech = &Team{
Name: "restech",
Domain: "REDACTED",
ManagementDomain: "REDACTED",
XmonRota: "restech",
BootServerURLs: []string{"https://REDACTED"},
BuildServerURL: "https://REDACTED",
MetricsPrefix: "restech",
InfluxOrg: "restech",
InfluxDBURL: "http://REDACTED",
}
#endif
// We're being overly conservative here, see
// <https://docs.influxdata.com/influxdb/v2/reference/syntax/line-protocol/#special-characters>,
// but better being safe than sorry.
@@ -88,23 +64,19 @@ void MetricsBuilder::timestamp(TernTime t) {
_state = State::TIMESTAMP;
}
const std::string influxDBUrl = "http://REDACTED";
const std::string influxDBOrg = "restech";
const std::string influxDBBucket = "metrics";
std::string sendMetrics(Duration timeout, const std::string& payload) {
httplib::Client cli(influxDBUrl);
std::string sendMetrics(const InfluxDB& idb, Duration timeout, const std::string& payload) {
httplib::Client cli(idb.origin);
time_t ds = timeout.ns / 1'000'000'000ull;
time_t dus = (timeout.ns % 1'000'000'000ull) / 1'000ull;
cli.set_connection_timeout(ds, dus);
cli.set_read_timeout(ds, dus);
cli.set_write_timeout(ds, dus);
std::string path = "/api/v2/write?org=" + influxDBOrg + "&bucket=" + influxDBBucket + "&precision=ns";
std::string path = "/api/v2/write?org=" + idb.org + "&bucket=" + idb.bucket + "&precision=ns";
const auto res = cli.Post(path, payload, "text/plain");
if (res.error() != httplib::Error::Success) {
std::ostringstream ss;
ss << "Could not insert metrics to " << influxDBUrl << path << ": " << res.error();
ss << "Could not insert metrics to " << idb.origin << path << ": " << res.error();
return ss.str();
}
return {};

View File

@@ -54,5 +54,11 @@ public:
void timestamp(TernTime t);
};
// Parameters identifying where sendMetrics() writes metrics to.
struct InfluxDB {
// Handed as-is to the HTTP client constructor in sendMetrics().
// NOTE(review): presumably of the form scheme://host[:port] -- confirm against callers.
std::string origin;
// Value of the `org` query parameter of the `/api/v2/write` request.
// Concatenated into the path verbatim, without URL escaping.
std::string org;
// Value of the `bucket` query parameter of the `/api/v2/write` request.
// Concatenated into the path verbatim, without URL escaping.
std::string bucket;
};
// error string on error
std::string sendMetrics(Duration timeout, const std::string& payload);
std::string sendMetrics(const InfluxDB& idb, Duration timeout, const std::string& payload);

View File

@@ -6,6 +6,7 @@
#include <sys/socket.h>
#include <unordered_set>
#include <sys/timerfd.h>
#include <charconv>
#include "Common.hpp"
#include "Env.hpp"
@@ -55,13 +56,36 @@ Xmon::Xmon(
Loop(logger, agent, "xmon"),
_agent(agent),
_appInstance(config.appInstance),
_parent(config.appType),
_xmonHost(config.prod ? "REDACTED" : "REDACTED"),
_xmonPort(5004)
_parent(config.appType)
{
if (_appInstance.empty()) {
throw TERN_EXCEPTION("empty app name");
}
{
    // Parse `config.addr`, expected as "host:port", into _xmonHost/_xmonPort.
    // We split on the LAST colon, and require both sides to be non-empty.
    const size_t colon = config.addr.find_last_of(':');
    if (colon == std::string::npos || colon == 0 || colon == config.addr.size()-1) {
        throw TERN_EXCEPTION("invalid xmon addr %s", config.addr);
    }
    std::string host = config.addr.substr(0, colon);
    std::string portString = config.addr.substr(colon + 1);

    // The port must be a plain decimal number spanning the whole remainder:
    // `ptr` not reaching the end means there was trailing garbage.
    uint64_t port64 = 0;
    auto [ptr, ec] = std::from_chars(portString.data(), portString.data() + portString.size(), port64);
    if (ec != std::errc() || ptr != portString.data() + portString.size()) {
        throw TERN_EXCEPTION("invalid xmon addr %s", config.addr);
    }
    // Do not write the bound as `~(uint16_t)0`: the operand is promoted to
    // `int` before `~` is applied, giving -1, which then converts to
    // UINT64_MAX in the comparison and makes the check unreachable.
    if (port64 > UINT16_MAX) {
        throw TERN_EXCEPTION("invalid port %s", port64);
    }

    _xmonHost = host;
    _xmonPort = port64;
}
{
char buf[HOST_NAME_MAX];
int res = gethostname(buf, HOST_NAME_MAX);
@@ -136,7 +160,6 @@ const uint8_t HEARTBEAT_INTERVAL_SECS = HEARTBEAT_INTERVAL.ns/1'000'000'000ull;
void Xmon::_packLogon(XmonBuf& buf) {
buf.reset();
// <https://REDACTED>
// Name Type Description
// Magic int16 always 'T'
// Version int32 version number (latest version is 4)

View File

@@ -15,7 +15,7 @@
#include "Loop.hpp"
struct XmonConfig {
bool prod = false;
std::string addr;
XmonAppType appType = XmonAppType::NEVER;
std::string appInstance = "";
};

View File

@@ -2000,6 +2000,7 @@ static void logsDBstatsToMetrics(struct MetricsBuilder& metricsBuilder, const Lo
struct ShardMetricsInserter : PeriodicLoop {
private:
InfluxDB _influxDB;
ShardShared& _shared;
ShardReplicaId _shrid;
uint8_t _location;
@@ -2009,8 +2010,9 @@ private:
std::array<XmonNCAlert, 2> _sockQueueAlerts;
XmonNCAlert _writeQueueAlert;
public:
ShardMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, ShardShared& shared):
ShardMetricsInserter(Logger& logger, std::shared_ptr<XmonAgent>& xmon, const InfluxDB& influxDB, ShardShared& shared):
PeriodicLoop(logger, xmon, "metrics", {1_sec, 1.0, 1_mins, 0.1}),
_influxDB(influxDB),
_shared(shared),
_shrid(_shared.options.shrid),
_location(_shared.options.location),
@@ -2103,7 +2105,7 @@ public:
}
}
logsDBstatsToMetrics(_metricsBuilder, _shared.logsDB.getStats(), _shrid, _location, now);
std::string err = sendMetrics(10_sec, _metricsBuilder.payload());
std::string err = sendMetrics(_influxDB, 10_sec, _metricsBuilder.payload());
_metricsBuilder.reset();
if (err.empty()) {
LOG_INFO(_env, "Sent metrics to influxdb");
@@ -2127,7 +2129,7 @@ void runShard(ShardOptions& options) {
Logger logger(options.logLevel, logOutFd, options.syslog, true);
std::shared_ptr<XmonAgent> xmon;
if (options.xmon) {
if (!options.xmonAddr.empty()) {
xmon = std::make_shared<XmonAgent>();
}
@@ -2158,7 +2160,7 @@ void runShard(ShardOptions& options) {
ss << "eggsshard_" << std::setfill('0') << std::setw(3) << options.shrid.shardId() << "_" << options.shrid.replicaId();
config.appInstance = ss.str();
}
config.prod = options.xmonProd;
config.addr = options.xmonAddr;
config.appType = XmonAppType::CRITICAL;
threads.emplace_back(LoopThread::Spawn(std::make_unique<Xmon>(logger, xmon, config)));
}
@@ -2197,8 +2199,8 @@ void runShard(ShardOptions& options) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<ShardWriter>(logger, xmon, shared)));
threads.emplace_back(LoopThread::Spawn(std::make_unique<ShardRegisterer>(logger, xmon, shared)));
threads.emplace_back(LoopThread::Spawn(std::make_unique<ShardBlockServiceUpdater>(logger, xmon, shared)));
if (options.metrics) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<ShardMetricsInserter>(logger, xmon, shared)));
if (options.influxDB) {
threads.emplace_back(LoopThread::Spawn(std::make_unique<ShardMetricsInserter>(logger, xmon, *options.influxDB, shared)));
}
// from this point on termination on SIGINT/SIGTERM will be graceful

View File

@@ -1,10 +1,13 @@
#pragma once
#include <cstdint>
#include <optional>
#include "Env.hpp"
#include "Msgs.hpp"
#include "MsgsGen.hpp"
#include "ShardDB.hpp"
#include <cstdint>
#include "Metrics.hpp"
struct ShardOptions {
ShardReplicaId shrid;
@@ -22,9 +25,8 @@ struct ShardOptions {
// resilience of the system.
double simulateOutgoingPacketDrop = 0.0;
bool syslog = false;
bool xmon = false;
bool xmonProd = false;
bool metrics = false;
std::string xmonAddr;
std::optional<InfluxDB> influxDB;
Duration transientDeadlineInterval = DEFAULT_DEADLINE_INTERVAL;
// LogsDB settings

View File

@@ -30,9 +30,11 @@ static void usage(const char* binary) {
fprintf(stderr, " If not provided, stdout.\n");
fprintf(stderr, " -outgoing-packet-drop [0, 1)\n");
fprintf(stderr, " Drop given ratio of packets after processing them.\n");
fprintf(stderr, " -xmon qa|prod\n");
fprintf(stderr, " -xmon host:port\n");
fprintf(stderr, " Enable Xmon alerts.\n");
fprintf(stderr, " -metrics\n");
fprintf(stderr, " -influx-db-origin\n");
fprintf(stderr, " -influx-db-org\n");
fprintf(stderr, " -influx-db-bucket\n");
fprintf(stderr, " Enable metrics.\n");
fprintf(stderr, " -transient-deadline-interval\n");
fprintf(stderr, " Tweaks the interval with wich the deadline for transient file gets bumped.\n");
@@ -144,6 +146,9 @@ int main(int argc, char** argv) {
ShardOptions options;
std::vector<std::string> args;
std::string shuckleAddress;
std::string influxDBOrigin;
std::string influxDBOrg;
std::string influxDBBucket;
uint8_t numAddressesFound = 0;
for (int i = 1; i < argc; i++) {
const auto getNextArg = [argc, &argv, &dieWithUsage, &i]() {
@@ -189,18 +194,13 @@ int main(int argc, char** argv) {
} else if (arg == "-syslog") {
options.syslog = true;
} else if (arg == "-xmon") {
options.xmon = true;
std::string xmonEnv = getNextArg();
if (xmonEnv == "qa") {
options.xmonProd = false;
} else if (xmonEnv == "prod") {
options.xmonProd = true;
} else {
fprintf(stderr, "Invalid xmon env %s", xmonEnv.c_str());
dieWithUsage();
}
} else if (arg == "-metrics") {
options.metrics = true;
options.xmonAddr = getNextArg();
} else if (arg == "-influx-db-origin") {
influxDBOrigin = getNextArg();
} else if (arg == "-influx-db-org") {
influxDBOrg = getNextArg();
} else if (arg == "-influx-db-bucket") {
influxDBBucket = getNextArg();
} else if (arg == "-transient-deadline-interval") {
options.transientDeadlineInterval = parseDuration(getNextArg());
} else if (arg == "-logsdb-leader") {
@@ -212,9 +212,21 @@ int main(int argc, char** argv) {
}
}
// The three -influx-db-* flags only make sense together: reject any
// combination where some are set and others are not.
if (influxDBOrigin.empty() != influxDBOrg.empty() || influxDBOrigin.empty() != influxDBBucket.empty()) {
    fprintf(stderr, "Either all or none of the -influx-db flags must be provided\n");
    dieWithUsage();
}
// After the check above either all three values are set or none is, so
// testing the origin alone is enough. Only populate `options.influxDB`
// (which is what turns the metrics thread on) when the flags were actually
// given; leaving it empty keeps metrics disabled.
if (!influxDBOrigin.empty()) {
    options.influxDB = InfluxDB{
        .origin = influxDBOrigin,
        .org = influxDBOrg,
        .bucket = influxDBBucket,
    };
}
if (options.noReplication && options.avoidBeingLeader) {
fprintf(stderr, "-logsdb-leader needs to be set if -logsdb-no-replication is set\n");
dieWithUsage();
dieWithUsage();
}
if (args.size() < 3 || args.size() > 4) {