// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: GPL-2.0-or-later
#include <thread>
#include <atomic>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include <memory>
#include <cstdint>
#include <cmath>
#include <algorithm>
#include <unistd.h>
#include "BlockServicePicker.hpp"
#include "BlockServicesCacheDB.hpp"
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "doctest.h"
static Logger testLogger(LogLevel::LOG_ERROR, STDERR_FILENO, false, false);
static std::shared_ptr<XmonAgent> testXmon;
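// Builds a picker with the shared test logger/xmon; throughput and min-free-space
// parameters default to zero unless a test overrides them.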
static BlockServicePicker makePicker(Duration writableDelay = 0_sec,
uint64_t hddDriveThroughput = 0,
uint64_t flashDriveThroughput = 0,
uint64_t minSpaceRequiredForWrite = 0) {
return BlockServicePicker(testLogger, testXmon, writableDelay,
hddDriveThroughput, flashDriveThroughput, minSpaceRequiredForWrite);
}
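// Fills all 16 bytes of a failure-domain name with the same byte, so tests can
// refer to an FD by a single value.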
static FailureDomain fdWith(uint8_t v) {
FailureDomain fd;
for (int i = 0; i < 16; ++i) fd.name.data[i] = v;
return fd;
}
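// Builds a BlockServiceInfoShort with the given id, location, storage class and
// failure-domain byte.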
static BlockServiceInfoShort bs(uint64_t id, uint8_t loc, uint8_t sc, uint8_t fdByte) {
BlockServiceInfoShort x;
x.id = BlockServiceId(id);
x.locationId = loc;
x.storageClass = sc;
x.failureDomain = fdWith(fdByte);
return x;
}
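// Turns a list of services into a cache of writable entries (EMPTY flags,
// 1 MB available out of 10 MB capacity).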
static std::unordered_map<uint64_t, BlockServiceCache> makeCatalog(const std::vector<BlockServiceInfoShort>& services) {
std::unordered_map<uint64_t, BlockServiceCache> cache;
for (const auto& svc : services) {
BlockServiceCache entry;
entry.locationId = svc.locationId;
entry.storageClass = svc.storageClass;
entry.failureDomain = svc.failureDomain.name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = 1000000;
entry.capacityBytes = 10000000;
entry.blocks = 0;
entry.hasFiles = false;
cache[svc.id.u64] = entry;
}
return cache;
}
TEST_CASE("picker basic selection") {
auto p = makePicker();
std::vector<BlockServiceInfoShort> catalog{
bs(1, 1, FLASH_STORAGE, 1),
bs(2, 1, FLASH_STORAGE, 2),
bs(3, 1, FLASH_STORAGE, 3),
bs(4, 2, HDD_STORAGE, 4)
};
auto cache = makeCatalog(catalog);
p.update(cache);
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 2, {}, out);
CHECK(err == TernError::NO_ERROR);
CHECK(out.size() == 2);
CHECK(out[0] != out[1]);
}
TEST_CASE("picker blacklist by id and failure domain") {
auto p = makePicker();
std::vector<BlockServiceInfoShort> catalog{
bs(10, 1, FLASH_STORAGE, 7),
bs(11, 1, FLASH_STORAGE, 8),
bs(12, 1, FLASH_STORAGE, 9)
};
auto cache = makeCatalog(catalog);
p.update(cache);
std::vector<BlacklistEntry> bl;
BlacklistEntry e1; e1.blockService = BlockServiceId(10); bl.push_back(e1);
BlacklistEntry e2; e2.failureDomain = fdWith(8); bl.push_back(e2);
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 2, bl, out);
// Only bs 12 remains available, so asking for two must fail outright.
CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES);
CHECK(out.size() == 0);
}
TEST_CASE("picker insufficient candidates") {
auto p = makePicker();
std::vector<BlockServiceInfoShort> catalog{
bs(20, 1, HDD_STORAGE, 1)
};
auto cache = makeCatalog(catalog);
p.update(cache);
std::vector<BlockServiceId> out;
auto err = p.pick(1, HDD_STORAGE, 2, {}, out);
CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES);
CHECK(out.size() == 0);
}
TEST_CASE("picker concurrency update while picks") {
auto p = makePicker();
std::atomic<bool> stop{false};
// Start with many candidates
std::vector<BlockServiceInfoShort> catalog;
for (uint64_t i = 0; i < 64; ++i) { catalog.push_back(bs(100+i, 1, FLASH_STORAGE, (uint8_t)i)); }
auto cache = makeCatalog(catalog);
p.update(cache);
std::thread t1([&]{
std::vector<BlockServiceId> out;
while (!stop.load()) {
auto err = p.pick(1, FLASH_STORAGE, 8, {}, out);
CHECK(err == TernError::NO_ERROR);
CHECK(out.size() == 8);
}
});
// Rebuild state repeatedly
for (int round = 0; round < 100; ++round) {
std::vector<BlockServiceInfoShort> cat2;
for (uint64_t i = 0; i < 64; ++i) { cat2.push_back(bs(200+i+round, 1, FLASH_STORAGE, (uint8_t)i)); }
auto cache2 = makeCatalog(cat2);
p.update(cache2);
}
stop = true;
t1.join();
}
TEST_CASE("picker weighted distribution") {
auto p = makePicker();
// Create services with different weights
// Service 1000: 10 MB available (weight 10M)
// Service 2000: 20 MB available (weight 20M)
// Service 3000: 30 MB available (weight 30M)
// Service 4000: 40 MB available (weight 40M)
// Total: 100M, so expected ratios are 10%, 20%, 30%, 40%
std::vector<BlockServiceInfoShort> catalog{
bs(1000, 1, FLASH_STORAGE, 1),
bs(2000, 1, FLASH_STORAGE, 2),
bs(3000, 1, FLASH_STORAGE, 3),
bs(4000, 1, FLASH_STORAGE, 4)
};
std::unordered_map<uint64_t, BlockServiceCache> cache;
for (const auto& svc : catalog) {
BlockServiceCache entry;
entry.locationId = svc.locationId;
entry.storageClass = svc.storageClass;
entry.failureDomain = svc.failureDomain.name.data;
entry.flags = BlockServiceFlags::EMPTY; // writable
entry.capacityBytes = 100000000;
entry.blocks = 0;
entry.hasFiles = false;
// Set different availableBytes for each service to create weights
if (svc.id.u64 == 1000) entry.availableBytes = 10000000;
else if (svc.id.u64 == 2000) entry.availableBytes = 20000000;
else if (svc.id.u64 == 3000) entry.availableBytes = 30000000;
else if (svc.id.u64 == 4000) entry.availableBytes = 40000000;
cache[svc.id.u64] = entry;
}
p.update(cache);
// Perform many picks to measure distribution
const int NUM_ITERATIONS = 10000;
std::unordered_map<uint64_t, int> pickCounts;
for (int i = 0; i < NUM_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
pickCounts[out[0].u64]++;
}
// Check that all services were picked
CHECK(pickCounts.size() == 4);
CHECK(pickCounts.count(1000) > 0);
CHECK(pickCounts.count(2000) > 0);
CHECK(pickCounts.count(3000) > 0);
CHECK(pickCounts.count(4000) > 0);
// Verify distribution is roughly proportional to weights
// Expected picks: service 1000 → ~1000, service 2000 → ~2000, service 3000 → ~3000, service 4000 → ~4000
// Allow 20% deviation from expected
CHECK(pickCounts[1000] > 800);
CHECK(pickCounts[1000] < 1200);
CHECK(pickCounts[2000] > 1600);
CHECK(pickCounts[2000] < 2400);
CHECK(pickCounts[3000] > 2400);
CHECK(pickCounts[3000] < 3600);
CHECK(pickCounts[4000] > 3200);
CHECK(pickCounts[4000] < 4800);
}
TEST_CASE("picker blacklist enforcement") {
auto p = makePicker();
// Create 6 services in 6 different failure domains
std::vector<BlockServiceInfoShort> catalog{
bs(1001, 1, FLASH_STORAGE, 1), // FD 1
bs(1002, 1, FLASH_STORAGE, 2), // FD 2 - blacklist this FD
bs(1003, 1, FLASH_STORAGE, 3), // FD 3
bs(1004, 1, FLASH_STORAGE, 4), // FD 4 - blacklist service 1004
bs(1005, 1, FLASH_STORAGE, 5), // FD 5
bs(1006, 1, FLASH_STORAGE, 6) // FD 6
};
auto cache = makeCatalog(catalog);
p.update(cache);
// Blacklist: entire FD 2, and service 1004 from FD 4
std::vector<BlacklistEntry> blacklist;
BlacklistEntry fdBlacklist;
fdBlacklist.failureDomain = fdWith(2);
blacklist.push_back(fdBlacklist);
BlacklistEntry serviceBlacklist;
serviceBlacklist.blockService = BlockServiceId(1004);
blacklist.push_back(serviceBlacklist);
// Perform many picks to verify blacklisted items never appear
const int NUM_ITERATIONS = 5000;
std::unordered_set<uint64_t> pickedServices;
for (int i = 0; i < NUM_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, blacklist, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
pickedServices.insert(out[0].u64);
// Verify blacklisted services never picked
CHECK(out[0].u64 != 1002); // FD 2 is blacklisted
CHECK(out[0].u64 != 1004); // Service 1004 is blacklisted
}
// Verify only valid services were picked
CHECK(pickedServices.count(1001) > 0); // Should be picked
CHECK(pickedServices.count(1002) == 0); // Blacklisted FD
CHECK(pickedServices.count(1003) > 0); // Should be picked
CHECK(pickedServices.count(1004) == 0); // Blacklisted service
CHECK(pickedServices.count(1005) > 0); // Should be picked
CHECK(pickedServices.count(1006) > 0); // Should be picked
}
TEST_CASE("picker weighted distribution with blacklist") {
auto p = makePicker();
// Create services with varying weights across multiple failure domains
// FD 1: service 100 (10MB), service 101 (10MB) - total 20MB
// FD 2: service 200 (40MB) - BLACKLIST THIS
// FD 3: service 300 (20MB), service 301 (20MB) - total 40MB
// Without blacklist: FD1=20MB (20%), FD2=40MB (40%), FD3=40MB (40%)
// With blacklist: FD1=20MB (33%), FD3=40MB (67%)
std::vector<BlockServiceInfoShort> catalog{
bs(100, 1, FLASH_STORAGE, 1),
bs(101, 1, FLASH_STORAGE, 1),
bs(200, 1, FLASH_STORAGE, 2),
bs(300, 1, FLASH_STORAGE, 3),
bs(301, 1, FLASH_STORAGE, 3)
};
std::unordered_map<uint64_t, BlockServiceCache> cache;
for (const auto& svc : catalog) {
BlockServiceCache entry;
entry.locationId = svc.locationId;
entry.storageClass = svc.storageClass;
entry.failureDomain = svc.failureDomain.name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.capacityBytes = 100000000;
entry.blocks = 0;
entry.hasFiles = false;
if (svc.id.u64 == 100 || svc.id.u64 == 101) entry.availableBytes = 10000000;
else if (svc.id.u64 == 200) entry.availableBytes = 40000000;
else if (svc.id.u64 == 300 || svc.id.u64 == 301) entry.availableBytes = 20000000;
cache[svc.id.u64] = entry;
}
p.update(cache);
// Blacklist entire FD 2
std::vector<BlacklistEntry> blacklist;
BlacklistEntry fdBlacklist;
fdBlacklist.failureDomain = fdWith(2);
blacklist.push_back(fdBlacklist);
// Perform many picks
const int NUM_ITERATIONS = 6000;
std::unordered_map<uint64_t, int> pickCounts;
for (int i = 0; i < NUM_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, blacklist, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
// Verify blacklisted service never picked
CHECK(out[0].u64 != 200);
pickCounts[out[0].u64]++;
}
// Verify FD 2 was never picked
CHECK(pickCounts.count(200) == 0);
// Verify FD 1 and FD 3 services were picked
CHECK(pickCounts.count(100) > 0);
CHECK(pickCounts.count(101) > 0);
CHECK(pickCounts.count(300) > 0);
CHECK(pickCounts.count(301) > 0);
// Count picks by FD
int fd1Picks = pickCounts[100] + pickCounts[101];
int fd3Picks = pickCounts[300] + pickCounts[301];
// FD1 (20MB) vs FD3 (40MB) => expect roughly 1:2 ratio
// FD1 should get ~2000 picks, FD3 should get ~4000 picks
// Allow 20% deviation
CHECK(fd1Picks > 1600);
CHECK(fd1Picks < 2400);
CHECK(fd3Picks > 3200);
CHECK(fd3Picks < 4800);
// Within each FD, services should be roughly equal
// FD1: 100 and 101 should each get ~1000 (allow 25% deviation)
CHECK(pickCounts[100] > 750);
CHECK(pickCounts[100] < 1250);
CHECK(pickCounts[101] > 750);
CHECK(pickCounts[101] < 1250);
// FD3: 300 and 301 should each get ~2000 (allow 20% deviation)
CHECK(pickCounts[300] > 1600);
CHECK(pickCounts[300] < 2400);
CHECK(pickCounts[301] > 1600);
CHECK(pickCounts[301] < 2400);
}
TEST_CASE("picker multi-service weighted distribution") {
auto p = makePicker();
// Create 20 failure domains, each with 20-25 services with varying weights
// This creates a large heterogeneous pool to test distribution
std::vector<BlockServiceInfoShort> catalog;
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint8_t, uint64_t> fdTotalWeights; // FD byte -> total weight
std::unordered_map<uint64_t, uint64_t> serviceWeights; // service id -> weight
uint64_t totalSystemWeight = 0;
uint64_t serviceId = 10000;
for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) {
// Each FD gets 20-25 services
int numServices = 20 + (fdByte % 6);
uint64_t fdWeight = 0;
for (int svcIdx = 0; svcIdx < numServices; ++svcIdx) {
// Vary weights within FD: some small (5MB), some medium (10-20MB), some large (30-50MB)
uint64_t weight;
if (svcIdx % 5 == 0) {
weight = 5000000; // 5MB - small
} else if (svcIdx % 3 == 0) {
weight = (10 + (svcIdx % 10)) * 1000000; // 10-20MB - medium
} else {
weight = (30 + (svcIdx % 20)) * 1000000; // 30-50MB - large
}
catalog.push_back(bs(serviceId, 1, FLASH_STORAGE, fdByte));
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fdByte).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.capacityBytes = 100000000;
entry.availableBytes = weight;
entry.blocks = 0;
entry.hasFiles = false;
cache[serviceId] = entry;
serviceWeights[serviceId] = weight;
fdWeight += weight;
totalSystemWeight += weight;
serviceId++;
}
fdTotalWeights[fdByte] = fdWeight;
}
p.update(cache);
// Test picking different numbers of services: 1, 3, 5, 10, 15
for (int needed : {1, 3, 5, 10, 15}) {
std::unordered_map<uint64_t, int> pickCounts;
std::unordered_map<uint8_t, int> fdPickCounts;
// Run many iterations to gather statistics - 1M for good distribution
const int NUM_ITERATIONS = 1000000;
for (int i = 0; i < NUM_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, needed, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == needed);
// Count each picked service
for (const auto& svc : out) {
pickCounts[svc.u64]++;
// Find which FD this service belongs to
for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) {
if (cache[svc.u64].failureDomain == fdWith(fdByte).name.data) {
fdPickCounts[fdByte]++;
break;
}
}
}
}
// Verify that picks are distributed according to weights
uint64_t totalPicks = (uint64_t)NUM_ITERATIONS * needed;
// Check failure domain distribution. Marginal pick probability under
// sequential weighted-without-replacement isn't exactly proportional
// to weight when needed/numFDs is large (heavy FDs saturate towards
// P=1), so widen the tolerance for larger picks.
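// E.g. with FD weights {9, 1} and needed=2, both FDs are always picked, so the
// marginal probability is 1.0 for each despite the 9:1 weight ratio.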
for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) {
double expectedRatio = (double)fdTotalWeights[fdByte] / totalSystemWeight;
double expectedPicks = totalPicks * expectedRatio;
int actualPicks = fdPickCounts[fdByte];
double tolerance = (needed >= 10) ? 0.10 : 0.05;
if (expectedPicks > 1000) {
CHECK(actualPicks > expectedPicks * (1.0 - tolerance));
CHECK(actualPicks < expectedPicks * (1.0 + tolerance));
}
}
// Check individual service distribution for services with significant weight
// Focus on services that should get a reasonable number of picks
for (const auto& [svcId, weight] : serviceWeights) {
double expectedRatio = (double)weight / totalSystemWeight;
double expectedPicks = totalPicks * expectedRatio;
int actualPicks = pickCounts[svcId];
// Only validate services we expect to see picked frequently enough
if (expectedPicks > 500) {
// Allow 15% deviation for individual services (tighter below for larger picks,
// where total samples reach 10M+)
double tolerance = 0.15;
if (needed >= 10) {
tolerance = 0.1;
}
CHECK(actualPicks > expectedPicks * (1.0 - tolerance));
CHECK(actualPicks < expectedPicks * (1.0 + tolerance));
}
}
}
// Verify that the total number of unique services picked is reasonable
// We should see most services picked at least once across all iterations
std::unordered_set<uint64_t> allPickedServices;
const int FINAL_ITERATIONS = 1000000;
for (int i = 0; i < FINAL_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 10, {}, out);
REQUIRE(err == TernError::NO_ERROR);
for (const auto& svc : out) {
allPickedServices.insert(svc.u64);
}
}
// With weighted selection, we should see a good portion of services
// (not necessarily all, since low-weight services may be picked rarely)
CHECK(allPickedServices.size() > serviceWeights.size() * 0.9);
}
TEST_CASE("picker never picks same failure domain twice") {
// Test various configurations: different FD counts, service counts, and needed values
struct TestConfig {
int numFDs;
int servicesPerFD;
int needed;
uint64_t baseWeight;
};
std::vector<TestConfig> configs = {
{3, 1, 2, 1000000}, // minimal: 3 FDs, 1 service each, pick 2
{3, 1, 3, 1000000}, // pick all 3
{5, 3, 4, 1000000}, // 5 FDs with 3 services each, pick 4
{15, 1, 15, 1000000}, // pick every one of 15 FDs
{20, 5, 10, 1000000}, // 20 FDs, pick 10
{3, 1, 2, 1}, // tiny weights (post-scaling edge case)
{52, 1, 15, 100}, // many FDs, small weights
};
for (const auto& cfg : configs) {
auto p = makePicker();
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFD; // service id -> FD byte
uint64_t id = 1;
for (int fd = 1; fd <= cfg.numFDs; fd++) {
for (int s = 0; s < cfg.servicesPerFD; s++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fd).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = cfg.baseWeight;
entry.capacityBytes = cfg.baseWeight * 10;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFD[id] = fd;
id++;
}
}
p.update(cache);
const int NUM_ITERATIONS = 10000;
for (int i = 0; i < NUM_ITERATIONS; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, cfg.needed, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == cfg.needed);
std::unordered_set<uint8_t> pickedFDs;
for (const auto& svc : out) {
uint8_t fd = serviceToFD[svc.u64];
CHECK(pickedFDs.insert(fd).second); // fails if FD already picked
}
}
}
}
TEST_CASE("picker extreme weight disparity uniform at max load") {
// Scenario: 200 existing FDs (nearly full, 5GB/disk × 100 disks = 500GB/FD)
// plus 5 brand-new FDs (empty, 20TB/disk × 100 disks = 2000TB/FD).
// Weight ratio per FD: 2000TB / 500GB = 4000x.
// At max load (default), ratio=1.0 clamps all FDs to minFdWeight, so distribution
// should be near-uniform across all FDs regardless of capacity disparity.
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
const uint64_t EXISTING_BYTES_PER_DISK = 5000000000ULL; // 5 GB
const uint64_t NEW_BYTES_PER_DISK = 20000000000000ULL; // 20 TB
const int DISKS_PER_FD = 100;
const int EXISTING_FDS = 200;
const int NEW_FDS = 5;
const int TOTAL_FDS = EXISTING_FDS + NEW_FDS;
const int NEEDED = 15;
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
uint64_t id = 1;
for (int fd = 1; fd <= EXISTING_FDS; fd++) {
for (int d = 0; d < DISKS_PER_FD; d++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fd).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = EXISTING_BYTES_PER_DISK;
entry.capacityBytes = EXISTING_BYTES_PER_DISK * 10;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = fd;
id++;
}
}
for (int fd = 0; fd < NEW_FDS; fd++) {
uint8_t fdByte = EXISTING_FDS + 1 + fd;
for (int d = 0; d < DISKS_PER_FD; d++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fdByte).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = NEW_BYTES_PER_DISK;
entry.capacityBytes = NEW_BYTES_PER_DISK;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = fdByte;
id++;
}
}
p.update(cache);
const int NUM_ITERATIONS = 100000;
std::unordered_map<uint8_t, int> fdPickCounts;
for (int i = 0; i < NUM_ITERATIONS; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, NEEDED, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == NEEDED);
for (const auto& svc : out) {
fdPickCounts[serviceToFd[svc.u64]]++;
}
}
uint64_t totalPicks = (uint64_t)NUM_ITERATIONS * NEEDED;
double expectedPerFd = (double)totalPicks / TOTAL_FDS;
for (const auto& [fd, count] : fdPickCounts) {
double deviation = std::abs((double)count - expectedPerFd) / expectedPerFd;
CHECK_MESSAGE(deviation < 0.15,
"FD ", (int)fd, " got ", count, " picks, expected ~", (int)expectedPerFd,
" (deviation ", deviation * 100, "%)");
}
}
TEST_CASE("picker throughput adaptation increases ratio at low load") {
// Low observed throughput → high effectiveMaxRatio → capacity-proportional picks.
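// Pin the test clock so the 2-second advance below drives the picker's throughput recalculation.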
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
const int EXISTING_FDS = 4;
const int NEW_FDS = 1;
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
uint64_t id = 1;
for (int fd = 1; fd <= EXISTING_FDS; fd++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fd).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = 1'000'000'000ULL; // 1GB
entry.capacityBytes = 10'000'000'000ULL;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = fd;
id++;
}
{
uint8_t fdByte = EXISTING_FDS + 1;
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fdByte).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = 100'000'000'000ULL; // 100GB (100x more)
entry.capacityBytes = 100'000'000'000ULL;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = fdByte;
id++;
}
p.update(cache);
// Simulate low throughput: 1000 picks of 100 bytes each over 2 seconds.
// Per-shard total = 100KB. Cluster-wide = 25.6MB in 2s = 12.8MB/s.
// Max cluster throughput = 600MB/s * 5 drives = 3GB/s.
// Ratio = 3GB/s / 12.8MB/s ≈ 234 → effectively uncapped (raw weight ratio is 100x).
for (int i = 0; i < 1000; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 100);
}
_setCurrentTime(ternNow() + 2_sec);
p.update(cache);
std::unordered_map<uint8_t, int> fdPickCounts;
const int NUM_ITERATIONS = 50000;
for (int i = 0; i < NUM_ITERATIONS; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
fdPickCounts[serviceToFd[out[0].u64]]++;
}
double avgExisting = 0;
for (int fd = 1; fd <= EXISTING_FDS; fd++) avgExisting += fdPickCounts[fd];
avgExisting /= EXISTING_FDS;
double avgNew = fdPickCounts[EXISTING_FDS + 1];
double lowLoadRatio = avgNew / avgExisting;
CHECK(lowLoadRatio > 80.0);
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker clamping ratio varies with load") {
// 2 FDs: FD1 = 1GB per drive (10 drives), FD2 = 100GB per drive (10 drives). 100x weight difference.
// At max load (initial): ratio=1.0 → all disks clamped to minSvcWeight → near-uniform.
// At low load (after recalc): high ratio → capacity-proportional → FD2 gets ~100x more.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
uint64_t id = 1;
for (int d = 0; d < 10; d++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(1).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = 1'000'000'000ULL; // 1GB per drive
entry.capacityBytes = 10'000'000'000ULL;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = 1;
id++;
}
for (int d = 0; d < 10; d++) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(2).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = 100'000'000'000ULL; // 100GB per drive
entry.capacityBytes = 100'000'000'000ULL;
entry.blocks = 0;
entry.hasFiles = false;
cache[id] = entry;
serviceToFd[id] = 2;
id++;
}
p.update(cache);
auto measureDistribution = [&]() {
std::unordered_map<uint8_t, int> fdPicks;
const int N = 50000;
for (int i = 0; i < N; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
fdPicks[serviceToFd[out[0].u64]]++;
}
return std::make_pair(fdPicks[1], fdPicks[2]);
};
// Phase 1: max load (initial state) — distribution should be near-uniform
auto [fd1High, fd2High] = measureDistribution();
double highLoadRatio = (double)fd2High / fd1High;
CHECK(highLoadRatio > 0.8);
CHECK(highLoadRatio < 1.25);
// Phase 2: simulate intermediate load targeting ratio ≈ 5.
// Max cluster throughput = 600MB/s * 20 drives = 12GB/s.
// For ratio=5: cluster throughput = 12GB/s / 5 = 2.4GB/s.
// Per-shard over 2s = 2.4GB/s * 2 / 256 ≈ 18.75MB → 1000 picks of ~18750 bytes.
// With ratio=5: svcCap = minSvc(1GB) * 5 = 5GB per disk.
// FD1 disks stay at 1GB, FD2 disks clamped 100GB→5GB → FD weights 10GB vs 50GB → ~5x.
for (int i = 0; i < 1000; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 18750);
}
_setCurrentTime(ternNow() + 2_sec);
p.update(cache);
auto [fd1Mid, fd2Mid] = measureDistribution();
double midLoadRatio = (double)fd2Mid / fd1Mid;
// Intermediate load: FD2 gets more but not fully proportional (100x)
CHECK(midLoadRatio > 3.0);
CHECK(midLoadRatio < 8.0);
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker insufficient live FDs returns error fast") {
// If the blacklist leaves fewer live FDs than `needed`, the pick cannot be satisfied.
auto p = makePicker();
std::vector<BlockServiceInfoShort> catalog{
bs(1, 1, FLASH_STORAGE, 1),
bs(2, 1, FLASH_STORAGE, 2),
bs(3, 1, FLASH_STORAGE, 3),
bs(4, 1, FLASH_STORAGE, 4),
};
auto cache = makeCatalog(catalog);
p.update(cache);
// Blacklist two whole FDs → only 2 live FDs left, ask for 3.
std::vector<BlacklistEntry> blacklist;
BlacklistEntry b1; b1.failureDomain = fdWith(1); blacklist.push_back(b1);
BlacklistEntry b2; b2.failureDomain = fdWith(2); blacklist.push_back(b2);
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 3, blacklist, out);
CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES);
CHECK(out.size() == 0);
}
TEST_CASE("picker drained lcKey resets throughput stats") {
// If an lcKey loses all of its services, the stale numDrives /
// lastThroughputEstimate must be cleared so the spike path cannot read them.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::vector<BlockServiceInfoShort> catalog{
bs(1, 1, FLASH_STORAGE, 1),
bs(2, 1, FLASH_STORAGE, 2),
};
p.update(makeCatalog(catalog));
// Warm the throughput estimate with a few picks.
for (int i = 0; i < 10; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 1000);
}
_setCurrentTime(ternNow() + 2_sec);
p.update(makeCatalog(catalog));
// Now drop all services for this lcKey.
p.update({});
auto stats = p.getStats();
for (const auto& ls : stats.locStorage) {
if ((ls.key & 0xFF) == FLASH_STORAGE) {
CHECK(ls.writableBlockServices == 0);
CHECK(ls.writableFailureDomains == 0);
CHECK(ls.throughputEstimate == 0);
}
}
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker stale service id from dropped lcKey does not corrupt weights") {
// A service that existed under key K and was then evicted, but still
// appears in a blacklist, must not subtract bogus weight from the
// current fdWeights.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
// Phase 1: service 99 lives in location=1.
std::vector<BlockServiceInfoShort> catalog1{
bs(50, 1, FLASH_STORAGE, 1),
bs(51, 1, FLASH_STORAGE, 2),
bs(99, 1, FLASH_STORAGE, 3),
};
p.update(makeCatalog(catalog1));
// Trigger spike recalc so serviceToFdInfo for key=(1, FLASH) is rebuilt.
for (int i = 0; i < 100; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 1'000'000);
}
_setCurrentTime(ternNow() + 2_sec);
// Phase 2: drop service 99 (remove its FD entirely).
std::vector<BlockServiceInfoShort> catalog2{
bs(50, 1, FLASH_STORAGE, 1),
bs(51, 1, FLASH_STORAGE, 2),
};
p.update(makeCatalog(catalog2));
// Pick with a blacklist still referencing 99. Should succeed with no
// corruption of the live (50, 51) FDs' weights.
std::vector<BlacklistEntry> bl;
BlacklistEntry e; e.blockService = BlockServiceId(99); bl.push_back(e);
for (int i = 0; i < 50; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, bl, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
CHECK(out[0].u64 != 99);
}
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker intra-FD clamp at max load spreads within heterogeneous FD") {
// Single FD with 10 disks: 1 fresh (2TB) + 9 near-full (10GB each).
// Initial throughput estimate = maxDriveThroughput * numDrives → ratio = 1.0 (max load).
// Phase 0 should clamp the fresh disk down to 10GB; each disk then receives ~1/10 of picks.
// Without Phase 0, the fresh disk would receive ~99.5% of picks (2000 / 2090 of in-FD weight).
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint64_t avail) {
BlockServiceCache e;
e.locationId = 1;
e.storageClass = FLASH_STORAGE;
e.failureDomain = fdWith(1).name.data;
e.flags = BlockServiceFlags::EMPTY;
e.availableBytes = avail;
e.capacityBytes = avail * 10;
e.blocks = 0;
e.hasFiles = false;
return e;
};
const uint64_t FULL_AVAIL = 10'000'000'000ULL; // 10 GB (near-full disk)
const uint64_t FRESH_AVAIL = 2'000'000'000'000ULL; // 2 TB (fresh disk)
const uint64_t FRESH_ID = 1;
cache[FRESH_ID] = mk(FRESH_AVAIL);
for (uint64_t id = 2; id <= 10; id++) cache[id] = mk(FULL_AVAIL);
p.update(cache);
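// Reset pick counters so getStats() below reflects only this measurement loop.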
p.resetStats();
const int N = 200000;
for (int i = 0; i < N; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
}
auto stats = p.getStats();
uint64_t freshPicks = 0;
uint64_t totalPicks = 0;
for (const auto& b : stats.blockServices) {
totalPicks += b.picks;
if (b.blockServiceId == FRESH_ID) freshPicks = b.picks;
}
REQUIRE(totalPicks == (uint64_t)N);
double freshShare = (double)freshPicks / totalPicks;
CHECK(freshShare > 0.08);
CHECK(freshShare < 0.12);
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker intra-FD clamp no-op at low load preserves capacity-proportional picks") {
// Same topology as the max-load case. Drive sustained low throughput so ratio
// becomes large; Phase 0 becomes a no-op and fresh disk dominates in-FD picks
// (this is the desirable "drain to fresh capacity when under-utilized" behaviour).
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint64_t avail) {
BlockServiceCache e;
e.locationId = 1;
e.storageClass = FLASH_STORAGE;
e.failureDomain = fdWith(1).name.data;
e.flags = BlockServiceFlags::EMPTY;
e.availableBytes = avail;
e.capacityBytes = avail * 10;
e.blocks = 0;
e.hasFiles = false;
return e;
};
const uint64_t FULL_AVAIL = 10'000'000'000ULL;
const uint64_t FRESH_AVAIL = 2'000'000'000'000ULL;
const uint64_t FRESH_ID = 1;
cache[FRESH_ID] = mk(FRESH_AVAIL);
for (uint64_t id = 2; id <= 10; id++) cache[id] = mk(FULL_AVAIL);
p.update(cache);
// Simulate low throughput: 1000 tiny picks over 2s. ratio ≈ (600MB × 10) / ~12.8MB ≈ 469.
// svcCap = 10GB × 469 ≈ 4.7TB, well above fresh disk's 2TB → Phase 0 is a no-op.
for (int i = 0; i < 1000; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 100);
}
_setCurrentTime(ternNow() + 2_sec);
p.update(cache);
p.resetStats();
const int N = 50000;
for (int i = 0; i < N; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
}
auto stats = p.getStats();
uint64_t freshPicks = 0;
uint64_t totalPicks = 0;
for (const auto& b : stats.blockServices) {
totalPicks += b.picks;
if (b.blockServiceId == FRESH_ID) freshPicks = b.picks;
}
REQUIRE(totalPicks == (uint64_t)N);
double freshShare = (double)freshPicks / totalPicks;
// Fresh disk raw weight share = 2000 / 2090 ≈ 0.957.
CHECK(freshShare > 0.9);
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker intra-FD clamp preserves consistency with service blacklist") {
// Verify intra-FD-clamped state stays consistent with the blacklist path:
// blacklisting a service that was clamped must still produce a valid pick
// without corrupting remaining FD weights.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint8_t fd, uint64_t avail) {
BlockServiceCache e;
e.locationId = 1;
e.storageClass = FLASH_STORAGE;
e.failureDomain = fdWith(fd).name.data;
e.flags = BlockServiceFlags::EMPTY;
e.availableBytes = avail;
e.capacityBytes = avail * 10;
e.blocks = 0;
e.hasFiles = false;
return e;
};
// FD1 heterogeneous: one 1TB fresh + two 10GB full — the fresh one gets clamped.
cache[1] = mk(1, 1'000'000'000'000ULL);
cache[2] = mk(1, 10'000'000'000ULL);
cache[3] = mk(1, 10'000'000'000ULL);
// FD2, FD3 uniform 10GB each.
cache[4] = mk(2, 10'000'000'000ULL);
cache[5] = mk(3, 10'000'000'000ULL);
p.update(cache);
std::vector<BlacklistEntry> bl;
BlacklistEntry e; e.blockService = BlockServiceId(1); bl.push_back(e);
for (int i = 0; i < 500; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 3, bl, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 3);
for (const auto& id : out) CHECK(id.u64 != 1);
}
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker global cap equalises heterogeneous FDs at max load") {
// Three FDs with very different per-disk capacity. Global cap clamps every
// disk to the global minimum (10GB), so every FD ends at 10 disks × 10GB =
// 100GB. maxWeight == minWeight, and picking 3 from 3 FDs always covers all.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
auto addSvc = [&](uint64_t id, uint8_t fdByte, uint64_t avail) {
BlockServiceCache e;
e.locationId = 1;
e.storageClass = FLASH_STORAGE;
e.failureDomain = fdWith(fdByte).name.data;
e.flags = BlockServiceFlags::EMPTY;
e.availableBytes = avail;
e.capacityBytes = avail * 10;
e.blocks = 0;
e.hasFiles = false;
cache[id] = e;
serviceToFd[id] = fdByte;
};
addSvc(1, 1, 2'000'000'000'000ULL);
for (uint64_t id = 2; id <= 10; id++) addSvc(id, 1, 10'000'000'000ULL);
for (uint64_t id = 11; id <= 20; id++) addSvc(id, 2, 100'000'000'000ULL);
for (uint64_t id = 21; id <= 30; id++) addSvc(id, 3, 100'000'000'000ULL);
p.update(cache);
auto stats = p.getStats();
bool saw = false;
for (const auto& ls : stats.locStorage) {
if ((ls.key & 0xFF) == FLASH_STORAGE) {
saw = true;
CHECK(ls.maxWeight == ls.minWeight);
}
}
CHECK(saw);
const int N = 60000;
std::unordered_map<uint8_t, int> fdPicks;
for (int i = 0; i < N; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 3, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 3);
std::unordered_set<uint8_t> seen;
for (const auto& id : out) {
uint8_t fd = serviceToFd[id.u64];
CHECK(seen.insert(fd).second);
fdPicks[fd]++;
}
}
// With 3 FDs and needed=3 and all equal weight, each FD must be picked every time.
for (uint8_t fd = 1; fd <= 3; fd++) CHECK(fdPicks[fd] == N);
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker uneven disk count per FD spreads per-disk load at max load") {
// Two FDs with the same total bytes but very different disk counts:
// FD1: 2 disks × 1TB = 2TB total, 2 disks
// FD2: 50 disks × 40GB = 2TB total, 50 disks
// Without a global per-disk cap, equal FD bytes make stride/weighted picks
// hit each FD ~50% of the time, hammering FD1's two disks at ~25% each
// while FD2's disks see ~1% each — a 25× per-disk imbalance.
// With the global cap at max load (ratio=1) every disk is capped to the
// global minimum (40GB), so FD weight becomes proportional to disk count
// and per-disk pick rate equalises across all 52 disks.
_setCurrentTime(ternNow());
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
auto addSvc = [&](uint64_t id, uint8_t fdByte, uint64_t avail) {
BlockServiceCache e;
e.locationId = 1;
e.storageClass = FLASH_STORAGE;
e.failureDomain = fdWith(fdByte).name.data;
e.flags = BlockServiceFlags::EMPTY;
e.availableBytes = avail;
e.capacityBytes = avail * 10;
e.blocks = 0;
e.hasFiles = false;
cache[id] = e;
serviceToFd[id] = fdByte;
};
const uint64_t FD1_AVAIL = 1'000'000'000'000ULL; // 1 TB per disk
const uint64_t FD2_AVAIL = 40'000'000'000ULL; // 40 GB per disk
uint64_t id = 1;
for (int i = 0; i < 2; i++) addSvc(id++, 1, FD1_AVAIL);
for (int i = 0; i < 50; i++) addSvc(id++, 2, FD2_AVAIL);
p.update(cache);
p.resetStats();
const int N = 200000;
for (int i = 0; i < N; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
}
auto stats = p.getStats();
uint64_t totalDiskPicks = 0;
uint64_t maxDiskPicks = 0;
uint64_t minDiskPicks = UINT64_MAX;
for (const auto& b : stats.blockServices) {
totalDiskPicks += b.picks;
maxDiskPicks = std::max(maxDiskPicks, b.picks);
minDiskPicks = std::min(minDiskPicks, b.picks);
}
REQUIRE(totalDiskPicks == (uint64_t)N);
// Per-disk pick rate should be uniform across all 52 disks.
double expectedPerDisk = (double)N / 52.0;
double maxDeviation = std::max(
std::abs((double)maxDiskPicks - expectedPerDisk) / expectedPerDisk,
std::abs((double)minDiskPicks - expectedPerDisk) / expectedPerDisk);
CHECK(maxDeviation < 0.20);
// Sanity: max-to-min ratio should be near 1 — emphatically not 25x.
CHECK((double)maxDiskPicks / minDiskPicks < 1.5);
_setCurrentTime(TernTime(0));
}