// Copyright 2025 XTX Markets Technologies Limited // // SPDX-License-Identifier: GPL-2.0-or-later #include #include #include #include #include "BlockServicePicker.hpp" #include "BlockServicesCacheDB.hpp" #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include "doctest.h" static Logger testLogger(LogLevel::LOG_ERROR, STDERR_FILENO, false, false); static std::shared_ptr testXmon; static BlockServicePicker makePicker(Duration writableDelay = 0_sec, uint64_t hddDriveThroughput = 0, uint64_t flashDriveThroughput = 0, uint64_t minSpaceRequiredForWrite = 0) { return BlockServicePicker(testLogger, testXmon, writableDelay, hddDriveThroughput, flashDriveThroughput, minSpaceRequiredForWrite); } static FailureDomain fdWith(uint8_t v) { FailureDomain fd; for (int i = 0; i < 16; ++i) fd.name.data[i] = v; return fd; } static BlockServiceInfoShort bs(uint64_t id, uint8_t loc, uint8_t sc, uint8_t fdByte) { BlockServiceInfoShort x; x.id = BlockServiceId(id); x.locationId = loc; x.storageClass = sc; x.failureDomain = fdWith(fdByte); return x; } static std::unordered_map makeCatalog(const std::vector& services) { std::unordered_map cache; for (const auto& svc : services) { BlockServiceCache entry; entry.locationId = svc.locationId; entry.storageClass = svc.storageClass; entry.failureDomain = svc.failureDomain.name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = 1000000; entry.capacityBytes = 10000000; entry.blocks = 0; entry.hasFiles = false; cache[svc.id.u64] = entry; } return cache; } TEST_CASE("picker basic selection") { auto p = makePicker(); std::vector catalog{ bs(1, 1, FLASH_STORAGE, 1), bs(2, 1, FLASH_STORAGE, 2), bs(3, 1, FLASH_STORAGE, 3), bs(4, 2, HDD_STORAGE, 4) }; auto cache = makeCatalog(catalog); p.update(cache); std::vector out; auto err = p.pick(1, FLASH_STORAGE, 2, {}, out); CHECK(err == TernError::NO_ERROR); CHECK(out.size() == 2); CHECK(out[0] != out[1]); } TEST_CASE("picker blacklist by id and failure domain") { auto p = makePicker(); std::vector catalog{ bs(10, 1, FLASH_STORAGE, 7), bs(11, 1, FLASH_STORAGE, 8), bs(12, 1, FLASH_STORAGE, 9) }; auto cache = makeCatalog(catalog); p.update(cache); std::vector bl; BlacklistEntry e1; e1.blockService = BlockServiceId(10); bl.push_back(e1); BlacklistEntry e2; e2.failureDomain = fdWith(8); bl.push_back(e2); std::vector out; auto err = p.pick(1, FLASH_STORAGE, 2, bl, out); // Only bs 12 remains available CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES); CHECK(out.size() == 0); } TEST_CASE("picker insufficient candidates") { auto p = makePicker(); std::vector catalog{ bs(20, 1, HDD_STORAGE, 1) }; auto cache = makeCatalog(catalog); p.update(cache); std::vector out; auto err = p.pick(1, HDD_STORAGE, 2, {}, out); CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES); CHECK(out.size() == 0); } TEST_CASE("picker concurrency update while picks") { auto p = makePicker(); std::atomic stop{false}; // Start with many candidates std::vector catalog; for (uint64_t i = 0; i < 64; ++i) { catalog.push_back(bs(100+i, 1, FLASH_STORAGE, (uint8_t)i)); } auto cache = makeCatalog(catalog); p.update(cache); std::thread t1([&]{ std::vector out; while (!stop.load()) { auto err = p.pick(1, FLASH_STORAGE, 8, {}, out); CHECK(err == TernError::NO_ERROR); CHECK(out.size() == 8); } }); // Rebuild state repeatedly for (int round = 0; round < 100; ++round) { std::vector cat2; for (uint64_t i = 0; i < 64; ++i) { cat2.push_back(bs(200+i+round, 1, FLASH_STORAGE, (uint8_t)i)); } auto cache2 = makeCatalog(cat2); p.update(cache2); } stop = true; t1.join(); } TEST_CASE("picker weighted distribution") { auto p = makePicker(); // Create services with different weights // Service 1000: 10 MB available (weight 10M) // Service 2000: 20 MB available (weight 20M) // Service 3000: 30 MB available (weight 30M) // Service 4000: 40 MB available (weight 40M) // Total: 100M, so expected ratios are 10%, 20%, 30%, 40% std::vector catalog{ bs(1000, 1, FLASH_STORAGE, 1), bs(2000, 1, FLASH_STORAGE, 2), bs(3000, 1, FLASH_STORAGE, 3), bs(4000, 1, FLASH_STORAGE, 4) }; std::unordered_map cache; for (const auto& svc : catalog) { BlockServiceCache entry; entry.locationId = svc.locationId; entry.storageClass = svc.storageClass; entry.failureDomain = svc.failureDomain.name.data; entry.flags = BlockServiceFlags::EMPTY; // writable entry.capacityBytes = 100000000; entry.blocks = 0; entry.hasFiles = false; // Set different availableBytes for each service to create weights if (svc.id.u64 == 1000) entry.availableBytes = 10000000; else if (svc.id.u64 == 2000) entry.availableBytes = 20000000; else if (svc.id.u64 == 3000) entry.availableBytes = 30000000; else if (svc.id.u64 == 4000) entry.availableBytes = 40000000; cache[svc.id.u64] = entry; } p.update(cache); // Perform many picks to measure distribution const int NUM_ITERATIONS = 10000; std::unordered_map pickCounts; for (int i = 0; i < NUM_ITERATIONS; ++i) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); pickCounts[out[0].u64]++; } // Check that all services were picked CHECK(pickCounts.size() == 4); CHECK(pickCounts.count(1000) > 0); CHECK(pickCounts.count(2000) > 0); CHECK(pickCounts.count(3000) > 0); CHECK(pickCounts.count(4000) > 0); // Verify distribution is roughly proportional to weights // Expected: 1000->1000, 2000->2000, 3000->3000, 4000->4000 // Allow 20% deviation from expected CHECK(pickCounts[1000] > 800); CHECK(pickCounts[1000] < 1200); CHECK(pickCounts[2000] > 1600); CHECK(pickCounts[2000] < 2400); CHECK(pickCounts[3000] > 2400); CHECK(pickCounts[3000] < 3600); CHECK(pickCounts[4000] > 3200); CHECK(pickCounts[4000] < 4800); } TEST_CASE("picker blacklist enforcement") { auto p = makePicker(); // Create 6 services in 6 different failure domains std::vector catalog{ bs(1001, 1, FLASH_STORAGE, 1), // FD 1 bs(1002, 1, FLASH_STORAGE, 2), // FD 2 - blacklist this FD bs(1003, 1, FLASH_STORAGE, 3), // FD 3 bs(1004, 1, FLASH_STORAGE, 4), // FD 4 - blacklist service 1004 bs(1005, 1, FLASH_STORAGE, 5), // FD 5 bs(1006, 1, FLASH_STORAGE, 6) // FD 6 }; auto cache = makeCatalog(catalog); p.update(cache); // Blacklist: entire FD 2, and service 1004 from FD 4 std::vector blacklist; BlacklistEntry fdBlacklist; fdBlacklist.failureDomain = fdWith(2); blacklist.push_back(fdBlacklist); BlacklistEntry serviceBlacklist; serviceBlacklist.blockService = BlockServiceId(1004); blacklist.push_back(serviceBlacklist); // Perform many picks to verify blacklisted items never appear const int NUM_ITERATIONS = 5000; std::unordered_set pickedServices; for (int i = 0; i < NUM_ITERATIONS; ++i) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, blacklist, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); pickedServices.insert(out[0].u64); // Verify blacklisted services never picked CHECK(out[0].u64 != 1002); // FD 2 is blacklisted CHECK(out[0].u64 != 1004); // Service 1004 is blacklisted } // Verify only valid services were picked CHECK(pickedServices.count(1001) > 0); // Should be picked CHECK(pickedServices.count(1002) == 0); // Blacklisted FD CHECK(pickedServices.count(1003) > 0); // Should be picked CHECK(pickedServices.count(1004) == 0); // Blacklisted service CHECK(pickedServices.count(1005) > 0); // Should be picked CHECK(pickedServices.count(1006) > 0); // Should be picked } TEST_CASE("picker weighted distribution with blacklist") { auto p = makePicker(); // Create services with varying weights across multiple failure domains // FD 1: service 100 (10MB), service 101 (10MB) - total 20MB // FD 2: service 200 (40MB) - BLACKLIST THIS // FD 3: service 300 (20MB), service 301 (20MB) - total 40MB // Without blacklist: FD1=20MB (20%), FD2=40MB (40%), FD3=40MB (40%) // With blacklist: FD1=20MB (33%), FD3=40MB (67%) std::vector catalog{ bs(100, 1, FLASH_STORAGE, 1), bs(101, 1, FLASH_STORAGE, 1), bs(200, 1, FLASH_STORAGE, 2), bs(300, 1, FLASH_STORAGE, 3), bs(301, 1, FLASH_STORAGE, 3) }; std::unordered_map cache; for (const auto& svc : catalog) { BlockServiceCache entry; entry.locationId = svc.locationId; entry.storageClass = svc.storageClass; entry.failureDomain = svc.failureDomain.name.data; entry.flags = BlockServiceFlags::EMPTY; entry.capacityBytes = 100000000; entry.blocks = 0; entry.hasFiles = false; if (svc.id.u64 == 100 || svc.id.u64 == 101) entry.availableBytes = 10000000; else if (svc.id.u64 == 200) entry.availableBytes = 40000000; else if (svc.id.u64 == 300 || svc.id.u64 == 301) entry.availableBytes = 20000000; cache[svc.id.u64] = entry; } p.update(cache); // Blacklist entire FD 2 std::vector blacklist; BlacklistEntry fdBlacklist; fdBlacklist.failureDomain = fdWith(2); blacklist.push_back(fdBlacklist); // Perform many picks const int NUM_ITERATIONS = 6000; std::unordered_map pickCounts; for (int i = 0; i < NUM_ITERATIONS; ++i) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, blacklist, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); // Verify blacklisted service never picked CHECK(out[0].u64 != 200); pickCounts[out[0].u64]++; } // Verify FD 2 was never picked CHECK(pickCounts.count(200) == 0); // Verify FD 1 and FD 3 services were picked CHECK(pickCounts.count(100) > 0); CHECK(pickCounts.count(101) > 0); CHECK(pickCounts.count(300) > 0); CHECK(pickCounts.count(301) > 0); // Count picks by FD int fd1Picks = pickCounts[100] + pickCounts[101]; int fd3Picks = pickCounts[300] + pickCounts[301]; // FD1 (20MB) vs FD3 (40MB) => expect roughly 1:2 ratio // FD1 should get ~2000 picks, FD3 should get ~4000 picks // Allow 20% deviation CHECK(fd1Picks > 1600); CHECK(fd1Picks < 2400); CHECK(fd3Picks > 3200); CHECK(fd3Picks < 4800); // Within each FD, services should be roughly equal // FD1: 100 and 101 should each get ~1000 (allow 25% deviation) CHECK(pickCounts[100] > 750); CHECK(pickCounts[100] < 1250); CHECK(pickCounts[101] > 750); CHECK(pickCounts[101] < 1250); // FD3: 300 and 301 should each get ~2000 (allow 20% deviation) CHECK(pickCounts[300] > 1600); CHECK(pickCounts[300] < 2400); CHECK(pickCounts[301] > 1600); CHECK(pickCounts[301] < 2400); } TEST_CASE("picker multi-service weighted distribution") { auto p = makePicker(); // Create 20 failure domains, each with 20-25 services with varying weights // This creates a large heterogeneous pool to test distribution std::vector catalog; std::unordered_map cache; std::unordered_map fdTotalWeights; // FD byte -> total weight std::unordered_map serviceWeights; // service id -> weight uint64_t totalSystemWeight = 0; uint64_t serviceId = 10000; for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) { // Each FD gets 20-25 services int numServices = 20 + (fdByte % 6); uint64_t fdWeight = 0; for (int svcIdx = 0; svcIdx < numServices; ++svcIdx) { // Vary weights within FD: some small (5MB), some medium (10-20MB), some large (30-50MB) uint64_t weight; if (svcIdx % 5 == 0) { weight = 5000000; // 5MB - small } else if (svcIdx % 3 == 0) { weight = (10 + (svcIdx % 10)) * 1000000; // 10-20MB - medium } else { weight = (30 + (svcIdx % 20)) * 1000000; // 30-50MB - large } catalog.push_back(bs(serviceId, 1, FLASH_STORAGE, fdByte)); BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fdByte).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.capacityBytes = 100000000; entry.availableBytes = weight; entry.blocks = 0; entry.hasFiles = false; cache[serviceId] = entry; serviceWeights[serviceId] = weight; fdWeight += weight; totalSystemWeight += weight; serviceId++; } fdTotalWeights[fdByte] = fdWeight; } p.update(cache); // Test picking different numbers of services: 1, 3, 5, 10, 15 for (int needed : {1, 3, 5, 10, 15}) { std::unordered_map pickCounts; std::unordered_map fdPickCounts; // Run many iterations to gather statistics - 1M for good distribution const int NUM_ITERATIONS = 1000000; for (int i = 0; i < NUM_ITERATIONS; ++i) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, needed, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == needed); // Count each picked service for (const auto& svc : out) { pickCounts[svc.u64]++; // Find which FD this service belongs to for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) { if (cache[svc.u64].failureDomain == fdWith(fdByte).name.data) { fdPickCounts[fdByte]++; break; } } } } // Verify that picks are distributed according to weights uint64_t totalPicks = (uint64_t)NUM_ITERATIONS * needed; // Check failure domain distribution. Marginal pick probability under // sequential weighted-without-replacement isn't exactly proportional // to weight when needed/numFDs is large (heavy FDs saturate towards // P=1), so widen the tolerance for larger picks. for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) { double expectedRatio = (double)fdTotalWeights[fdByte] / totalSystemWeight; double expectedPicks = totalPicks * expectedRatio; int actualPicks = fdPickCounts[fdByte]; double tolerance = (needed >= 10) ? 0.10 : 0.05; if (expectedPicks > 1000) { CHECK(actualPicks > expectedPicks * (1.0 - tolerance)); CHECK(actualPicks < expectedPicks * (1.0 + tolerance)); } } // Check individual service distribution for services with significant weight // Focus on services that should get a reasonable number of picks for (const auto& [svcId, weight] : serviceWeights) { double expectedRatio = (double)weight / totalSystemWeight; double expectedPicks = totalPicks * expectedRatio; int actualPicks = pickCounts[svcId]; // Only validate services we expect to see picked frequently enough if (expectedPicks > 500) { // Allow 15% deviation for individual services (tight with 10M iterations) double tolerance = 0.15; if (needed >= 10) { tolerance = 0.1; } CHECK(actualPicks > expectedPicks * (1.0 - tolerance)); CHECK(actualPicks < expectedPicks * (1.0 + tolerance)); } } } // Verify that the total number of unique services picked is reasonable // We should see most services picked at least once across all iterations std::unordered_set allPickedServices; const int FINAL_ITERATIONS = 1000000; for (int i = 0; i < FINAL_ITERATIONS; ++i) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 10, {}, out); REQUIRE(err == TernError::NO_ERROR); for (const auto& svc : out) { allPickedServices.insert(svc.u64); } } // With weighted selection, we should see a good portion of services // (not necessarily all, since low-weight services may be picked rarely) CHECK(allPickedServices.size() > serviceWeights.size() * 0.9); } TEST_CASE("picker never picks same failure domain twice") { // Test various configurations: different FD counts, service counts, and needed values struct TestConfig { int numFDs; int servicesPerFD; int needed; uint64_t baseWeight; }; std::vector configs = { {3, 1, 2, 1000000}, // minimal: 3 FDs, 1 service each, pick 2 {3, 1, 3, 1000000}, // pick all 3 {5, 3, 4, 1000000}, // 5 FDs with 3 services each, pick 4 {15, 1, 15, 1000000}, // pick every one of 15 FDs {20, 5, 10, 1000000}, // 20 FDs, pick 10 {3, 1, 2, 1}, // tiny weights (post-scaling edge case) {52, 1, 15, 100}, // many FDs, small weights }; for (const auto& cfg : configs) { auto p = makePicker(); std::unordered_map cache; std::unordered_map serviceToFD; // service id -> FD byte uint64_t id = 1; for (int fd = 1; fd <= cfg.numFDs; fd++) { for (int s = 0; s < cfg.servicesPerFD; s++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fd).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = cfg.baseWeight; entry.capacityBytes = cfg.baseWeight * 10; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFD[id] = fd; id++; } } p.update(cache); const int NUM_ITERATIONS = 10000; for (int i = 0; i < NUM_ITERATIONS; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, cfg.needed, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == cfg.needed); std::unordered_set pickedFDs; for (const auto& svc : out) { uint8_t fd = serviceToFD[svc.u64]; CHECK(pickedFDs.insert(fd).second); // fails if FD already picked } } } } TEST_CASE("picker extreme weight disparity uniform at max load") { // Scenario: 200 existing FDs (nearly full, 5GB/disk × 100 disks = 500GB/FD) // plus 5 brand-new FDs (empty, 20TB/disk × 100 disks = 2000TB/FD). // Weight ratio per FD: 2000TB / 500GB = 4000x. // At max load (default), ratio=1.0 clamps all FDs to minFdWeight, so distribution // should be near-uniform across all FDs regardless of capacity disparity. auto p = makePicker(0_sec, 600'000'000, 600'000'000); const uint64_t EXISTING_BYTES_PER_DISK = 5000000000ULL; // 5 GB const uint64_t NEW_BYTES_PER_DISK = 20000000000000ULL; // 20 TB const int DISKS_PER_FD = 100; const int EXISTING_FDS = 200; const int NEW_FDS = 5; const int TOTAL_FDS = EXISTING_FDS + NEW_FDS; const int NEEDED = 15; std::unordered_map cache; std::unordered_map serviceToFd; uint64_t id = 1; for (int fd = 1; fd <= EXISTING_FDS; fd++) { for (int d = 0; d < DISKS_PER_FD; d++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fd).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = EXISTING_BYTES_PER_DISK; entry.capacityBytes = EXISTING_BYTES_PER_DISK * 10; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = fd; id++; } } for (int fd = 0; fd < NEW_FDS; fd++) { uint8_t fdByte = EXISTING_FDS + 1 + fd; for (int d = 0; d < DISKS_PER_FD; d++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fdByte).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = NEW_BYTES_PER_DISK; entry.capacityBytes = NEW_BYTES_PER_DISK; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = fdByte; id++; } } p.update(cache); const int NUM_ITERATIONS = 100000; std::unordered_map fdPickCounts; for (int i = 0; i < NUM_ITERATIONS; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, NEEDED, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == NEEDED); for (const auto& svc : out) { fdPickCounts[serviceToFd[svc.u64]]++; } } uint64_t totalPicks = (uint64_t)NUM_ITERATIONS * NEEDED; double expectedPerFd = (double)totalPicks / TOTAL_FDS; for (const auto& [fd, count] : fdPickCounts) { double deviation = std::abs((double)count - expectedPerFd) / expectedPerFd; CHECK_MESSAGE(deviation < 0.15, "FD ", (int)fd, " got ", count, " picks, expected ~", (int)expectedPerFd, " (deviation ", deviation * 100, "%)"); } } TEST_CASE("picker throughput adaptation increases ratio at low load") { // Low observed throughput → high effectiveMaxRatio → capacity-proportional picks. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); const int EXISTING_FDS = 4; const int NEW_FDS = 1; std::unordered_map cache; std::unordered_map serviceToFd; uint64_t id = 1; for (int fd = 1; fd <= EXISTING_FDS; fd++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fd).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = 1'000'000'000ULL; // 1GB entry.capacityBytes = 10'000'000'000ULL; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = fd; id++; } { uint8_t fdByte = EXISTING_FDS + 1; BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(fdByte).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = 100'000'000'000ULL; // 100GB (100x more) entry.capacityBytes = 100'000'000'000ULL; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = fdByte; id++; } p.update(cache); // Simulate low throughput: 1000 picks of 100 bytes each over 2 seconds. // Per-shard total = 100KB. Cluster-wide = 25.6MB in 2s = 12.8MB/s. // Max cluster throughput = 600MB/s * 5 drives = 3GB/s. // Ratio = 3GB/s / 12.8MB/s ≈ 234 → effectively uncapped (raw weight ratio is 100x). for (int i = 0; i < 1000; i++) { std::vector out; p.pick(1, FLASH_STORAGE, 1, {}, out, 100); } _setCurrentTime(ternNow() + 2_sec); p.update(cache); std::unordered_map fdPickCounts; const int NUM_ITERATIONS = 50000; for (int i = 0; i < NUM_ITERATIONS; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); fdPickCounts[serviceToFd[out[0].u64]]++; } double avgExisting = 0; for (int fd = 1; fd <= EXISTING_FDS; fd++) avgExisting += fdPickCounts[fd]; avgExisting /= EXISTING_FDS; double avgNew = fdPickCounts[EXISTING_FDS + 1]; double lowLoadRatio = avgNew / avgExisting; CHECK(lowLoadRatio > 80.0); _setCurrentTime(TernTime()); } TEST_CASE("picker clamping ratio varies with load") { // 2 FDs: FD1 = 1GB (10 drives), FD2 = 100GB (10 drives). 100x weight difference. // At max load (initial): ratio=1.0 → all disks clamped to minSvcWeight → near-uniform. // At low load (after recalc): high ratio → capacity-proportional → FD2 gets ~100x more. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; std::unordered_map serviceToFd; uint64_t id = 1; for (int d = 0; d < 10; d++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(1).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = 1'000'000'000ULL; // 1GB per drive entry.capacityBytes = 10'000'000'000ULL; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = 1; id++; } for (int d = 0; d < 10; d++) { BlockServiceCache entry; entry.locationId = 1; entry.storageClass = FLASH_STORAGE; entry.failureDomain = fdWith(2).name.data; entry.flags = BlockServiceFlags::EMPTY; entry.availableBytes = 100'000'000'000ULL; // 100GB per drive entry.capacityBytes = 100'000'000'000ULL; entry.blocks = 0; entry.hasFiles = false; cache[id] = entry; serviceToFd[id] = 2; id++; } p.update(cache); auto measureDistribution = [&]() { std::unordered_map fdPicks; const int N = 50000; for (int i = 0; i < N; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); fdPicks[serviceToFd[out[0].u64]]++; } return std::make_pair(fdPicks[1], fdPicks[2]); }; // Phase 1: max load (initial state) — distribution should be near-uniform auto [fd1High, fd2High] = measureDistribution(); double highLoadRatio = (double)fd2High / fd1High; CHECK(highLoadRatio > 0.8); CHECK(highLoadRatio < 1.25); // Phase 2: simulate intermediate load targeting ratio ≈ 5. // Max cluster throughput = 600MB/s * 20 drives = 12GB/s. // For ratio=5: cluster throughput = 12GB/s / 5 = 2.4GB/s. // Per-shard over 2s = 2.4GB/s * 2 / 256 ≈ 18.75MB → 1000 picks of ~18750 bytes. // With ratio=5: svcCap = minSvc(1GB) * 5 = 5GB per disk. // FD1 disks stay at 1GB, FD2 disks clamped 100GB→5GB → FD weights 10GB vs 50GB → ~5x. for (int i = 0; i < 1000; i++) { std::vector out; p.pick(1, FLASH_STORAGE, 1, {}, out, 18750); } _setCurrentTime(ternNow() + 2_sec); p.update(cache); auto [fd1Mid, fd2Mid] = measureDistribution(); double midLoadRatio = (double)fd2Mid / fd1Mid; // Intermediate load: FD2 gets more but not fully proportional (100x) CHECK(midLoadRatio > 3.0); CHECK(midLoadRatio < 8.0); _setCurrentTime(TernTime(0)); } TEST_CASE("picker insufficient live FDs returns error fast") { // If blacklist leaves fewer live FDs than `needed`, we cannot satisfy the pick. auto p = makePicker(); std::vector catalog{ bs(1, 1, FLASH_STORAGE, 1), bs(2, 1, FLASH_STORAGE, 2), bs(3, 1, FLASH_STORAGE, 3), bs(4, 1, FLASH_STORAGE, 4), }; auto cache = makeCatalog(catalog); p.update(cache); // Blacklist two whole FDs → only 2 live FDs left, ask for 3. std::vector blacklist; BlacklistEntry b1; b1.failureDomain = fdWith(1); blacklist.push_back(b1); BlacklistEntry b2; b2.failureDomain = fdWith(2); blacklist.push_back(b2); std::vector out; auto err = p.pick(1, FLASH_STORAGE, 3, blacklist, out); CHECK(err == TernError::COULD_NOT_PICK_BLOCK_SERVICES); CHECK(out.size() == 0); } TEST_CASE("picker drained lcKey resets throughput stats") { // if an lcKey loses all services, stale numDrives / // lastThroughputEstimate must be cleared so the spike path can't read them. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::vector catalog{ bs(1, 1, FLASH_STORAGE, 1), bs(2, 1, FLASH_STORAGE, 2), }; p.update(makeCatalog(catalog)); // Warm the throughput estimate with a few picks. for (int i = 0; i < 10; i++) { std::vector out; p.pick(1, FLASH_STORAGE, 1, {}, out, 1000); } _setCurrentTime(ternNow() + 2_sec); p.update(makeCatalog(catalog)); // Now drop all services for this lcKey. p.update({}); auto stats = p.getStats(); for (const auto& ls : stats.locStorage) { if ((ls.key & 0xFF) == FLASH_STORAGE) { CHECK(ls.writableBlockServices == 0); CHECK(ls.writableFailureDomains == 0); CHECK(ls.throughputEstimate == 0); } } _setCurrentTime(TernTime(0)); } TEST_CASE("picker stale service id from dropped lcKey does not corrupt weights") { // a service that existed under key K, was evicted, // but still appears in a blacklist must not subtract bogus weight from // the current fdWeights. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); // Phase 1: service 99 lives in location=1. std::vector catalog1{ bs(50, 1, FLASH_STORAGE, 1), bs(51, 1, FLASH_STORAGE, 2), bs(99, 1, FLASH_STORAGE, 3), }; p.update(makeCatalog(catalog1)); // Trigger spike recalc so serviceToFdInfo for key=(1, FLASH) is rebuilt. for (int i = 0; i < 100; i++) { std::vector out; p.pick(1, FLASH_STORAGE, 1, {}, out, 1'000'000); } _setCurrentTime(ternNow() + 2_sec); // Phase 2: drop service 99 (remove its FD entirely). std::vector catalog2{ bs(50, 1, FLASH_STORAGE, 1), bs(51, 1, FLASH_STORAGE, 2), }; p.update(makeCatalog(catalog2)); // Pick with a blacklist still referencing 99. Should succeed with no // corruption of the live (50, 51) FDs' weights. std::vector bl; BlacklistEntry e; e.blockService = BlockServiceId(99); bl.push_back(e); for (int i = 0; i < 50; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, bl, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); CHECK(out[0].u64 != 99); } _setCurrentTime(TernTime(0)); } TEST_CASE("picker intra-FD clamp at max load spreads within heterogeneous FD") { // Single FD with 10 disks: 1 fresh (2TB) + 9 near-full (10GB each). // Initial throughput estimate = maxDriveThroughput * numDrives → ratio = 1.0 (max load). // Phase 0 should clamp the fresh disk down to 10GB; each disk then receives ~1/10 of picks. // Without Phase 0, the fresh disk would receive ~99.5% of picks (2000 / 2090 of in-FD weight). _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; auto mk = [](uint64_t avail) { BlockServiceCache e; e.locationId = 1; e.storageClass = FLASH_STORAGE; e.failureDomain = fdWith(1).name.data; e.flags = BlockServiceFlags::EMPTY; e.availableBytes = avail; e.capacityBytes = avail * 10; e.blocks = 0; e.hasFiles = false; return e; }; const uint64_t FULL_AVAIL = 10'000'000'000ULL; // 10 GB (near-full disk) const uint64_t FRESH_AVAIL = 2'000'000'000'000ULL; // 2 TB (fresh disk) const uint64_t FRESH_ID = 1; cache[FRESH_ID] = mk(FRESH_AVAIL); for (uint64_t id = 2; id <= 10; id++) cache[id] = mk(FULL_AVAIL); p.update(cache); p.resetStats(); const int N = 200000; for (int i = 0; i < N; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); } auto stats = p.getStats(); uint64_t freshPicks = 0; uint64_t totalPicks = 0; for (const auto& b : stats.blockServices) { totalPicks += b.picks; if (b.blockServiceId == FRESH_ID) freshPicks = b.picks; } REQUIRE(totalPicks == (uint64_t)N); double freshShare = (double)freshPicks / totalPicks; CHECK(freshShare > 0.08); CHECK(freshShare < 0.12); _setCurrentTime(TernTime(0)); } TEST_CASE("picker intra-FD clamp no-op at low load preserves capacity-proportional picks") { // Same topology as the max-load case. Drive sustained low throughput so ratio // becomes large; Phase 0 becomes a no-op and fresh disk dominates in-FD picks // (this is the desirable "drain to fresh capacity when under-utilized" behaviour). _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; auto mk = [](uint64_t avail) { BlockServiceCache e; e.locationId = 1; e.storageClass = FLASH_STORAGE; e.failureDomain = fdWith(1).name.data; e.flags = BlockServiceFlags::EMPTY; e.availableBytes = avail; e.capacityBytes = avail * 10; e.blocks = 0; e.hasFiles = false; return e; }; const uint64_t FULL_AVAIL = 10'000'000'000ULL; const uint64_t FRESH_AVAIL = 2'000'000'000'000ULL; const uint64_t FRESH_ID = 1; cache[FRESH_ID] = mk(FRESH_AVAIL); for (uint64_t id = 2; id <= 10; id++) cache[id] = mk(FULL_AVAIL); p.update(cache); // Simulate low throughput: 1000 tiny picks over 2s. ratio ≈ (600MB × 10) / ~12.8MB ≈ 469. // svcCap = 10GB × 469 ≈ 4.7TB, well above fresh disk's 2TB → Phase 0 is a no-op. for (int i = 0; i < 1000; i++) { std::vector out; p.pick(1, FLASH_STORAGE, 1, {}, out, 100); } _setCurrentTime(ternNow() + 2_sec); p.update(cache); p.resetStats(); const int N = 50000; for (int i = 0; i < N; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); } auto stats = p.getStats(); uint64_t freshPicks = 0; uint64_t totalPicks = 0; for (const auto& b : stats.blockServices) { totalPicks += b.picks; if (b.blockServiceId == FRESH_ID) freshPicks = b.picks; } REQUIRE(totalPicks == (uint64_t)N); double freshShare = (double)freshPicks / totalPicks; // Fresh disk raw weight share = 2000 / 2090 ≈ 0.957. CHECK(freshShare > 0.9); _setCurrentTime(TernTime(0)); } TEST_CASE("picker intra-FD clamp preserves consistency with service blacklist") { // Verify intra-FD-clamped state stays consistent with the blacklist path: // blacklisting a service that was clamped must still produce a valid pick // without corrupting remaining FD weights. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; auto mk = [](uint8_t fd, uint64_t avail) { BlockServiceCache e; e.locationId = 1; e.storageClass = FLASH_STORAGE; e.failureDomain = fdWith(fd).name.data; e.flags = BlockServiceFlags::EMPTY; e.availableBytes = avail; e.capacityBytes = avail * 10; e.blocks = 0; e.hasFiles = false; return e; }; // FD1 heterogeneous: one 1TB fresh + two 10GB full — the fresh one gets clamped. cache[1] = mk(1, 1'000'000'000'000ULL); cache[2] = mk(1, 10'000'000'000ULL); cache[3] = mk(1, 10'000'000'000ULL); // FD2, FD3 uniform 10GB each. cache[4] = mk(2, 10'000'000'000ULL); cache[5] = mk(3, 10'000'000'000ULL); p.update(cache); std::vector bl; BlacklistEntry e; e.blockService = BlockServiceId(1); bl.push_back(e); for (int i = 0; i < 500; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 3, bl, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 3); for (const auto& id : out) CHECK(id.u64 != 1); } _setCurrentTime(TernTime(0)); } TEST_CASE("picker global cap equalises heterogeneous FDs at max load") { // Three FDs with very different per-disk capacity. Global cap clamps every // disk to the global minimum (10GB), so every FD ends at 10 disks × 10GB = // 100GB. maxWeight == minWeight, and picking 3 from 3 FDs always covers all. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; std::unordered_map serviceToFd; auto addSvc = [&](uint64_t id, uint8_t fdByte, uint64_t avail) { BlockServiceCache e; e.locationId = 1; e.storageClass = FLASH_STORAGE; e.failureDomain = fdWith(fdByte).name.data; e.flags = BlockServiceFlags::EMPTY; e.availableBytes = avail; e.capacityBytes = avail * 10; e.blocks = 0; e.hasFiles = false; cache[id] = e; serviceToFd[id] = fdByte; }; addSvc(1, 1, 2'000'000'000'000ULL); for (uint64_t id = 2; id <= 10; id++) addSvc(id, 1, 10'000'000'000ULL); for (uint64_t id = 11; id <= 20; id++) addSvc(id, 2, 100'000'000'000ULL); for (uint64_t id = 21; id <= 30; id++) addSvc(id, 3, 100'000'000'000ULL); p.update(cache); auto stats = p.getStats(); bool saw = false; for (const auto& ls : stats.locStorage) { if ((ls.key & 0xFF) == FLASH_STORAGE) { saw = true; CHECK(ls.maxWeight == ls.minWeight); } } CHECK(saw); const int N = 60000; std::unordered_map fdPicks; for (int i = 0; i < N; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 3, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 3); std::unordered_set seen; for (const auto& id : out) { uint8_t fd = serviceToFd[id.u64]; CHECK(seen.insert(fd).second); fdPicks[fd]++; } } // With 3 FDs and needed=3 and all equal weight, each FD must be picked every time. for (uint8_t fd = 1; fd <= 3; fd++) CHECK(fdPicks[fd] == N); _setCurrentTime(TernTime(0)); } TEST_CASE("picker uneven disk count per FD spreads per-disk load at max load") { // Two FDs with the same total bytes but very different disk counts: // FD1: 2 disks × 1TB = 2TB total, 2 disks // FD2: 50 disks × 40GB = 2TB total, 50 disks // Without a global per-disk cap, equal FD bytes make stride/weighted picks // hit each FD ~50% of the time, hammering FD1's two disks at ~25% each // while FD2's disks see ~1% each — a 25× per-disk imbalance. // With the global cap at max load (ratio=1) every disk is capped to the // global minimum (40GB), so FD weight becomes proportional to disk count // and per-disk pick rate equalises across all 52 disks. _setCurrentTime(ternNow()); auto p = makePicker(0_sec, 600'000'000, 600'000'000); std::unordered_map cache; std::unordered_map serviceToFd; auto addSvc = [&](uint64_t id, uint8_t fdByte, uint64_t avail) { BlockServiceCache e; e.locationId = 1; e.storageClass = FLASH_STORAGE; e.failureDomain = fdWith(fdByte).name.data; e.flags = BlockServiceFlags::EMPTY; e.availableBytes = avail; e.capacityBytes = avail * 10; e.blocks = 0; e.hasFiles = false; cache[id] = e; serviceToFd[id] = fdByte; }; const uint64_t FD1_AVAIL = 1'000'000'000'000ULL; // 1 TB per disk const uint64_t FD2_AVAIL = 40'000'000'000ULL; // 40 GB per disk uint64_t id = 1; for (int i = 0; i < 2; i++) addSvc(id++, 1, FD1_AVAIL); for (int i = 0; i < 50; i++) addSvc(id++, 2, FD2_AVAIL); p.update(cache); p.resetStats(); const int N = 200000; for (int i = 0; i < N; i++) { std::vector out; auto err = p.pick(1, FLASH_STORAGE, 1, {}, out); REQUIRE(err == TernError::NO_ERROR); REQUIRE(out.size() == 1); } auto stats = p.getStats(); uint64_t totalDiskPicks = 0; uint64_t maxDiskPicks = 0; uint64_t minDiskPicks = UINT64_MAX; for (const auto& b : stats.blockServices) { totalDiskPicks += b.picks; maxDiskPicks = std::max(maxDiskPicks, b.picks); minDiskPicks = std::min(minDiskPicks, b.picks); } REQUIRE(totalDiskPicks == (uint64_t)N); // Per-disk pick rate should be uniform across all 52 disks. double expectedPerDisk = (double)N / 52.0; double maxDeviation = std::max( std::abs((double)maxDiskPicks - expectedPerDisk) / expectedPerDisk, std::abs((double)minDiskPicks - expectedPerDisk) / expectedPerDisk); CHECK(maxDeviation < 0.20); // Sanity: max-to-min ratio should be near 1 — emphatically not 25x. CHECK((double)maxDiskPicks / minDiskPicks < 1.5); _setCurrentTime(TernTime(0)); }