blockservicepicker: sequential weighted pick + global per-disk cap

This commit is contained in:
Miroslav Crnic
2026-05-08 16:16:43 +00:00
parent 63aefdd5d7
commit 2cb52f1e59
5 changed files with 196 additions and 483 deletions
+97 -256
View File
@@ -10,7 +10,6 @@
#include "Msgs.hpp"
#include "BlockServicesCacheDB.hpp"
#include <algorithm>
#include <numeric>
#include <unordered_set>
namespace {
@@ -22,163 +21,59 @@ namespace {
return bs.availableBytes > minSpaceRequiredForWrite && blockServiceFlagsWritable(bs.flags) && ternNow() - bs.firstSeen > writableDelay;
}
// Cluster-wide bytes/sec from a single shard's accumulated bytes over `elapsed`.
inline double throughputBytesPerSec(uint64_t accumulatedBytes, Duration elapsed) {
double elapsedSec = static_cast<double>(elapsed.ns) / 1'000'000'000.0;
return static_cast<double>(accumulatedBytes) * ShardId::SHARD_COUNT / elapsedSec;
}
// Scale factor that pushes per-drive load toward `maxDriveThroughput`, clamped to >= 1.
// Returns 1.0 for the degenerate cases so callers can unconditionally use the result.
inline double ratioFromThroughput(uint64_t maxDriveThroughput, uint64_t numDrives, double throughput) {
if (maxDriveThroughput == 0 || numDrives == 0 || throughput <= 0.0) return 1.0;
double ratio = static_cast<double>(maxDriveThroughput) * numDrives / throughput;
return ratio < 1.0 ? 1.0 : ratio;
}
// Clamp dominant FD weights so that maxFdWeight * maxBlocksToPick <= totalWeight.
// If maxFdRatio >= 1.0, also clamp so that maxFdWeight <= minFdWeight * maxFdRatio.
// Both caps are computed on original weights, then min(ratioCap, strideCap) is applied once.
void applyClamping(
// Cap every disk's weight to globalMinSvcWeight * maxFdRatio across the (loc, sc).
// At ratio = 1.0 this makes per-disk weights uniform, so FD weights become
// proportional to disk count and per-disk pick probability becomes uniform.
// At high ratio (low load) the cap is large and original capacity-proportional
// weights flow through unchanged.
void applyGlobalCap(
BlockServicePicker::LocationStorageInfo& lsInfo,
std::unordered_map<uint64_t, BlockServicePicker::State::ServiceLookup>& serviceToFdInfo,
uint8_t maxBlocksToPick,
double maxFdRatio = 0.0
double maxFdRatio
) {
auto& failureDomains = lsInfo.failureDomains;
if (failureDomains.empty()) return;
if (maxFdRatio < 1.0 || lsInfo.failureDomains.empty()) return;
// Phase 0: intra-FD (service-level) clamp. Prevents a single fresh disk
// in an otherwise-full FD from owning nearly all of the FD's weight,
// which would cause a single-disk hotspot under stridePick. Uses the
// same throughput-derived ratio as the FD-level pass below.
if (maxFdRatio >= 1.0) {
for (auto& fd : failureDomains) {
uint64_t minSvcWeight = UINT64_MAX;
for (const auto& svc : fd.services) {
if (svc.availableBytes > 0) {
minSvcWeight = std::min(minSvcWeight, svc.availableBytes);
}
}
if (minSvcWeight == UINT64_MAX) continue;
double svcCapD = static_cast<double>(minSvcWeight) * maxFdRatio;
uint64_t svcCap = (svcCapD >= static_cast<double>(UINT64_MAX))
? UINT64_MAX : static_cast<uint64_t>(svcCapD);
if (svcCap == 0) svcCap = 1;
uint64_t newFdTotal = 0;
for (auto& svc : fd.services) {
if (svc.availableBytes == 0) continue;
if (svc.availableBytes > svcCap) {
svc.availableBytes = svcCap;
serviceToFdInfo[svc.id.u64].weight = svcCap;
}
newFdTotal += svc.availableBytes;
}
fd.totalWeight = newFdTotal;
}
uint64_t newLsTotal = 0;
for (const auto& fd : failureDomains) newLsTotal += fd.totalWeight;
lsInfo.totalWeight = newLsTotal;
}
uint64_t minFdWeight = UINT64_MAX;
uint64_t maxFdWeight = 0;
for (const auto& fd : failureDomains) {
if (fd.totalWeight > 0) {
minFdWeight = std::min(minFdWeight, fd.totalWeight);
maxFdWeight = std::max(maxFdWeight, fd.totalWeight);
}
}
if (maxFdWeight == 0) return;
// Compute ratio cap: maxFdWeight <= minFdWeight * maxFdRatio
uint64_t ratioCap = UINT64_MAX;
if (maxFdRatio >= 1.0 && minFdWeight > 0 && minFdWeight < UINT64_MAX) {
double ratioCapD = static_cast<double>(minFdWeight) * maxFdRatio;
ratioCap = (ratioCapD >= static_cast<double>(UINT64_MAX))
? UINT64_MAX : static_cast<uint64_t>(ratioCapD);
}
// Compute stride cap: maxFdWeight * maxBlocksToPick <= totalWeight
uint64_t strideCap = UINT64_MAX;
if (maxFdWeight > lsInfo.totalWeight / maxBlocksToPick) {
size_t numFds = failureDomains.size();
// Sort FD indices by weight descending
std::vector<size_t> sortedIdx(numFds);
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
std::sort(sortedIdx.begin(), sortedIdx.end(), [&](size_t a, size_t b) {
return failureDomains[a].totalWeight > failureDomains[b].totalWeight;
});
// Find cap C: the largest value such that C * maxBlocksToPick <= c*C + S_unclamped,
// where c = number of FDs clamped (those with weight > C) and
// S_unclamped = sum of unclamped FD weights.
// Rearranging: C = S_unclamped / (maxBlocksToPick - c)
uint64_t sumTop = 0;
bool found = false;
for (size_t c = 1; c <= numFds && c < static_cast<size_t>(maxBlocksToPick); c++) {
sumTop += failureDomains[sortedIdx[c - 1]].totalWeight;
uint64_t sUnclamped = lsInfo.totalWeight - sumTop;
uint64_t candidateCap = sUnclamped / (static_cast<size_t>(maxBlocksToPick) - c);
bool topAboveCap = failureDomains[sortedIdx[c - 1]].totalWeight > candidateCap;
bool nextBelowOrEqCap = (c == numFds) || (failureDomains[sortedIdx[c]].totalWeight <= candidateCap);
if (topAboveCap && nextBelowOrEqCap) {
strideCap = candidateCap;
found = true;
break;
uint64_t minSvcWeight = UINT64_MAX;
for (const auto& fd : lsInfo.failureDomains) {
for (const auto& svc : fd.services) {
if (svc.availableBytes > 0) {
minSvcWeight = std::min(minSvcWeight, svc.availableBytes);
}
}
if (!found) {
strideCap = lsInfo.totalWeight / maxBlocksToPick;
}
}
if (minSvcWeight == UINT64_MAX) return;
uint64_t cap = std::min(ratioCap, strideCap);
if (cap >= maxFdWeight) return;
if (cap == 0) cap = 1;
double svcCapD = static_cast<double>(minSvcWeight) * maxFdRatio;
uint64_t svcCap = (svcCapD >= static_cast<double>(UINT64_MAX))
? UINT64_MAX : static_cast<uint64_t>(svcCapD);
if (svcCap == 0) svcCap = 1;
// Single clamping loop: scale and trim FDs above cap
lsInfo.totalWeight = 0;
for (auto& fd : failureDomains) {
if (fd.totalWeight <= cap) {
lsInfo.totalWeight += fd.totalWeight;
continue;
}
double ratio = static_cast<double>(cap) / fd.totalWeight;
fd.totalWeight = 0;
uint64_t newLsTotal = 0;
for (auto& fd : lsInfo.failureDomains) {
uint64_t newFdTotal = 0;
for (auto& svc : fd.services) {
if (svc.availableBytes == 0) continue;
svc.availableBytes = std::max(static_cast<uint64_t>(svc.availableBytes * ratio), 1ul);
serviceToFdInfo[svc.id.u64].weight = svc.availableBytes;
fd.totalWeight += svc.availableBytes;
}
// Trim if per-service rounding pushed FD weight above cap
while (fd.totalWeight > cap) {
BlockServicePicker::BlockServiceInfo* maxSvc = nullptr;
uint64_t maxSvcW = 0;
for (auto& svc : fd.services) {
if (svc.availableBytes > maxSvcW) {
maxSvcW = svc.availableBytes;
maxSvc = &svc;
}
if (svc.availableBytes > svcCap) {
svc.availableBytes = svcCap;
serviceToFdInfo[svc.id.u64].weight = svcCap;
}
if (!maxSvc || maxSvcW <= 1) break;
maxSvc->availableBytes--;
serviceToFdInfo[maxSvc->id.u64].weight--;
fd.totalWeight--;
newFdTotal += svc.availableBytes;
}
lsInfo.totalWeight += fd.totalWeight;
fd.totalWeight = newFdTotal;
newLsTotal += newFdTotal;
}
lsInfo.totalWeight = newLsTotal;
}
static constexpr size_t FAILURE_DOMAIN_NAME_SIZE = decltype(FailureDomain::name)::STATIC_SIZE;
@@ -188,68 +83,75 @@ namespace {
std::array<uint8_t, FAILURE_DOMAIN_NAME_SIZE> fdData;
};
// Stride-based pick: selects `needed` services from evenly-spaced points in the
// cumulative weight distribution. Returns true if all `needed` services were picked.
bool stridePick(
// Sequential weighted sampling without replacement over failure domains.
// For each of `needed` draws: pick an FD with probability proportional to its
// remaining weight, then weighted-sample a non-blacklisted disk in that FD,
// then drop the FD from the live set.
// Caller guarantees: at least `needed` FDs have positive weight in fdWeights.
void sequentialWeightedPick(
const std::vector<BlockServicePicker::FailureDomainInfo>& failureDomains,
const std::vector<uint64_t>& fdWeights,
uint64_t totalWeight,
uint8_t needed,
const std::unordered_set<uint64_t>& blacklistedServices,
RandomGenerator& rng,
std::vector<PickResult>& results
) {
if (totalWeight == 0 || needed == 0) return false;
uint64_t step = totalWeight / needed;
results.clear();
results.reserve(needed);
uint64_t offset = rng.generate64() % totalWeight;
for (uint8_t i = 0; i < needed; i++) {
uint64_t target = (offset + i * step) % totalWeight;
uint64_t cumulative = 0;
for (size_t fdIdx = 0; fdIdx < failureDomains.size(); fdIdx++) {
uint64_t fdWeight = fdWeights[fdIdx];
if (fdWeight == 0) continue;
// FD weight should not exceed step, or we may pick multiple from same FD and break guarantees
if (fdWeight > step) {
results.clear();
return false;
}
if (target < cumulative + fdWeight) {
const auto& fdInfo = failureDomains[fdIdx];
uint64_t fdTarget = target - cumulative;
uint64_t svcCumulative = 0;
for (const auto& svc : fdInfo.services) {
if (blacklistedServices.contains(svc.id.u64)) continue;
if (fdTarget < svcCumulative + svc.availableBytes) {
results.push_back({svc.id, fdInfo.failureDomain.name.data});
break;
}
svcCumulative += svc.availableBytes;
}
break;
}
cumulative += fdWeight;
std::vector<size_t> live;
std::vector<uint64_t> liveWeights;
live.reserve(failureDomains.size());
liveWeights.reserve(failureDomains.size());
uint64_t totalWeight = 0;
for (size_t i = 0; i < failureDomains.size(); i++) {
if (fdWeights[i] > 0) {
live.push_back(i);
liveWeights.push_back(fdWeights[i]);
totalWeight += fdWeights[i];
}
}
return results.size() == needed;
for (uint8_t k = 0; k < needed; k++) {
uint64_t target = rng.generate64() % totalWeight;
size_t chosen = 0;
uint64_t cumulative = 0;
for (size_t i = 0; i < live.size(); i++) {
cumulative += liveWeights[i];
if (target < cumulative) {
chosen = i;
break;
}
}
const auto& fdInfo = failureDomains[live[chosen]];
uint64_t fdTarget = rng.generate64() % liveWeights[chosen];
uint64_t svcCumulative = 0;
for (const auto& svc : fdInfo.services) {
if (blacklistedServices.contains(svc.id.u64)) continue;
svcCumulative += svc.availableBytes;
if (fdTarget < svcCumulative) {
results.push_back({svc.id, fdInfo.failureDomain.name.data});
break;
}
}
totalWeight -= liveWeights[chosen];
live[chosen] = live.back();
liveWeights[chosen] = liveWeights.back();
live.pop_back();
liveWeights.pop_back();
}
ALWAYS_ASSERT(results.size() == needed);
}
}
BlockServicePicker::BlockServicePicker(Logger& logger, std::shared_ptr<XmonAgent>& xmon, uint8_t maxBlocksToPick, Duration writableDelay,
BlockServicePicker::BlockServicePicker(Logger& logger, std::shared_ptr<XmonAgent>& xmon, Duration writableDelay,
uint64_t hddDriveThroughput, uint64_t flashDriveThroughput,
uint64_t minSpaceRequiredForWrite)
: _state(nullptr), _rawState(nullptr), _rng(ternNow().ns), _env(logger, xmon, "block_service_picker"),
_maxBlocksToPick(maxBlocksToPick), _writableDelay(writableDelay),
_writableDelay(writableDelay),
_hddDriveThroughput(hddDriveThroughput), _flashDriveThroughput(flashDriveThroughput),
_minSpaceRequiredForWrite(minSpaceRequiredForWrite) {}
@@ -258,8 +160,6 @@ void BlockServicePicker::update(
) {
auto next = std::make_shared<State>();
// Build weighted structure: group by location/storageClass, then by failure domain
// Map: (location, storageClass) -> map of (failure domain string) -> failure domain index
std::unordered_map<uint16_t, std::unordered_map<std::string, size_t>> grouped;
std::unordered_set<uint16_t> distinctBlockServiceTypeLoc;
@@ -302,7 +202,6 @@ void BlockServicePicker::update(
_rawState.store(next, std::memory_order_release);
// Clone raw state and apply clamping, preserving existing throughput ratio
auto clamped = std::make_shared<State>(*next);
auto nowNs = ternNow().ns;
@@ -319,8 +218,6 @@ void BlockServicePicker::update(
uint8_t storageClass = key & 0xFF;
uint64_t maxDriveThroughput = (storageClass == FLASH_STORAGE) ? _flashDriveThroughput : _hddDriveThroughput;
// Recalculate throughput estimate from accumulated data, or initialize for new lcKeys.
// When maxDriveThroughput == 0 (throughput tracking disabled), skip ratio clamping.
double ratio = 0.0;
uint64_t lastEstimate = stats.lastThroughputEstimate.load(std::memory_order_relaxed);
if (maxDriveThroughput > 0 && totalServices > 0) {
@@ -342,7 +239,7 @@ void BlockServicePicker::update(
ratio = ratioFromThroughput(maxDriveThroughput, totalServices, static_cast<double>(lastEstimate));
}
applyClamping(lsInfo, clamped->serviceToFdInfo, _maxBlocksToPick, ratio);
applyGlobalCap(lsInfo, clamped->serviceToFdInfo, ratio);
uint64_t maxW = 0, minW = UINT64_MAX;
for (const auto& fd : lsInfo.failureDomains) {
@@ -383,9 +280,9 @@ TernError BlockServicePicker::pick(
uint64_t blockSize
) const {
auto state = _state.load(std::memory_order_acquire);
if (!state || needed == 0 || needed > _maxBlocksToPick) {
LOG_DEBUG(_env, "pick failed: state=%s needed=%s maxBlocksToPick=%s",
state != nullptr, (int)needed, (int)_maxBlocksToPick);
if (!state || needed == 0) {
LOG_DEBUG(_env, "pick failed: state=%s needed=%s",
state != nullptr, (int)needed);
return TernError::COULD_NOT_PICK_BLOCK_SERVICES;
}
@@ -396,7 +293,6 @@ TernError BlockServicePicker::pick(
key = lcKey(locationId, storageClass);
}
// Throughput tracking — spike-triggered recalc (periodic recalc happens via update())
if (blockSize > 0) {
ALWAYS_ASSERT(blockSize <= MAXIMUM_SPAN_SIZE,
"blockSize %s > MAXIMUM_SPAN_SIZE %s", blockSize, MAXIMUM_SPAN_SIZE);
@@ -410,19 +306,17 @@ TernError BlockServicePicker::pick(
Duration elapsed = now - lastRecalcTime;
uint64_t lastEstimate = stats.lastThroughputEstimate.load(std::memory_order_relaxed);
// Trigger a recalc if we've seen a sustained spike above the last estimate — this helps us react faster to sudden load increases.
// We require at least 1 second of data to avoid overreacting to noise in very short intervals.
// We don't care in load reduction scenarios, since the periodic recalc in update() will eventually restore throughput ratios and thus unblock any clamped FDs.
// Trigger a recalc if we've seen a sustained spike above the last estimate.
// We don't react to load reductions — the periodic recalc in update() will
// restore throughput ratios and unblock any clamped FDs.
bool spikeTriggered = elapsed >= 1_sec && lastEstimate > 0 &&
throughputBytesPerSec(accumulated, elapsed) > static_cast<double>(lastEstimate) * 1.1;
if (spikeTriggered && _recalcMutex.try_lock()) {
std::unique_lock recalcLock(_recalcMutex, std::adopt_lock);
// Re-read after acquiring lock (another thread may have recalced)
lastRecalcTime = TernTime(stats.lastRecalcTimeNs.load(std::memory_order_relaxed));
elapsed = now - lastRecalcTime;
accumulated = stats.throughputBytes.load(std::memory_order_relaxed);
// we need to check again as update could have happened while we were waiting for the lock, and if so we can skip recalculation
if (elapsed > 1_sec) {
double currentThroughput = throughputBytesPerSec(accumulated, elapsed);
uint64_t numDrives = stats.numDrives;
@@ -459,7 +353,7 @@ TernError BlockServicePicker::pick(
}
}
applyClamping(lsInfoRef, newState->serviceToFdInfo, _maxBlocksToPick, newRatio);
applyGlobalCap(lsInfoRef, newState->serviceToFdInfo, newRatio);
uint64_t maxW = 0, minW = UINT64_MAX;
for (const auto& fd : lsInfoRef.failureDomains) {
@@ -469,10 +363,8 @@ TernError BlockServicePicker::pick(
stats.maxWeight.store(maxW, std::memory_order_relaxed);
stats.minWeight.store(lsInfoRef.failureDomains.empty() ? 0 : minW, std::memory_order_relaxed);
// CAS swap — if update() raced, discard
_state.compare_exchange_strong(expected, newState, std::memory_order_release);
// Reload state for this pick
state = _state.load(std::memory_order_acquire);
}
}
@@ -494,12 +386,9 @@ TernError BlockServicePicker::pick(
blacklistedServices.insert(b.blockService.u64);
}
// Build adjusted FD weights and lookup (copy and apply blacklist)
std::vector<uint64_t> fdWeights;
std::unordered_set<uint64_t> actuallyBlacklistedServices;
fdWeights.reserve(lsInfo.failureDomains.size());
uint64_t totalWeight = 0;
uint64_t maxFdWeight = 0;
for (const auto& fdInfo : lsInfo.failureDomains) {
uint64_t adjustedWeight = fdInfo.totalWeight;
@@ -509,26 +398,30 @@ TernError BlockServicePicker::pick(
break;
}
}
fdWeights.emplace_back(adjustedWeight);
totalWeight += adjustedWeight;
maxFdWeight = std::max(maxFdWeight, adjustedWeight);
}
for(const auto& blacklistEntry : blacklist) {
auto svcIt = state->serviceToFdInfo.find(blacklistEntry.blockService.u64);
if (svcIt != state->serviceToFdInfo.end()) {
const auto& svcInfo = svcIt->second;
if (fdWeights[svcInfo.fdIndex] == 0) continue; // already blacklisted via FD
if (fdWeights[svcInfo.fdIndex] == 0) continue;
if (svcInfo.lcKey == key) {
actuallyBlacklistedServices.insert(blacklistEntry.blockService.u64);
fdWeights[svcInfo.fdIndex] -= svcInfo.weight;
totalWeight -= svcInfo.weight;
}
}
}
auto commitResults = [&](const std::vector<PickResult>& results, bool rescaled) {
size_t liveFdCount = 0;
for (uint64_t w : fdWeights) {
if (w > 0) liveFdCount++;
}
if (liveFdCount >= needed) {
std::vector<PickResult> results;
sequentialWeightedPick(lsInfo.failureDomains, fdWeights, needed,
actuallyBlacklistedServices, _rng, results);
out.clear();
out.reserve(needed);
std::lock_guard lock(_statsMutex);
@@ -539,57 +432,7 @@ TernError BlockServicePicker::pick(
_failureDomainStats[fdStr].fetch_add(1, std::memory_order_relaxed);
}
_locStorageStats[key].totalPicks.fetch_add(needed, std::memory_order_relaxed);
if (rescaled) _locStorageStats[key].blacklistRescales.fetch_add(1, std::memory_order_relaxed);
};
// Count live FDs up front — if blacklisting left us with fewer FDs than
// `needed`, neither fast nor slow path can succeed; bail early.
size_t liveFdCount = 0;
for (uint64_t w : fdWeights) {
if (w > 0) liveFdCount++;
}
if (totalWeight > 0 && needed > 0 && liveFdCount >= needed) {
// Fast path: stride pick on pre-scaled weights with blacklist adjustments
std::vector<PickResult> results;
if (stridePick(lsInfo.failureDomains, fdWeights, totalWeight, needed,
actuallyBlacklistedServices, _rng, results)) {
commitResults(results, false);
return TernError::NO_ERROR;
}
// Slow path: blacklisting broke the stride invariant (maxFdWeight > step).
// just pick `needed` distinct FDs uniformly
// then weighted-sample one non-blacklisted service per FD.
std::vector<size_t> liveFds;
liveFds.reserve(liveFdCount);
for (size_t i = 0; i < lsInfo.failureDomains.size(); i++) {
if (fdWeights[i] > 0) liveFds.push_back(i);
}
results.clear();
results.reserve(needed);
for (uint8_t k = 0; k < needed; k++) {
size_t j = k + static_cast<size_t>(_rng.generate64() % (liveFds.size() - k));
std::swap(liveFds[k], liveFds[j]);
const auto& fd = lsInfo.failureDomains[liveFds[k]];
uint64_t target = _rng.generate64() % fdWeights[liveFds[k]];
uint64_t acc = 0;
for (const auto& svc : fd.services) {
if (actuallyBlacklistedServices.contains(svc.id.u64)) continue;
if (target < acc + svc.availableBytes) {
results.push_back({svc.id, fd.failureDomain.name.data});
break;
}
acc += svc.availableBytes;
}
}
if (results.size() == needed) {
commitResults(results, true);
return TernError::NO_ERROR;
}
return TernError::NO_ERROR;
}
}
@@ -639,7 +482,6 @@ BlockServicePicker::StatsSnapshot BlockServicePicker::getStats() const {
stats.writableBlockServices.load(std::memory_order_relaxed),
stats.maxWeight.load(std::memory_order_relaxed),
stats.minWeight.load(std::memory_order_relaxed),
stats.blacklistRescales.load(std::memory_order_relaxed),
effectiveMaxRatio,
lastEstimate
});
@@ -677,7 +519,6 @@ void BlockServicePicker::resetStats() {
for (auto& [key, stats] : _locStorageStats) {
stats.totalPicks.store(0, std::memory_order_relaxed);
stats.blacklistRescales.store(0, std::memory_order_relaxed);
}
for (auto& [id, stats] : _blockServiceStats) {
+1 -4
View File
@@ -57,7 +57,6 @@ struct BlockServicePicker {
std::atomic<uint64_t> writableBlockServices{0};
std::atomic<uint64_t> maxWeight{0};
std::atomic<uint64_t> minWeight{0};
std::atomic<uint64_t> blacklistRescales{0};
// Throughput tracking for dynamic ratio recalc
std::atomic<uint64_t> throughputBytes{0};
std::atomic<uint64_t> lastRecalcTimeNs{0};
@@ -76,13 +75,12 @@ struct BlockServicePicker {
mutable std::unordered_map<std::string, std::atomic<uint64_t>> _failureDomainStats;
mutable std::mutex _statsMutex; // protects map structures (not atomic values)
mutable std::mutex _recalcMutex;
const uint8_t _maxBlocksToPick;
const Duration _writableDelay;
const uint64_t _hddDriveThroughput; // bytes/sec per drive
const uint64_t _flashDriveThroughput; // bytes/sec per drive
const uint64_t _minSpaceRequiredForWrite; // min available bytes for a block service to be writable
BlockServicePicker(Logger& logger, std::shared_ptr<XmonAgent>& xmon, uint8_t maxBlocksToPick, Duration writableDelay,
BlockServicePicker(Logger& logger, std::shared_ptr<XmonAgent>& xmon, Duration writableDelay,
uint64_t hddDriveThroughput, uint64_t flashDriveThroughput,
uint64_t minSpaceRequiredForWrite);
@@ -109,7 +107,6 @@ struct BlockServicePicker {
uint64_t writableBlockServices;
uint64_t maxWeight;
uint64_t minWeight;
uint64_t blacklistRepicks;
double effectiveMaxRatio;
uint64_t throughputEstimate;
};
+1 -1
View File
@@ -190,7 +190,7 @@ BlockServicesCacheDB::BlockServicesCacheDB(Logger& logger, std::shared_ptr<XmonA
_env(logger, xmon, "bs_cache_db"),
_db(sharedDB.db()),
_blockServicesCF(sharedDB.getCF("blockServicesCache")),
_picker(logger, xmon, 15, blockServiceWritableDelay, hddDriveThroughput, flashDriveThroughput, minSpaceRequiredForWrite)
_picker(logger, xmon, blockServiceWritableDelay, hddDriveThroughput, flashDriveThroughput, minSpaceRequiredForWrite)
{
LOG_INFO(_env, "Initializing block services cache DB");
-1
View File
@@ -2507,7 +2507,6 @@ public:
_metricsBuilder.fieldU64("writable_block_services", ls.writableBlockServices);
_metricsBuilder.fieldU64("max_fd_weight", ls.maxWeight);
_metricsBuilder.fieldU64("min_fd_weight", ls.minWeight);
_metricsBuilder.fieldU64("blacklist_repicks", ls.blacklistRepicks);
_metricsBuilder.fieldFloat("effective_max_ratio", ls.effectiveMaxRatio);
_metricsBuilder.fieldU64("throughput_estimate", ls.throughputEstimate);
_metricsBuilder.timestamp(now);
+97 -221
View File
@@ -16,12 +16,11 @@
static Logger testLogger(LogLevel::LOG_ERROR, STDERR_FILENO, false, false);
static std::shared_ptr<XmonAgent> testXmon;
static BlockServicePicker makePicker(uint8_t maxBlocksToPick = 15,
Duration writableDelay = 0_sec,
static BlockServicePicker makePicker(Duration writableDelay = 0_sec,
uint64_t hddDriveThroughput = 0,
uint64_t flashDriveThroughput = 0,
uint64_t minSpaceRequiredForWrite = 0) {
return BlockServicePicker(testLogger, testXmon, maxBlocksToPick, writableDelay,
return BlockServicePicker(testLogger, testXmon, writableDelay,
hddDriveThroughput, flashDriveThroughput, minSpaceRequiredForWrite);
}
@@ -142,8 +141,7 @@ TEST_CASE("picker concurrency update while picks") {
}
TEST_CASE("picker weighted distribution") {
// Use maxBlocksToPick=1 to avoid scaling effects on small clusters
auto p = makePicker(1);
auto p = makePicker();
// Create services with different weights
// Service 1000: 10 MB available (weight 10M)
@@ -264,8 +262,7 @@ TEST_CASE("picker blacklist enforcement") {
}
TEST_CASE("picker weighted distribution with blacklist") {
// Use maxBlocksToPick=1 to avoid scaling effects on small clusters
auto p = makePicker(1);
auto p = makePicker();
// Create services with varying weights across multiple failure domains
// FD 1: service 100 (10MB), service 101 (10MB) - total 20MB
@@ -439,15 +436,17 @@ TEST_CASE("picker multi-service weighted distribution") {
// Verify that picks are distributed according to weights
uint64_t totalPicks = (uint64_t)NUM_ITERATIONS * needed;
// Check failure domain distribution
// Check failure domain distribution. Marginal pick probability under
// sequential weighted-without-replacement isn't exactly proportional
// to weight when needed/numFDs is large (heavy FDs saturate towards
// P=1), so widen the tolerance for larger picks.
for (uint8_t fdByte = 1; fdByte <= 20; ++fdByte) {
double expectedRatio = (double)fdTotalWeights[fdByte] / totalSystemWeight;
double expectedPicks = totalPicks * expectedRatio;
int actualPicks = fdPickCounts[fdByte];
// Allow 5% deviation for FD distribution
double tolerance = 0.05;
if (expectedPicks > 1000) { // Only check FDs with significant expected picks
double tolerance = (needed >= 10) ? 0.10 : 0.05;
if (expectedPicks > 1000) {
CHECK(actualPicks > expectedPicks * (1.0 - tolerance));
CHECK(actualPicks < expectedPicks * (1.0 + tolerance));
}
@@ -491,81 +490,6 @@ TEST_CASE("picker multi-service weighted distribution") {
CHECK(allPickedServices.size() > serviceWeights.size() * 0.9);
}
TEST_CASE("picker weight clamping for dominant failure domain") {
auto p = makePicker();
std::vector<BlockServiceInfoShort> catalog;
// Scenario:
// FD 1: Dominant (10000)
// FD 2: Large (5000)
// FD 3-52: Small (100 each) - 50 FDs
// Total FDs: 52
//
// With clamping (maxBlocksToPick=15):
// FD1 and FD2 are both above the cap and get clamped to the same weight.
// Small FDs keep their exact weights (100).
// cap = S_unclamped / (15 - 2) = 5000 / 13 = 384
// newTotal = 2*384 + 50*100 = 5768
// Add FD 1
catalog.push_back(bs(1, 1, FLASH_STORAGE, 1));
// Add FD 2
catalog.push_back(bs(2, 1, FLASH_STORAGE, 2));
// Add FD 3-52
for (int i = 3; i <= 52; ++i) {
catalog.push_back(bs(i, 1, FLASH_STORAGE, i));
}
auto cache = makeCatalog(catalog);
// Set weights
cache[1].availableBytes = 10000;
cache[2].availableBytes = 5000;
for (int i = 3; i <= 52; ++i) {
cache[i].availableBytes = 100;
}
p.update(cache);
double cap = 384.0;
double total = 2 * cap + 50 * 100.0;
double probClamped = cap / total; // ~6.66% for FD1 and FD2
double probSmall = 100.0 / total; // ~1.73% for each small FD
// Run many iterations
const int NUM_ITERATIONS = 1000000;
std::unordered_map<uint64_t, int> picks;
for (int i = 0; i < NUM_ITERATIONS; ++i) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 1, {}, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 1);
picks[out[0].u64]++;
}
// FD 1 and FD 2: both clamped to same weight, so approximately equal
double actualProb1 = (double)picks[1] / NUM_ITERATIONS;
double actualProb2 = (double)picks[2] / NUM_ITERATIONS;
CHECK(actualProb1 > probClamped * 0.85);
CHECK(actualProb1 < probClamped * 1.15);
CHECK(actualProb2 > probClamped * 0.85);
CHECK(actualProb2 < probClamped * 1.15);
// Small FDs: keep exact weights
double totalSmallPicks = 0;
for (int i = 3; i <= 52; ++i) {
totalSmallPicks += picks[i];
}
double actualProbSmallAvg = (totalSmallPicks / 50.0) / NUM_ITERATIONS;
CHECK(actualProbSmallAvg > probSmall * 0.85);
CHECK(actualProbSmallAvg < probSmall * 1.15);
// Clamped FDs > small FDs
CHECK(picks[1] > (totalSmallPicks / 50.0));
CHECK(picks[2] > (totalSmallPicks / 50.0));
}
TEST_CASE("picker never picks same failure domain twice") {
// Test various configurations: different FD counts, service counts, and needed values
struct TestConfig {
@@ -579,7 +503,7 @@ TEST_CASE("picker never picks same failure domain twice") {
{3, 1, 2, 1000000}, // minimal: 3 FDs, 1 service each, pick 2
{3, 1, 3, 1000000}, // pick all 3
{5, 3, 4, 1000000}, // 5 FDs with 3 services each, pick 4
{15, 1, 15, 1000000}, // exactly maxBlocksToPick FDs
{15, 1, 15, 1000000}, // pick every one of 15 FDs
{20, 5, 10, 1000000}, // 20 FDs, pick 10
{3, 1, 2, 1}, // tiny weights (post-scaling edge case)
{52, 1, 15, 100}, // many FDs, small weights
@@ -626,67 +550,13 @@ TEST_CASE("picker never picks same failure domain twice") {
}
}
TEST_CASE("picker blacklist rescale metric is bumped") {
// Create a scenario where blacklisting a *service* (not FD) breaks the stride
// invariant, forcing the slow path (rescale).
// 3 FDs each with weight 100, maxBlocksToPick=3. step=100, maxFdWeight=100 → OK.
// Blacklisting a service worth 50 from FD3 → totalWeight=250, maxFdWeight=100,
// step=83 < 100 → invariant broken → slow path with rescale.
auto p = makePicker(3);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto makeEntry = [](uint8_t fd, uint64_t avail) {
BlockServiceCache entry;
entry.locationId = 1;
entry.storageClass = FLASH_STORAGE;
entry.failureDomain = fdWith(fd).name.data;
entry.flags = BlockServiceFlags::EMPTY;
entry.availableBytes = avail;
entry.capacityBytes = avail * 10;
entry.blocks = 0;
entry.hasFiles = false;
return entry;
};
cache[1] = makeEntry(1, 100);
cache[2] = makeEntry(2, 100);
cache[3] = makeEntry(3, 50);
cache[4] = makeEntry(3, 50);
p.update(cache);
p.resetStats();
// Blacklist service 3 (one of FD3's services, weight 50)
std::vector<BlacklistEntry> blacklist;
BlacklistEntry bl;
bl.blockService = BlockServiceId(3);
blacklist.push_back(bl);
for (int i = 0; i < 100; i++) {
std::vector<BlockServiceId> out;
auto err = p.pick(1, FLASH_STORAGE, 3, blacklist, out);
REQUIRE(err == TernError::NO_ERROR);
REQUIRE(out.size() == 3);
}
auto stats = p.getStats();
bool foundRescales = false;
for (const auto& ls : stats.locStorage) {
if (ls.blacklistRepicks > 0) {
foundRescales = true;
CHECK(ls.blacklistRepicks == 100);
}
}
CHECK(foundRescales);
}
TEST_CASE("picker extreme weight disparity uniform at max load") {
// Scenario: 200 existing FDs (nearly full, 5GB/disk × 100 disks = 500GB/FD)
// plus 5 brand-new FDs (empty, 20TB/disk × 100 disks = 2000TB/FD).
// Weight ratio per FD: 2000TB / 500GB = 4000x.
// At max load (default), ratio=1.0 clamps all FDs to minFdWeight, so distribution
// should be near-uniform across all FDs regardless of capacity disparity.
auto p = makePicker(15, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
const uint64_t EXISTING_BYTES_PER_DISK = 5000000000ULL; // 5 GB
const uint64_t NEW_BYTES_PER_DISK = 20000000000000ULL; // 20 TB
@@ -763,10 +633,9 @@ TEST_CASE("picker extreme weight disparity uniform at max load") {
TEST_CASE("picker throughput adaptation increases ratio at low load") {
// Low observed throughput → high effectiveMaxRatio → capacity-proportional picks.
// Use maxBlocksToPick=1 so stride clamping doesn't interfere (only ratio clamping matters).
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
const int EXISTING_FDS = 4;
const int NEW_FDS = 1;
@@ -843,12 +712,11 @@ TEST_CASE("picker throughput adaptation increases ratio at low load") {
TEST_CASE("picker clamping ratio varies with load") {
// 2 FDs: FD1 = 1GB (10 drives), FD2 = 100GB (10 drives). 100x weight difference.
// maxBlocksToPick=1 so stride clamping doesn't interfere.
// At max load (initial): ratio=1.0 → all FDs clamped to minFdWeight → near-uniform.
// At max load (initial): ratio=1.0 → all disks clamped to minSvcWeight → near-uniform.
// At low load (after recalc): high ratio → capacity-proportional → FD2 gets ~100x more.
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
@@ -907,8 +775,8 @@ TEST_CASE("picker clamping ratio varies with load") {
// Max cluster throughput = 600MB/s * 20 drives = 12GB/s.
// For ratio=5: cluster throughput = 12GB/s / 5 = 2.4GB/s.
// Per-shard over 2s = 2.4GB/s * 2 / 256 ≈ 18.75MB → 1000 picks of ~18750 bytes.
// With ratio=5: ratioCap = minFdWeight(10GB) * 5 = 50GB.
// FD1 stays at 10GB, FD2 clamped from 1TB to 50GB → pick ratio ≈ 5x.
// With ratio=5: svcCap = minSvc(1GB) * 5 = 5GB per disk.
// FD1 disks stay at 1GB, FD2 disks clamped 100GB→5GB → FD weights 10GB vs 50GB → ~5x.
for (int i = 0; i < 1000; i++) {
std::vector<BlockServiceId> out;
p.pick(1, FLASH_STORAGE, 1, {}, out, 18750);
@@ -948,71 +816,11 @@ TEST_CASE("picker insufficient live FDs returns error fast") {
CHECK(out.size() == 0);
}
TEST_CASE("picker slow path picks distinct FDs with service blacklist") {
    // Deliberately exercise the slow path: 3 FDs with two services each, then
    // blacklist one service in an FD so the stride invariant
    // (maxFdWeight <= step) no longer holds.
    auto p = makePicker(3);
    std::unordered_map<uint64_t, BlockServiceCache> cache;
    // Build a writable cache entry for failure domain `fd` with `avail` bytes.
    auto makeEntry = [](uint8_t fd, uint64_t avail) {
        BlockServiceCache entry;
        entry.locationId = 1;
        entry.storageClass = FLASH_STORAGE;
        entry.failureDomain = fdWith(fd).name.data;
        entry.flags = BlockServiceFlags::EMPTY;
        entry.availableBytes = avail;
        entry.capacityBytes = avail * 10;
        entry.blocks = 0;
        entry.hasFiles = false;
        return entry;
    };
    // Each FD contributes two services of weight 100 → per-FD total 200:
    //   FD1: svcs 10, 11   FD2: svcs 20, 21   FD3: svcs 30, 31
    for (uint64_t svc : {10, 11}) cache[svc] = makeEntry(1, 100);
    for (uint64_t svc : {20, 21}) cache[svc] = makeEntry(2, 100);
    for (uint64_t svc : {30, 31}) cache[svc] = makeEntry(3, 100);
    p.update(cache);
    p.resetStats();
    // Blacklisting svc 30 (weight 100) leaves totalWeight=500, maxFdWeight=200,
    // step = 500/3 = 166 < 200 → stride invariant broken, slow path taken.
    std::vector<BlacklistEntry> bl;
    bl.emplace_back();
    bl.back().blockService = BlockServiceId(30);
    const int ITER = 2000;
    for (int iter = 0; iter < ITER; iter++) {
        std::vector<BlockServiceId> picked;
        auto err = p.pick(1, FLASH_STORAGE, 3, bl, picked);
        REQUIRE(err == TernError::NO_ERROR);
        REQUIRE(picked.size() == 3);
        std::unordered_set<uint8_t> fdsSeen;
        for (const auto& id : picked) {
            CHECK(id.u64 != 30); // the blacklisted service must never appear
            uint8_t fd;
            if (id.u64 < 20)      fd = 1;
            else if (id.u64 < 30) fd = 2;
            else                  fd = 3;
            CHECK(fdsSeen.insert(fd).second); // all three picks from distinct FDs
        }
    }
    auto stats = p.getStats();
    bool sawRescale = false;
    for (const auto& ls : stats.locStorage) {
        if (ls.blacklistRepicks == 0) continue;
        sawRescale = true;
        // Every iteration should have gone through the slow path.
        CHECK(ls.blacklistRepicks == (uint64_t)ITER);
    }
    CHECK(sawRescale);
}
TEST_CASE("picker drained lcKey resets throughput stats") {
// if an lcKey loses all services, stale numDrives /
// lastThroughputEstimate must be cleared so the spike path can't read them.
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::vector<BlockServiceInfoShort> catalog{
bs(1, 1, FLASH_STORAGE, 1),
@@ -1047,7 +855,7 @@ TEST_CASE("picker stale service id from dropped lcKey does not corrupt weights")
// but still appears in a blacklist must not subtract bogus weight from
// the current fdWeights.
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
// Phase 1: service 99 lives in location=1.
std::vector<BlockServiceInfoShort> catalog1{
@@ -1091,7 +899,7 @@ TEST_CASE("picker intra-FD clamp at max load spreads within heterogeneous FD") {
// Phase 0 should clamp the fresh disk down to 10GB; each disk then receives ~1/10 of picks.
// Without Phase 0, the fresh disk would receive ~99.5% of picks (2000 / 2090 of in-FD weight).
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint64_t avail) {
@@ -1145,7 +953,7 @@ TEST_CASE("picker intra-FD clamp no-op at low load preserves capacity-proportion
// becomes large; Phase 0 becomes a no-op and fresh disk dominates in-FD picks
// (this is the desirable "drain to fresh capacity when under-utilized" behaviour).
_setCurrentTime(ternNow());
auto p = makePicker(1, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint64_t avail) {
@@ -1207,7 +1015,7 @@ TEST_CASE("picker intra-FD clamp preserves consistency with service blacklist")
// blacklisting a service that was clamped must still produce a valid pick
// without corrupting remaining FD weights.
_setCurrentTime(ternNow());
auto p = makePicker(3, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
auto mk = [](uint8_t fd, uint64_t avail) {
@@ -1247,14 +1055,12 @@ TEST_CASE("picker intra-FD clamp preserves consistency with service blacklist")
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker intra-FD clamp changes stride-cap outcome") {
// Without Phase 0: FD1 raw total = 2TB + 9×10GB ≈ 2.09TB dominates; cluster
// total ≈ 4.09TB; step = 1.36TB; FD1 > step → stride cap binds.
// With Phase 0: FD1 clamped to 10×10GB = 100GB; ratio cap equalises all 3 FDs
// to minFdWeight = 100GB → no stride cap needed; FD picks should be uniform
// and maxWeight == minWeight after update().
TEST_CASE("picker global cap equalises heterogeneous FDs at max load") {
// Three FDs with very different per-disk capacity. Global cap clamps every
// disk to the global minimum (10GB), so every FD ends at 10 disks × 10GB =
// 100GB. maxWeight == minWeight, and picking 3 from 3 FDs always covers all.
_setCurrentTime(ternNow());
auto p = makePicker(3, 0_sec, 600'000'000, 600'000'000);
auto p = makePicker(0_sec, 600'000'000, 600'000'000);
std::unordered_map<uint64_t, BlockServiceCache> cache;
std::unordered_map<uint64_t, uint8_t> serviceToFd;
@@ -1309,3 +1115,73 @@ TEST_CASE("picker intra-FD clamp changes stride-cap outcome") {
_setCurrentTime(TernTime(0));
}
TEST_CASE("picker uneven disk count per FD spreads per-disk load at max load") {
    // FD1 and FD2 hold the same total bytes with wildly different disk counts:
    //   FD1:  2 disks × 1TB  = 2TB total
    //   FD2: 50 disks × 40GB = 2TB total
    // Absent a global per-disk cap, equal FD totals send ~50% of picks to each
    // FD, so FD1's two disks absorb ~25% each while FD2's disks see ~1% each —
    // a 25× per-disk skew. With the global cap at max load (ratio=1) every disk
    // is capped at the global minimum (40GB), making FD weight track disk count
    // and equalising the per-disk pick rate across all 52 disks.
    _setCurrentTime(ternNow());
    auto p = makePicker(0_sec, 600'000'000, 600'000'000);
    std::unordered_map<uint64_t, BlockServiceCache> cache;
    std::unordered_map<uint64_t, uint8_t> serviceToFd;
    uint64_t nextId = 1;
    // Register one disk in failure domain `fdByte` with `avail` bytes free.
    auto addSvc = [&](uint8_t fdByte, uint64_t avail) {
        BlockServiceCache entry;
        entry.locationId = 1;
        entry.storageClass = FLASH_STORAGE;
        entry.failureDomain = fdWith(fdByte).name.data;
        entry.flags = BlockServiceFlags::EMPTY;
        entry.availableBytes = avail;
        entry.capacityBytes = avail * 10;
        entry.blocks = 0;
        entry.hasFiles = false;
        cache[nextId] = entry;
        serviceToFd[nextId] = fdByte;
        ++nextId;
    };
    const uint64_t FD1_AVAIL = 1'000'000'000'000ULL; // 1 TB per disk
    const uint64_t FD2_AVAIL = 40'000'000'000ULL;    // 40 GB per disk
    for (int i = 0; i < 2; i++)  addSvc(1, FD1_AVAIL);
    for (int i = 0; i < 50; i++) addSvc(2, FD2_AVAIL);
    p.update(cache);
    p.resetStats();
    const int N = 200000;
    for (int i = 0; i < N; i++) {
        std::vector<BlockServiceId> picked;
        auto err = p.pick(1, FLASH_STORAGE, 1, {}, picked);
        REQUIRE(err == TernError::NO_ERROR);
        REQUIRE(picked.size() == 1);
    }
    auto stats = p.getStats();
    uint64_t totalDiskPicks = 0;
    uint64_t maxDiskPicks = 0;
    uint64_t minDiskPicks = UINT64_MAX;
    for (const auto& b : stats.blockServices) {
        totalDiskPicks += b.picks;
        if (b.picks > maxDiskPicks) maxDiskPicks = b.picks;
        if (b.picks < minDiskPicks) minDiskPicks = b.picks;
    }
    REQUIRE(totalDiskPicks == (uint64_t)N);
    // The per-disk pick rate should be uniform across all 52 disks.
    const double expectedPerDisk = (double)N / 52.0;
    const double devHigh = std::abs((double)maxDiskPicks - expectedPerDisk) / expectedPerDisk;
    const double devLow  = std::abs((double)minDiskPicks - expectedPerDisk) / expectedPerDisk;
    CHECK(std::max(devHigh, devLow) < 0.20);
    // Sanity: max-to-min ratio should be near 1 — emphatically not 25x.
    CHECK((double)maxDiskPicks / minDiskPicks < 1.5);
    _setCurrentTime(TernTime(0));
}