kmod: remove block service cache

With the reduced span cache time, the block service cache
is no longer needed. We also no longer need to fetch
changed block services from the registry, as we'll get
them as part of span fetches.
This commit is contained in:
Miroslav Crnic
2025-12-09 15:34:09 +00:00
committed by GitHub
parent d05a360e8b
commit 768072e054
9 changed files with 12 additions and 295 deletions

View File

@@ -34,8 +34,7 @@ ternfs-client-objs += \
span.o \
bincode.o \
revision.o \
policy.o \
block_services.o
policy.o
EXTRA_CFLAGS = -I$(src) -g -DDEBUG -fdiagnostics-color=always -Wno-declaration-after-statement
@@ -60,7 +59,7 @@ ternfs-client-tests: revision.c extra-files
ternfs-client-clean:
$(MAKE) -C $(KDIR) M=$(PWD) clean
rm -f *.o *.ko
rm -f *.o *.ko
bincode_tests: bincode_tests.c bincodegen.h bincode.h

View File

@@ -20,7 +20,6 @@ extern int ternfs_fetch_block_timeout_jiffies;
extern int ternfs_write_block_timeout_jiffies;
extern int ternfs_block_service_connect_timeout_jiffies;
#define TERNFS_BLOCK_SERVICE_EXPECTED_PADDING 3
struct ternfs_block_service {
u64 id;
u32 ip1;
@@ -28,14 +27,8 @@ struct ternfs_block_service {
u16 port1;
u16 port2;
u8 flags;
u8 _[TERNFS_BLOCK_SERVICE_EXPECTED_PADDING];
};
// Compares two block service descriptors for equality.
// Only the bytes that precede the trailing padding member `_` take part in the
// comparison, so the contents of the padding never influence the result.
static inline bool ternfs_block_services_equal(struct ternfs_block_service* l, struct ternfs_block_service* r) {
    // The struct must be exactly "compared prefix + expected padding"; if a field is
    // added after `_`, or the padding changes, this fails at build time.
    BUILD_BUG_ON(offsetof(struct ternfs_block_service, _) + TERNFS_BLOCK_SERVICE_EXPECTED_PADDING != sizeof(struct ternfs_block_service));
    if (memcmp(l, r, offsetof(struct ternfs_block_service, _)) != 0) {
        return false;
    }
    return true;
}
// Returns an error immediately if it can't connect to the block service or anyway
// if it thinks the block service is no good.
//

View File

@@ -1,139 +0,0 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/hashtable.h>
#include <linux/stringhash.h>
#include <linux/slab.h>
#include "block_services.h"
#include "trace.h"
// One entry of the block service table. Entries are created by
// ternfs_upsert_block_service(); once inserted into the table they are only
// freed by ternfs_block_service_exit().
struct ternfs_stored_block_service {
// Link into a bucket of the `block_services` hash table.
struct hlist_node hnode;
// Held by writers while they swap the `bs` pointer.
spinlock_t lock;
// Block service id; this is the key used for lookups.
u64 id;
// Current descriptor for this block service. Readers access it with
// rcu_dereference() inside an RCU read-side section; writers replace it with
// rcu_assign_pointer() while holding `lock`.
struct ternfs_block_service __rcu* bs;
};
// Size of the block service hash table, as a power of two.
#define BS_BITS 14
#define BS_BUCKETS (1<<BS_BITS) // 16k, we currently have 10k disks
// Table of all known block services, keyed by block service id.
static DECLARE_HASHTABLE(block_services, BS_BITS);
// One spinlock per hash bucket. In this file it is taken when a new node is
// inserted into the corresponding bucket.
static spinlock_t block_services_locks[BS_BUCKETS];
// Looks up the stored node whose id equals `bs_id`.
// Walks the hash bucket that `bs_id` maps to and returns the first node with a
// matching id, or NULL when the bucket holds no such node.
static struct ternfs_stored_block_service* find_block_service(u64 bs_id) {
    struct ternfs_stored_block_service* found = NULL;
    struct ternfs_stored_block_service* candidate;
    hash_for_each_possible_rcu(block_services, candidate, hnode, bs_id) {
        // Several ids can share a bucket, so the id still has to be compared.
        if (likely(candidate->id == bs_id)) {
            found = candidate;
            break;
        }
    }
    return found;
}
// Inserts `bs` into the block service table, or refreshes the stored copy when a
// node with the same id already exists but its contents differ.
//
// `bs` is copied; the caller keeps ownership of it.
//
// Returns the stored node for `bs->id`, or ERR_PTR(-ENOMEM) if an allocation fails.
// Allocates with GFP_KERNEL and, on the replace path, calls synchronize_rcu(), so
// this must only be called from a context that is allowed to sleep.
struct ternfs_stored_block_service* ternfs_upsert_block_service(struct ternfs_block_service* bs) {
// It is currently not possible to remove block services from the hash map.
// Furthermore, returned block service nodes are stored for an arbitrary time period, so this can't
// really work with RCU. If we add support for removing block services this code needs rethinking.
struct ternfs_stored_block_service* bs_node = find_block_service(bs->id);
if (likely(bs_node != NULL)) {
// We found one, check if we need to update
rcu_read_lock();
{
struct ternfs_block_service* existing_bs = rcu_dereference(bs_node->bs);
if (ternfs_block_services_equal(existing_bs, bs)) { // still the same, no update needed
rcu_read_unlock();
trace_eggsfs_upsert_block_service(bs->id, TERNFS_UPSERT_BLOCKSERVICE_MATCH);
return bs_node;
}
}
rcu_read_unlock();
trace_eggsfs_upsert_block_service(bs->id, TERNFS_UPSERT_BLOCKSERVICE_NOMATCH);
// Things differ, we do need to update
// The replacement is allocated before the node lock is taken.
struct ternfs_block_service* new_bs = kmalloc(sizeof(struct ternfs_block_service), GFP_KERNEL);
if (new_bs == NULL) {
return ERR_PTR(-ENOMEM);
}
memcpy(new_bs, bs, sizeof(*bs));
// Swap the pointers
// The old pointer is read under the node lock, so concurrent updaters each
// take over (and later free) a different old body.
spin_lock(&bs_node->lock);
struct ternfs_block_service* old_bs = rcu_dereference_protected(bs_node->bs, lockdep_is_held(&bs_node->lock));
rcu_assign_pointer(bs_node->bs, new_bs);
spin_unlock(&bs_node->lock);
// TODO: switch to call_rcu
// Wait for readers that may still be using old_bs before freeing it.
synchronize_rcu();
kfree(old_bs);
return bs_node;
}
trace_eggsfs_upsert_block_service(bs->id, TERNFS_UPSERT_BLOCKSERVICE_NEW);
// We need to add a new one. Allocate both struct and body
struct ternfs_stored_block_service* new_bs_node = kmalloc(sizeof(struct ternfs_stored_block_service), GFP_KERNEL);
if (new_bs_node == NULL) {
return ERR_PTR(-ENOMEM);
}
struct ternfs_block_service* new_bs = kmalloc(sizeof(struct ternfs_block_service), GFP_KERNEL);
if (new_bs == NULL) {
kfree(new_bs_node);
return ERR_PTR(-ENOMEM);
}
memcpy(new_bs, bs, sizeof(*bs));
// The node is fully initialized before it is published in the table below.
rcu_assign_pointer(new_bs_node->bs, new_bs);
new_bs_node->id = bs->id;
spin_lock_init(&new_bs_node->lock);
// Hashing not strictly needed, the block service ids are already
// random...
int bucket = hash_min(bs->id, HASH_BITS(block_services));
// The per-bucket lock serializes inserters into the same bucket.
spin_lock(&block_services_locks[bucket]);
// Check if somebody got to it first
bs_node = find_block_service(bs->id);
if (unlikely(bs_node != NULL)) {
// Let's not bother updating to our thing in this racy case
spin_unlock(&block_services_locks[bucket]);
// Our node was never published, so it can be freed right away.
kfree(new_bs);
kfree(new_bs_node);
return bs_node;
}
// Add it
hlist_add_head_rcu(&new_bs_node->hnode, &block_services[bucket]);
spin_unlock(&block_services_locks[bucket]);
return new_bs_node;
}
// Copies the current descriptor of the stored node `bs_node` into `out_bs`.
// The copy is taken inside an RCU read-side section, so the descriptor being read
// cannot be freed by a concurrent update while it is copied.
void ternfs_get_block_service(struct ternfs_stored_block_service* bs_node, struct ternfs_block_service* out_bs) {
    struct ternfs_block_service* current_bs;

    rcu_read_lock();
    current_bs = rcu_dereference(bs_node->bs);
    memcpy(out_bs, current_bs, sizeof(*current_bs));
    rcu_read_unlock();
}
// Prepares the block service table for use by initializing every per-bucket
// spinlock. Always returns 0.
int ternfs_block_service_init(void) {
    int bucket = 0;
    while (bucket < BS_BUCKETS) {
        spin_lock_init(&block_services_locks[bucket]);
        bucket++;
    }
    return 0;
}
void ternfs_block_service_exit(void) {
int bucket;
struct hlist_node* tmp;
struct ternfs_stored_block_service* bs;
// While this pattern is not safe in general, at this point everything should be unmounted
// and nothing should be accessing block services anyway
rcu_read_lock();
hash_for_each_safe(block_services, bucket, tmp, bs, hnode) {
kfree(rcu_dereference(bs->bs));
kfree(bs);
}
rcu_read_unlock();
}

View File

@@ -1,24 +0,0 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: GPL-2.0-or-later
// This cache is never cleared -- but it is small. For 100k disks, which is
// what we're targeting, it'd be ~5MB.
#ifndef _TERNFS_BLOCK_SERVICE_H
#define _TERNFS_BLOCK_SERVICE_H
#include "block.h"
// Opaque handle to a stored block service; the layout is private to block_services.c.
struct ternfs_stored_block_service;
// Creates or updates a specific block service; `bs` is copied. Fast when the block
// service is already stored and unchanged. Otherwise it allocates, and when it
// replaces an existing descriptor it also waits for an RCU grace period.
// Returns the stored node, or ERR_PTR(-ENOMEM) if an allocation fails.
struct ternfs_stored_block_service* ternfs_upsert_block_service(struct ternfs_block_service* bs);
// Copies the current descriptor of the stored block service `bs_node` into `bs`.
void ternfs_get_block_service(struct ternfs_stored_block_service* bs_node, struct ternfs_block_service* bs);
// Initializes the per-bucket locks of the block service table. Returns 0.
int __init ternfs_block_service_init(void);
// Frees every stored block service. Intended to run once nothing can access the
// table any more.
void __cold ternfs_block_service_exit(void);
#endif

View File

@@ -23,7 +23,6 @@
#include "file.h"
#include "debugfs.h"
#include "policy.h"
#include "block_services.h"
MODULE_LICENSE("GPL");
@@ -53,9 +52,6 @@ static int __init ternfs_init(void) {
err = ternfs_policy_init();
if (err) { goto out_policy; }
err = ternfs_block_service_init();
if (err) { goto out_block_service; }
err = ternfs_sysfs_init();
if (err) { goto out_sysfs; }
@@ -102,8 +98,6 @@ out_block:
out_sysctl:
ternfs_sysfs_exit();
out_sysfs:
ternfs_block_service_exit();
out_block_service:
ternfs_policy_exit();
out_policy:
ternfs_rs_exit();
@@ -124,7 +118,6 @@ static void __exit ternfs_exit(void) {
ternfs_block_exit();
ternfs_sysctl_exit();
ternfs_sysfs_exit();
ternfs_block_service_exit();
ternfs_policy_exit();
ternfs_rs_exit();

View File

@@ -22,9 +22,8 @@
#include "trace.h"
#include "intrshims.h"
#include "sysctl.h"
#include "block_services.h"
int ternfs_span_cache_retention_jiffies = 24 * 60 * 60 * HZ; // 1 day
int ternfs_span_cache_retention_jiffies = 10 * 60 * HZ; // 10 minutes
static struct kmem_cache* ternfs_block_span_cachep;
static struct kmem_cache* ternfs_inline_span_cachep;
@@ -375,16 +374,13 @@ static int fetch_span_blocks(struct fetch_span_pages_state* st) {
}
}
//fetch_stripe_trace(st, TERNFS_FETCH_STRIPE_BLOCK_START, i, 0);
hold_fetch_span_pages(st);
ternfs_debug("block start st=%p block_service=%016llx block_id=%016llx", st, block->id, block->id);
struct ternfs_block_service bs;
ternfs_get_block_service(block->bs, &bs);
// Fetches a single cell from the block
int block_err = ternfs_fetch_block_pages_with_crc(
&span_block_done,
(void*)st,
&bs, &st->blocks_pages[i], span->span.ino, block->id, block->crc, st->start_offset, st->size
&block->bs, &st->blocks_pages[i], span->span.ino, block->id, block->crc, st->start_offset, st->size
);
if (block_err) {
BUG_ON(list_empty(&st->blocks_pages[i]));
@@ -906,19 +902,13 @@ static void file_spans_cb_block(
block->id = block_id;
block->crc = crc;
// Populate bs cache
struct ternfs_block_service bs;
bs.id = bs_id;
bs.ip1 = ip1;
bs.port1 = port1;
bs.ip2 = ip2;
bs.port2 = port2;
bs.flags = flags;
block->bs = ternfs_upsert_block_service(&bs);
if (IS_ERR(block->bs)) {
ctx->err = PTR_ERR(block->bs);
return;
}
struct ternfs_block_service* bs = &block->bs;
bs->id = bs_id;
bs->ip1 = ip1;
bs->port1 = port1;
bs->ip2 = ip2;
bs->port2 = port2;
bs->flags = flags;
}
static void file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) {

View File

@@ -59,7 +59,7 @@ struct ternfs_inline_span {
})
struct ternfs_block {
struct ternfs_stored_block_service* bs;
struct ternfs_block_service bs;
u64 id;
u32 crc;
};

View File

@@ -12,7 +12,6 @@
#include <linux/workqueue.h>
#include <linux/parser.h>
#include "block_services.h"
#include "log.h"
#include "inode.h"
#include "export.h"
@@ -171,92 +170,6 @@ static int ternfs_refresh_fs_info(struct ternfs_fs_info* info) {
atomic64_set(&info->cdc_addr2, ternfs_mk_addr(cdc_ip2.x, cdc_port2.x));
}
}
{
{
char changed_block_services_req[TERNFS_REGISTRY_REQ_HEADER_SIZE + TERNFS_LOCAL_CHANGED_BLOCK_SERVICES_REQ_SIZE];
struct ternfs_bincode_put_ctx ctx = {
.start = changed_block_services_req + TERNFS_REGISTRY_REQ_HEADER_SIZE,
.cursor = changed_block_services_req + TERNFS_REGISTRY_REQ_HEADER_SIZE,
.end = changed_block_services_req + sizeof(changed_block_services_req),
};
ternfs_local_changed_block_services_req_put_start(&ctx, start);
ternfs_local_changed_block_services_req_put_changed_since(&ctx, start, changed_since, info->block_services_last_changed_time);
ternfs_local_changed_block_services_req_put_end(&ctx, changed_since, end);
ternfs_write_registry_req_header(changed_block_services_req, TERNFS_LOCAL_CHANGED_BLOCK_SERVICES_REQ_SIZE, TERNFS_REGISTRY_LOCAL_CHANGED_BLOCK_SERVICES);
if (err = sendloop(registry_sock, changed_block_services_req, sizeof changed_block_services_req), err < 0) goto out_sock;
}
u32 registry_resp_len;
u8 registry_resp_kind;
{
char block_services_resp_header[TERNFS_REGISTRY_RESP_HEADER_SIZE];
if (err = recvloop(registry_sock, block_services_resp_header, sizeof block_services_resp_header), err < 0) goto out_sock;
err = ternfs_read_registry_resp_header(block_services_resp_header, &registry_resp_len, &registry_resp_kind);
if (err < 0) { goto out_sock; }
}
u64 last_changed;
u16 block_services_len;
{
char last_changed_and_len[sizeof(last_changed) + sizeof(block_services_len)];
if (registry_resp_len < sizeof(last_changed_and_len)) {
ternfs_debug("expected size of at least %ld for BlockServicesWithFlagChangeResp, got %d", sizeof(last_changed_and_len), registry_resp_len);
err = -EINVAL;
goto out_sock;
}
if (err = recvloop(registry_sock, last_changed_and_len, sizeof last_changed_and_len), err < 0) goto out_sock;
last_changed = get_unaligned_le64(last_changed_and_len);
block_services_len = get_unaligned_le16(last_changed_and_len + sizeof(last_changed));
registry_resp_len -= sizeof(last_changed_and_len);
}
{
if (registry_resp_len != TERNFS_BLOCK_SERVICE_SIZE * block_services_len) {
ternfs_debug("expected size of at least %d for %d BlockServices in BlockServicesWithFlagChangeResp, got %d",
TERNFS_BLOCK_SERVICE_SIZE * block_services_len, block_services_len, registry_resp_len);
err = -EINVAL;
goto out_sock;
}
u16 block_service_idx;
for (block_service_idx = 0; block_service_idx < block_services_len; block_service_idx++) {
char block_service_buf[TERNFS_BLOCK_SERVICE_SIZE];
if (err = recvloop(registry_sock, block_service_buf, sizeof block_service_buf), err < 0) goto out_sock;
struct ternfs_bincode_get_ctx bs_ctx = {
.buf = block_service_buf,
.end = block_service_buf + sizeof(block_service_buf),
.err = 0,
};
ternfs_block_service_get_start(&bs_ctx, start);
ternfs_block_service_get_addrs(&bs_ctx, start, addr_start);
ternfs_addrs_info_get_addr1(&bs_ctx, addr_start, ipport1_start);
ternfs_ip_port_get_addrs(&bs_ctx, ipport1_start, ip1);
ternfs_ip_port_get_port(&bs_ctx, ip1, port1);
ternfs_ip_port_get_end(&bs_ctx, port1, ipport1_end);
ternfs_addrs_info_get_addr2(&bs_ctx, ipport1_end, ipport2_start);
ternfs_ip_port_get_addrs(&bs_ctx, ipport2_start, ip2);
ternfs_ip_port_get_port(&bs_ctx, ip2, port2);
ternfs_ip_port_get_end(&bs_ctx, port2, ipport2_end);
ternfs_addrs_info_get_end(&bs_ctx, ipport2_end, addr_end);
ternfs_block_service_get_id(&bs_ctx, addr_end, bs_id);
ternfs_block_service_get_flags(&bs_ctx, bs_id, bs_flags);
ternfs_block_service_get_end(&bs_ctx, bs_flags, end);
ternfs_block_service_get_finish(&bs_ctx, end);
if (bs_ctx.err != 0) { err = ternfs_error_to_linux(bs_ctx.err); goto out_sock; }
struct ternfs_block_service bs;
bs.id = bs_id.x;
bs.ip1 = ip1.x;
bs.port1 = port1.x;
bs.ip2 = ip2.x;
bs.port2 = port2.x;
bs.flags = bs_flags.x;
struct ternfs_stored_block_service* sbs = ternfs_upsert_block_service(&bs);
if (IS_ERR(sbs)) {
err = PTR_ERR(sbs);
goto out_sock;
}
}
}
info->block_services_last_changed_time = last_changed;
}
{
static_assert(TERNFS_INFO_REQ_SIZE == 0);
char info_req[TERNFS_REGISTRY_REQ_HEADER_SIZE];
@@ -409,12 +322,6 @@ static struct ternfs_fs_info* ternfs_init_fs_info(struct net* net, const char* d
err = ternfs_init_shard_socket(&ternfs_info->sock);
if (err) { goto out_addr; }
// For the first update we will ask for everything that changed in the last day.
// This is more than enough time for any older changes to be visible to shards and propagated through block info.
u64 atime_ns = ktime_get_real_ns();
atime_ns -= min(atime_ns, 86400000000000ull);
ternfs_info->block_services_last_changed_time = atime_ns;
err = ternfs_refresh_fs_info(ternfs_info);
if (err != 0) { goto out_socket; }

View File

@@ -28,8 +28,6 @@ struct ternfs_fs_info {
atomic64_t capacity;
atomic64_t available;
u64 block_services_last_changed_time;
struct delayed_work registry_refresh_work;
kuid_t uid;