// Copyright 2025 XTX Markets Technologies Limited // // SPDX-License-Identifier: GPL-2.0-or-later #include "super.h" #include #include #include #include #include #include #include #include "log.h" #include "inode.h" #include "export.h" #include "metadata.h" #include "dentry.h" #include "net.h" #include "registry.h" #include "bincode.h" #include "sysfs.h" #include "err.h" #include "rs.h" #define MSECS_TO_JIFFIES(_ms) ((_ms * HZ) / 1000) int ternfs_registry_refresh_time_jiffies = MSECS_TO_JIFFIES(60000); unsigned int ternfs_readahead_pages = 10000; static void ternfs_free_fs_info(struct ternfs_fs_info* info) { ternfs_debug("info=%p", info); cancel_delayed_work_sync(&info->registry_refresh_work); ternfs_net_shard_free_socket(&info->sock); put_net(info->registry_addr.net); kfree(info->registry_addr.addr); kfree(info); } static inline int recvloop(struct socket *sock, char *target, size_t size) { int read_so_far; for (read_so_far = 0; read_so_far < size;) { struct kvec iov = { .iov_base = target + read_so_far, .iov_len = size - read_so_far, }; struct msghdr msg = {NULL}; int read = kernel_recvmsg(sock, &msg, &iov, 1, iov.iov_len, 0); if (read == 0) return -ECONNRESET; if (read < 0) return read; read_so_far += read; } return read_so_far; } static inline int sendloop(struct socket *sock, char *target, size_t size) { int written_so_far; for (written_so_far = 0; written_so_far < size;) { struct kvec iov = { .iov_base = target + written_so_far, .iov_len = size - written_so_far, }; struct msghdr msg = {NULL}; int written = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); if (written < 0) return written; written_so_far += written; } return written_so_far; } static int ternfs_refresh_fs_info(struct ternfs_fs_info* info) { int err; struct socket* registry_sock; err = ternfs_create_registry_socket(&info->registry_addr, ®istry_sock); if (err < 0) { return err; } { static_assert(TERNFS_LOCAL_SHARDS_REQ_SIZE == 0); char registry_req[TERNFS_REGISTRY_REQ_HEADER_SIZE]; ternfs_write_registry_req_header(registry_req, TERNFS_LOCAL_SHARDS_REQ_SIZE, TERNFS_REGISTRY_LOCAL_SHARDS); if (err = sendloop(registry_sock, registry_req, sizeof registry_req), err < 0) goto out_sock; char shards_resp_header[TERNFS_REGISTRY_RESP_HEADER_SIZE + 2]; // + 2 = list len if (err = recvloop(registry_sock, shards_resp_header, sizeof shards_resp_header), err < 0) goto out_sock; u32 registry_resp_len; u8 registry_resp_kind; err = ternfs_read_registry_resp_header(shards_resp_header, ®istry_resp_len, ®istry_resp_kind); if (err < 0) { goto out_sock; } u16 shard_info_len = get_unaligned_le16(shards_resp_header + sizeof(shards_resp_header) - 2); if (shard_info_len != 256) { ternfs_info("expected 256 shard infos, got %d", shard_info_len); err = -EIO; goto out_sock; } if (registry_resp_len != 2 + TERNFS_SHARD_INFO_SIZE*256) { ternfs_info("expected size of %d, got %d", 2 + TERNFS_SHARD_INFO_SIZE*256, registry_resp_len); err = -EIO; goto out_sock; } int shid; for (shid = 0; shid < 256; shid++) { char shard_info_resp[TERNFS_SHARD_INFO_SIZE]; if (err = recvloop(registry_sock, shard_info_resp, sizeof shard_info_resp), err < 0) goto out_sock; struct ternfs_bincode_get_ctx ctx = { .buf = shard_info_resp, .end = shard_info_resp + sizeof(shard_info_resp), .err = 0, }; ternfs_shard_info_get_start(&ctx, start); ternfs_shard_info_get_addrs(&ctx, start, addr_start); ternfs_addrs_info_get_addr1(&ctx, addr_start, ipport1_start); ternfs_ip_port_get_addrs(&ctx, ipport1_start, shard_ip1); ternfs_ip_port_get_port(&ctx, shard_ip1, shard_port1); ternfs_ip_port_get_end(&ctx, shard_port1, ipport1_end); ternfs_addrs_info_get_addr2(&ctx, ipport1_end, ipport2_start); ternfs_ip_port_get_addrs(&ctx, ipport2_start, shard_ip2); ternfs_ip_port_get_port(&ctx, shard_ip2, shard_port2); ternfs_ip_port_get_end(&ctx, shard_port2, ipport2_end); ternfs_addrs_info_get_end(&ctx, ipport2_end, addr_end); ternfs_shard_info_get_last_seen(&ctx, addr_end, last_seen); ternfs_shard_info_get_end(&ctx, last_seen, end); ternfs_shard_info_get_finish(&ctx, end); if (ctx.err != 0) { err = ternfs_error_to_linux(ctx.err); goto out_sock; } atomic64_set(&info->shard_addrs1[shid], ternfs_mk_addr(shard_ip1.x, shard_port1.x)); atomic64_set(&info->shard_addrs2[shid], ternfs_mk_addr(shard_ip2.x, shard_port2.x)); } } { static_assert(TERNFS_LOCAL_CDC_REQ_SIZE == 0); char cdc_req[TERNFS_REGISTRY_REQ_HEADER_SIZE]; ternfs_write_registry_req_header(cdc_req, TERNFS_LOCAL_CDC_REQ_SIZE, TERNFS_REGISTRY_LOCAL_CDC); if (err = sendloop(registry_sock, cdc_req, sizeof cdc_req), err < 0) goto out_sock; char cdc_resp_header[TERNFS_REGISTRY_RESP_HEADER_SIZE]; if (err = recvloop(registry_sock, cdc_resp_header, sizeof cdc_resp_header), err < 0) goto out_sock; u32 registry_resp_len; u8 registry_resp_kind; err = ternfs_read_registry_resp_header(cdc_resp_header, ®istry_resp_len, ®istry_resp_kind); if (err < 0) { goto out_sock; } if (registry_resp_len != TERNFS_LOCAL_CDC_RESP_SIZE) { ternfs_debug("expected size of %d, got %d", TERNFS_LOCAL_CDC_RESP_SIZE, registry_resp_len); err = -EINVAL; goto out_sock; } { char cdc_resp[TERNFS_LOCAL_CDC_RESP_SIZE]; if (err = recvloop(registry_sock, cdc_resp, sizeof cdc_resp), err < 0) goto out_sock; struct ternfs_bincode_get_ctx ctx = { .buf = cdc_resp, .end = cdc_resp + sizeof(cdc_resp), .err = 0, }; ternfs_local_cdc_resp_get_start(&ctx, start); ternfs_local_cdc_resp_get_addrs(&ctx, start, addr_start); ternfs_addrs_info_get_addr1(&ctx, addr_start, ipport1_start); ternfs_ip_port_get_addrs(&ctx, ipport1_start, cdc_ip1); ternfs_ip_port_get_port(&ctx, cdc_ip1, cdc_port1); ternfs_ip_port_get_end(&ctx, cdc_port1, ipport1_end); ternfs_addrs_info_get_addr2(&ctx, ipport1_end, ipport2_start); ternfs_ip_port_get_addrs(&ctx, ipport2_start, cdc_ip2); ternfs_ip_port_get_port(&ctx, cdc_ip2, cdc_port2); ternfs_ip_port_get_end(&ctx, cdc_port2, ipport2_end); ternfs_addrs_info_get_end(&ctx, ipport2_end, addr_end); ternfs_local_cdc_resp_get_last_seen(&ctx, addr_end, last_seen); ternfs_local_cdc_resp_get_end(&ctx, last_seen, end); ternfs_local_cdc_resp_get_finish(&ctx, end); if (ctx.err != 0) { err = ternfs_error_to_linux(ctx.err); goto out_sock; } atomic64_set(&info->cdc_addr1, ternfs_mk_addr(cdc_ip1.x, cdc_port1.x)); atomic64_set(&info->cdc_addr2, ternfs_mk_addr(cdc_ip2.x, cdc_port2.x)); } } { static_assert(TERNFS_INFO_REQ_SIZE == 0); char info_req[TERNFS_REGISTRY_REQ_HEADER_SIZE]; ternfs_write_registry_req_header(info_req, 0, TERNFS_REGISTRY_INFO); if (err = sendloop(registry_sock, info_req, sizeof info_req), err < 0) goto out_sock; char info_resp_header[TERNFS_REGISTRY_RESP_HEADER_SIZE]; if (err = recvloop(registry_sock, info_resp_header, sizeof info_resp_header), err < 0) goto out_sock; u32 info_resp_len; u8 info_resp_kind; err = ternfs_read_registry_resp_header(info_resp_header, &info_resp_len, &info_resp_kind); if (err < 0) { goto out_sock; } if (info_resp_len != TERNFS_INFO_RESP_SIZE) { ternfs_info("expected size of %d, got %d", TERNFS_INFO_RESP_SIZE, info_resp_len); err = -EIO; goto out_sock; } char info_resp[TERNFS_INFO_RESP_SIZE]; if (err = recvloop(registry_sock, info_resp, sizeof info_resp), err < 0) goto out_sock; struct ternfs_bincode_get_ctx ctx = { .buf = info_resp, .end = info_resp + sizeof(info_resp), .err = 0, }; ternfs_info_resp_get_start(&ctx, start); ternfs_info_resp_get_num_block_services(&ctx, start, num_block_services); ternfs_info_resp_get_num_failure_domains(&ctx, num_block_services, num_failure_domains); ternfs_info_resp_get_capacity(&ctx, num_failure_domains, capacity); ternfs_info_resp_get_available(&ctx, capacity, available); ternfs_info_resp_get_blocks(&ctx, available, blocks); ternfs_info_resp_get_end(&ctx, blocks, end); ternfs_info_resp_get_finish(&ctx, end); if (ctx.err != 0) { ternfs_info("response error %s (%d)", ternfs_err_str(ctx.err), ctx.err); err = ternfs_error_to_linux(ctx.err); goto out_sock; } atomic64_set(&(info->available), available.x); atomic64_set(&(info->capacity), capacity.x); } sock_release(registry_sock); return 0; out_sock: sock_release(registry_sock); return err; } static void ternfs_registry_refresh_work(struct work_struct* work) { struct ternfs_fs_info* info = container_of(container_of(work, struct delayed_work, work), struct ternfs_fs_info, registry_refresh_work); int err = ternfs_refresh_fs_info(info); if (err != 0 && err != -EAGAIN) { ternfs_warn("failed to refresh registry data: %d", err); } ternfs_debug("scheduling registry data refresh after %dms", jiffies_to_msecs(ternfs_registry_refresh_time_jiffies)); queue_delayed_work(system_long_wq, &info->registry_refresh_work, ternfs_registry_refresh_time_jiffies); } enum { Opt_err, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask }; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%u"}, {Opt_dmask, "dmask=%u"}, {Opt_fmask, "fmask=%u"}, {Opt_err, NULL} }; static int ternfs_parse_options(char* options, struct ternfs_fs_info* ternfs_info) { char *p; int option; substring_t args[MAX_OPT_ARGS]; // defaults ternfs_info->uid = make_kuid(&init_user_ns, 1000); ternfs_info->gid = make_kgid(&init_user_ns, 1000); ternfs_info->dmask = 0000; ternfs_info->fmask = 0111; if (!options) return 0; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; ternfs_info->uid = make_kuid(&init_user_ns, option); if (!uid_valid(ternfs_info->uid)) return -EINVAL; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; ternfs_info->gid = make_kgid(&init_user_ns, option); if (!gid_valid(ternfs_info->gid)) return -EINVAL; break; case Opt_umask: if (match_octal(&args[0], &option)) return -EINVAL; ternfs_info->fmask = ternfs_info->dmask = option; break; case Opt_dmask: if (match_octal(&args[0], &option)) return -EINVAL; ternfs_info->dmask = option; break; case Opt_fmask: if (match_octal(&args[0], &option)) return -EINVAL; ternfs_info->fmask = option; break; default: ternfs_error("Invalid mount option \"%s\"", p); return -EINVAL; } } return 0; } static struct ternfs_fs_info* ternfs_init_fs_info(struct net* net, const char* dev_name, char* options) { int err; struct ternfs_fs_info* ternfs_info = kzalloc(sizeof(struct ternfs_fs_info), GFP_KERNEL); if (!ternfs_info) { err = -ENOMEM; goto out; } ternfs_info->registry_addr.net = get_net(net); ternfs_info->registry_addr.addr = kmalloc(strlen(dev_name)+1, GFP_KERNEL); if (!ternfs_info->registry_addr.addr) { err = -ENOMEM; goto out_info; } memcpy(ternfs_info->registry_addr.addr, dev_name, strlen(dev_name)+1); err = ternfs_parse_options(options, ternfs_info); if (err) { goto out_addr; } err = ternfs_init_shard_socket(&ternfs_info->sock); if (err) { goto out_addr; } err = ternfs_refresh_fs_info(ternfs_info); if (err != 0) { goto out_socket; } INIT_DELAYED_WORK(&ternfs_info->registry_refresh_work, ternfs_registry_refresh_work); queue_delayed_work(system_long_wq, &ternfs_info->registry_refresh_work, ternfs_registry_refresh_time_jiffies); return ternfs_info; out_socket: ternfs_net_shard_free_socket(&ternfs_info->sock); out_addr: kfree(ternfs_info->registry_addr.addr); out_info: put_net(ternfs_info->registry_addr.net); kfree(ternfs_info); out: return ERR_PTR(err); } static void ternfs_put_super(struct super_block* sb) { ternfs_debug("sb=%p", sb); ternfs_free_fs_info(sb->s_fs_info); sb->s_fs_info = NULL; } #define TERNFS_SUPER_MAGIC 0x45474753 // EGGS static int ternfs_statfs(struct dentry* dentry, struct kstatfs* stats) { struct ternfs_fs_info* info = (struct ternfs_fs_info*)dentry->d_sb->s_fs_info; stats->f_type = TERNFS_SUPER_MAGIC; stats->f_bsize = PAGE_SIZE; stats->f_frsize = PAGE_SIZE; stats->f_blocks = atomic64_read(&info->capacity) / PAGE_SIZE; stats->f_bfree = atomic64_read(&info->available) / PAGE_SIZE; stats->f_bavail = atomic64_read(&info->available) / PAGE_SIZE; stats->f_files = -1; stats->f_ffree = -1; stats->f_namelen = TERNFS_MAX_FILENAME; return 0; } static const struct super_operations ternfs_super_ops = { .alloc_inode = ternfs_inode_alloc, .evict_inode = ternfs_inode_evict, .free_inode = ternfs_inode_free, .put_super = ternfs_put_super, .statfs = ternfs_statfs, }; static struct dentry* ternfs_mount(struct file_system_type* fs_type, int flags, const char* dev_name, void* data) { int err; ternfs_info("mounting at %s", dev_name); ternfs_debug("fs_type=%p flags=%d dev_name=%s data=%s", fs_type, flags, dev_name, data ? (char*)data : ""); struct ternfs_fs_info* info = ternfs_init_fs_info(current->nsproxy->net_ns, dev_name, data); if (IS_ERR(info)) { err = PTR_ERR(info); goto out_err; } struct super_block* sb = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out_info; } sb->s_fs_info = info; sb->s_flags = SB_NOSUID | SB_NODEV | SB_NOEXEC | SB_NOATIME | SB_NODIRATIME; sb->s_iflags = SB_I_NOEXEC | SB_I_NODEV; sb->s_op = &ternfs_super_ops; sb->s_d_op = &ternfs_dentry_ops; sb->s_export_op = &ternfs_export_ops; sb->s_time_gran = 1; sb->s_time_min = 0; sb->s_time_max = U64_MAX/1000000000ull; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_magic = TERNFS_SUPER_MAGIC; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_bdi = &noop_backing_dev_info; sb->s_bdi->ra_pages = ternfs_readahead_pages; struct inode* root = ternfs_get_inode_normal(sb, NULL, TERNFS_ROOT_INODE); if (IS_ERR(root)) { err = PTR_ERR(root); goto out_sb; } struct ternfs_inode* root_enode = TERNFS_I(root); err = ternfs_do_getattr(root_enode, ATTR_CACHE_NORM_TIMEOUT); if (err) { goto out_sb; } if (!root_enode->block_policy || !root_enode->span_policy || !root_enode->stripe_policy) { ternfs_warn("no policies for root directory!"); err = -EIO; goto out_sb; } sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_sb; } sb->s_flags |= SB_ACTIVE; return dget(sb->s_root); out_sb: deactivate_locked_super(sb); out_info: ternfs_free_fs_info(info); out_err: ternfs_debug("failed err=%d", err); return ERR_PTR(err); } static void ternfs_kill_sb(struct super_block* sb) { ternfs_debug("sb=%p", sb); kill_anon_super(sb); } static struct file_system_type ternfs_fs_type = { .owner = THIS_MODULE, .name = "eggsfs", .mount = ternfs_mount, .kill_sb = ternfs_kill_sb, // TODO is FS_RENAME_DOES_D_MOVE needed? .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("eggsfs"); int __init ternfs_fs_init(void) { return register_filesystem(&ternfs_fs_type); } void __cold ternfs_fs_exit(void) { ternfs_debug("fs exit"); unregister_filesystem(&ternfs_fs_type); }