// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: GPL-2.0-or-later
#include "dir.h"
#include <linux/mm.h>
#include <linux/slab.h>
#include "log.h"
#include "err.h"
#include "inode.h"
#include "metadata.h"
#include "trace.h"
static struct kmem_cache* ternfs_dirents_cachep;
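
// Each cached directory entry is serialized back to back in a page:
//
//   ino (u64 le) | name_hash (u64 le) | name_len (u8) | name (name_len bytes)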
static inline int ternfs_dir_entry_size(int name_len) {
return 8 + 8 + 1 + name_len; // inode + name_hash + len + name
}
static inline void ternfs_dir_entry_parse(const char* buf, u64* ino, u64* name_hash, u8* name_len, const char** name) {
*ino = get_unaligned_le64(buf); buf += 8;
*name_hash = get_unaligned_le64(buf); buf += 8;
*name_len = *(u8*)buf; buf++;
*name = buf; buf += *name_len;
}
static inline u64 ternfs_dir_entry_hash(const char* buf) {
return get_unaligned_le64(buf + 8);
}
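
// The cache itself is a list of pages filled in hash order (the seek logic
// in ternfs_dir_read relies on this). Two struct page fields are repurposed
// as metadata:
//
//   page->private: name_hash of the first entry in the page (0 for the
//                  first page), used to find the right page when seeking;
//   page->index:   number of entries stored in the page, set when the
//                  page is sealed.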
static struct kmem_cache* ternfs_readdir_ctx_cachep;
struct ternfs_readdir_ctx {
    struct ternfs_dirents* dirents;
    struct page* current_page; // never NULL
    u32 page_offset; // byte offset of the next entry to emit in current_page
    u16 page_n; // index of the next entry to emit within current_page
};
// Increments the refcount of the cache under RCU.
// After this we know the dirents won't be deallocated until we
// decrease the refcount.
static struct ternfs_dirents* ternfs_dir_get_cache(struct ternfs_inode* enode) {
struct ternfs_dirents* dirents;
rcu_read_lock();
dirents = rcu_dereference(enode->dir.dirents);
if (dirents) { atomic_inc(&dirents->refcount); }
rcu_read_unlock();
return dirents;
}
// Decrements the refcount, and frees up all the pages in the cache if we
// were the last ones. Must be paired with an earlier ternfs_dir_get_cache.
//
// Note that enode->dir.dirents and each readdir ctx's ->dirents hold a
// reference too, so whoever clears those must call `ternfs_dir_put_cache`.
static void ternfs_dir_put_cache(struct ternfs_dirents* dirents) {
int refs = atomic_dec_return(&dirents->refcount);
BUG_ON(refs < 0);
if (refs == 0) {
put_pages_list(&dirents->pages);
kmem_cache_free(ternfs_dirents_cachep, dirents);
}
}
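
// Typical usage of the get/put pair above (see opendir/closedir below):
//
//   struct ternfs_dirents* dirents = ternfs_dir_get_cache(enode);
//   if (dirents) {
//       // ... read the cached pages ...
//       ternfs_dir_put_cache(dirents);
//   }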
static void __ternfs_dir_put_cache_rcu(struct rcu_head* rcu) {
struct ternfs_dirents* dirents = container_of(rcu, struct ternfs_dirents, rcu_head);
ternfs_dir_put_cache(dirents);
}
// Removes the cache from ->dir.dirents, and also calls `ternfs_dir_put_cache`
// to drop that reference. So calls to this, too, should be paired with a
// `ternfs_dir_get_cache`.
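//
// We defer the put by an RCU grace period because a concurrent
// ternfs_dir_get_cache may have dereferenced the pointer but not yet
// incremented the refcount; the callback only runs once all such readers
// are done.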
void ternfs_dir_drop_cache(struct ternfs_inode* enode) {
    if (!atomic64_read((atomic64_t*)&enode->dir.dirents)) { return; } // early exit, avoiding the xchg
// zero out dirents, and free it up after the grace period.
struct ternfs_dirents* dirents = (struct ternfs_dirents*)atomic64_xchg((atomic64_t*)&enode->dir.dirents, (u64)0);
if (likely(dirents)) {
call_rcu(&dirents->rcu_head, __ternfs_dir_put_cache_rcu);
}
}
static struct ternfs_dirents* ternfs_dir_read_all(struct dentry* dentry, u64 dir_mtime);
static int ternfs_dir_open(struct inode* inode, struct file* filp) {
struct ternfs_inode* enode = TERNFS_I(inode);
struct ternfs_readdir_ctx* ctx;
struct ternfs_dirents* dirents;
int err;
trace_eggsfs_vfs_opendir_enter(inode);
if (get_jiffies_64() >= enode->dir.mtime_expiry) {
err = ternfs_dir_revalidate(enode);
if (err) { return err; }
}
ctx = kmem_cache_alloc(ternfs_readdir_ctx_cachep, GFP_KERNEL);
if (!ctx) { return -ENOMEM; }
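
// The cache is (re)built with a double-checked pattern: look for a fresh
// cache, and if there isn't one, race for the latch. The winner re-checks
// (somebody might have rebuilt the cache while we raced), reads the whole
// directory, and publishes the result; losers sleep on the latch and then
// retry from `again`.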
again: // progress: whoever wins the latch either gets pages or fails
dirents = ternfs_dir_get_cache(enode);
if (dirents && dirents->mtime != READ_ONCE(enode->mtime)) {
ternfs_dir_put_cache(dirents);
ternfs_dir_drop_cache(enode);
dirents = NULL;
}
if (!dirents) {
int seqno;
if (!ternfs_latch_try_acquire(&enode->dir.dirents_latch, seqno)) {
ternfs_latch_wait(&enode->dir.dirents_latch, seqno);
goto again;
}
        // Somebody else might have built the cache while we raced for the
        // latch: re-check before building it ourselves, otherwise we'd leak.
dirents = ternfs_dir_get_cache(enode);
if (dirents && dirents->mtime != READ_ONCE(enode->mtime)) {
ternfs_dir_put_cache(dirents);
ternfs_dir_drop_cache(enode);
dirents = NULL;
}
if (!dirents) {
u64 dir_mtime = READ_ONCE(enode->mtime);
dirents = ternfs_dir_read_all(filp->f_path.dentry, dir_mtime);
if (!IS_ERR(dirents)) {
                // If the mtime still matches (which it will, unless somebody
                // else revalidated between the READ_ONCE above and now), the
                // refcount starts at two: one for ->dir.dirents, and one for
                // the ->dirents below. Otherwise just one, for ->dirents (the
                // pages stay private to this reader).
if (likely(READ_ONCE(enode->mtime) == dir_mtime)) {
atomic_set(&dirents->refcount, 2);
rcu_assign_pointer(enode->dir.dirents, dirents);
} else {
atomic_set(&dirents->refcount, 1);
}
}
}
ternfs_latch_release(&enode->dir.dirents_latch, seqno);
}
if (IS_ERR(dirents)) { err = PTR_ERR(dirents); goto out_err; }
ctx->dirents = dirents;
ctx->current_page = list_first_entry(&dirents->pages, struct page, lru);
ctx->page_offset = 0;
ctx->page_n = 0;
filp->private_data = ctx;
trace_eggsfs_vfs_opendir_exit(inode, 0);
return 0;
out_err:
kmem_cache_free(ternfs_readdir_ctx_cachep, ctx);
trace_eggsfs_vfs_opendir_exit(inode, err);
ternfs_debug("err=%d", err);
return err;
}
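
// Called for every entry we get back from readdir: inserts or refreshes the
// child's dentry in the dcache so that subsequent lookups don't need a round
// trip. d_time is set to the directory mtime the entry was read under; the
// dentry revalidation path (not in this file) can presumably compare it
// against the parent's current mtime to detect staleness.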
static void update_dcache(struct dentry* parent, u64 dir_seqno, const char* name, int name_len, u64 edge_creation_time, u64 ino) {
struct qstr filename = QSTR_INIT(name, name_len);
struct dentry* dentry;
struct dentry* alias;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
struct inode* parent_inode = parent->d_inode;
if (unlikely(parent_inode == NULL)) { // TODO can this ever happen?
ternfs_warn("got NULL in dentry parent (ino %016llx)", ino);
return;
}
ternfs_debug("parent=%pd name=%*pE ino=%llx", parent, name_len, name, ino);
filename.hash = full_name_hash(parent, filename.name, filename.len);
dentry = d_lookup(parent, &filename);
if (!dentry) {
again:
dentry = d_alloc_parallel(parent, &filename, &wq);
if (IS_ERR(dentry)) { return; }
}
struct ternfs_inode* enode = NULL;
if (!d_in_lookup(dentry)) { // d_alloc_parallel sets in_lookup
// pre-existing entry
struct inode* old_inode = d_inode(dentry);
if (old_inode && old_inode->i_ino == ino) {
WRITE_ONCE(dentry->d_time, dir_seqno);
enode = TERNFS_I(old_inode);
goto out;
}
d_invalidate(dentry);
dput(dentry);
goto again;
} else {
// new entry
struct inode* inode = ternfs_get_inode_normal(parent->d_sb, TERNFS_I(parent_inode), ino);
if (!IS_ERR(inode)) {
enode = TERNFS_I(inode);
enode->edge_creation_time = edge_creation_time;
}
ternfs_debug("inode=%p is_err=%d", inode, IS_ERR(inode));
// d_splice_alias propagates error in inode
WRITE_ONCE(dentry->d_time, dir_seqno);
alias = d_splice_alias(inode, dentry);
ternfs_debug("alias=%p is_err=%d", alias, IS_ERR(alias));
d_lookup_done(dentry);
if (alias) {
dput(dentry);
dentry = alias;
}
if (IS_ERR(dentry)) {
ternfs_debug("dentry err=%ld", PTR_ERR(dentry));
return;
}
}
out:
// Speculatively fetch stat info. Note that this will never
// block.
if (enode) {
int err = ternfs_start_async_getattr(enode);
if (err < 0 && err != -ERESTARTSYS) {
ternfs_warn("could not send async getattr: %d", err);
}
}
dput(dentry);
}
struct ternfs_dir_fill_ctx {
struct dentry* parent;
struct ternfs_dirents* dirents;
// kmap'd address of last page
char* page_addr;
// current offset inside last page
u32 page_off;
// Number of ternfs dir entries in the last page
u16 page_n;
};
static struct page* dir_fill_ctx_last_page(struct ternfs_dir_fill_ctx* ctx) {
return list_last_entry(&ctx->dirents->pages, struct page, lru);
}
static int ternfs_dir_add_entry(struct ternfs_dir_fill_ctx* ctx, u64 name_hash, const char* name, int name_len, u64 ino) {
if (name_len > TERNFS_MAX_FILENAME) { return -ENAMETOOLONG; }
// Flush current page, and start a new one
if (ctx->page_off + ternfs_dir_entry_size(name_len) > PAGE_SIZE) {
struct page* page = alloc_page(GFP_KERNEL);
if (!page) { return -ENOMEM; }
// unmap and store number of entries
kunmap(dir_fill_ctx_last_page(ctx));
dir_fill_ctx_last_page(ctx)->index = ctx->page_n;
// add new page, reset ctx indices
list_add_tail(&page->lru, &ctx->dirents->pages);
ctx->page_addr = kmap(page);
ctx->page_off = 0;
ctx->page_n = 0;
static_assert(sizeof(page->private) == sizeof(name_hash));
page->private = name_hash;
}
    // Defensive: name_len is serialized as a u8 and checked against
    // TERNFS_MAX_FILENAME above, so an entry is at most 8+8+1+255 = 272
    // bytes and always fits in a fresh page.
    if (WARN_ON(ctx->page_off + ternfs_dir_entry_size(name_len) > PAGE_SIZE)) {
        return -EIO;
    }
char* entry = ctx->page_addr + ctx->page_off;
put_unaligned_le64(ino, entry); entry += 8;
put_unaligned_le64(name_hash, entry); entry += 8;
*(u8*)entry = (u8)name_len; entry++;
memcpy(entry, name, name_len);
ctx->page_off += ternfs_dir_entry_size(name_len);
ctx->page_n++;
ctx->dirents->max_hash = name_hash;
return 0;
}
int ternfs_dir_readdir_entry_cb(void* ptr, u64 hash, const char* name, int name_len, u64 edge_creation_time, u64 ino) {
struct ternfs_dir_fill_ctx* ctx = ptr;
ternfs_debug("hash=%llx ino=%llx, name_len=%d, name=%*pE", hash, ino, name_len, name_len, name);
update_dcache(ctx->parent, ctx->dirents->mtime, name, name_len, edge_creation_time, ino);
return ternfs_dir_add_entry(ctx, hash, name, name_len, ino);
}
// Reads the whole directory into a fresh ternfs_dirents. Must be called
// holding enode->dir.dirents_latch (see ternfs_dir_open); the caller sets
// up the refcount.
static struct ternfs_dirents* ternfs_dir_read_all(struct dentry* dentry, u64 dir_mtime) {
struct inode* inode = d_inode(dentry);
struct ternfs_inode* enode = TERNFS_I(inode);
struct ternfs_dirents* dirents = kmem_cache_alloc(ternfs_dirents_cachep, GFP_KERNEL);
if (!dirents) { return ERR_PTR(-ENOMEM); }
dirents->mtime = dir_mtime;
INIT_LIST_HEAD(&dirents->pages);
atomic_set(&dirents->refcount, 0); // the caller sets this up
dirents->max_hash = 0;
int err;
struct page* first_page = alloc_page(GFP_KERNEL);
if (!first_page) {
err = -ENOMEM;
goto out_err_dirents;
}
list_add_tail(&first_page->lru, &dirents->pages);
first_page->private = 0; // first hash = 0
bool eof = false;
u64 continuation_key = 0;
struct ternfs_dir_fill_ctx ctx = {
.parent = dentry,
.dirents = dirents,
.page_addr = kmap(first_page),
.page_off = 0,
.page_n = 0,
};
while (!eof) {
ternfs_debug("cont_key=%llx", continuation_key);
err = ternfs_shard_readdir((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, continuation_key, &ctx, &continuation_key);
if (err) {
err = ternfs_error_to_linux(err);
goto out_err;
}
eof = continuation_key == 0;
}
kunmap(dir_fill_ctx_last_page(&ctx));
dir_fill_ctx_last_page(&ctx)->index = ctx.page_n;
return dirents;
out_err:
kunmap(dir_fill_ctx_last_page(&ctx));
put_pages_list(&ctx.dirents->pages);
out_err_dirents:
kmem_cache_free(ternfs_dirents_cachep, dirents);
ternfs_info("err=%d", err);
return ERR_PTR(err);
}
// vfs: shared inode.i_rwsem
// vfs: mutex file.f_pos_lock
static int ternfs_dir_read(struct file* filp, struct dir_context* dctx) {
struct ternfs_readdir_ctx* ctx = filp->private_data;
WARN_ON(!ctx);
if (!ctx) { return -EINVAL; }
ternfs_debug("ctx=%p ctx.current_page=%p ctx.dirents=%p ctx.page_offset=%u ctx.page_n=%u pos=%lld", ctx, ctx->current_page, ctx->dirents, ctx->page_offset, ctx->page_n, dctx->pos);
    // For our offsets we just use the edge hash. This is technically not
    // guaranteed to be correct: a collision would mean multiple entries
    // sharing the same offset. However this is very unlikely -- a directory
    // needs roughly 5 million entries before the probability of a collision
    // exceeds 1 in a million (assuming the hash we use is a good hash
    // function). And seeking in directories is somewhat uncommon -- the code
    // below just works for normal directory listings (it won't skip over
    // duplicate offsets, if any are present). So we accept it.
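    // (Birthday bound: with n entries and uniform 64-bit hashes the
    // collision probability is roughly n^2 / 2^65, so n = 6 * 10^6 gives
    // ~1e-6.)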
if (dctx->pos < 0) {
ternfs_warn("got negative pos %lld, this should never happen", dctx->pos);
return -EIO;
}
char* page_addr;
// The first thing we do is restore coherency between the dctx pos and the
// ctx pos, by making sure current_page is the correct one, and that we're
// positioned _before_ the entry we're looking for. The code below will then
// skip forward to the right entry.
if (dctx->pos == 0 || dctx->pos == 1) {
// If the context position is zero (.) or one (..), we've just started. We reset the
// context to the beginning and we're good to go.
ternfs_debug("we're at the beginning, will start with dots");
ctx->current_page = list_first_entry(&ctx->dirents->pages, struct page, lru);
ctx->page_offset = 0;
ctx->page_n = 0;
page_addr = kmap(ctx->current_page);
} else {
// Otherwise, we make sure we're in the right page, if there is such a page.
// In the very common case where we're already at the right entry (from where
// we left off the last time), we just leave things as they are, so no skipping
// forward will happen.
page_addr = kmap(ctx->current_page);
if (likely(
ctx->page_n < ctx->current_page->index && // we're not out of bounds
ternfs_dir_entry_hash(page_addr + ctx->page_offset) == dctx->pos // we're at the right place
)) {
// we're good, nothing to do
ternfs_debug("resuming readdir");
        } else if (likely(dctx->pos > ctx->dirents->max_hash)) {
            // we're done for good
            ternfs_debug("pos out of bounds (%lld > %llu), returning", dctx->pos, ctx->dirents->max_hash);
            kunmap(ctx->current_page); // balance the kmap above
            return 0;
        } else {
            kunmap(ctx->current_page); // we're about to walk off this page
            // First try to rewind if we're at an earlier position. We know we'll stop
            // when we hit zero.
for (; ctx->current_page->private > dctx->pos; ctx->current_page = list_prev_entry(ctx->current_page, lru)) {
ternfs_debug("rewinding");
}
// Then check if we need to go forward, by checking if we have a next page,
// and if we do, checking if the position is <= than its first hash.
for (;;) {
if (list_is_last(&ctx->current_page->lru, &ctx->dirents->pages)) {
ternfs_debug("ran out of pages to forward");
break;
}
struct page* next_page = list_next_entry(ctx->current_page, lru);
if (next_page->private > dctx->pos) {
ternfs_debug("found right page to forward to");
break;
}
ternfs_debug("forwarding");
ctx->current_page = next_page;
}
// Finally, reset position inside page. The code below will forward to
// the right hash.
page_addr = kmap(ctx->current_page);
ctx->page_offset = 0;
ctx->page_n = 0;
}
}
// emit dots first
    if (!dir_emit_dots(filp, dctx)) {
        ternfs_debug("exiting after dots");
        kunmap(ctx->current_page);
        return 0;
    }
    // move along, nothing to see here
    if (ctx->page_n >= ctx->current_page->index) {
        ternfs_debug("we're out of elements before even starting");
        kunmap(ctx->current_page);
        return 0;
    }
for (;;) {
const char* entry = page_addr + ctx->page_offset;
ternfs_debug("next page_addr=%p ctx.page_offset=%u entry=%p", page_addr, ctx->page_offset, entry);
u64 ino;
u64 hash;
u8 name_len;
const char* name;
ternfs_dir_entry_parse(entry, &ino, &hash, &name_len, &name);
ternfs_debug("name=%*pE ino=0x%016llx", name_len, name, ino);
if (likely(hash >= dctx->pos)) {
dctx->pos = hash;
int dtype;
switch (ternfs_inode_type(ino)) {
case TERNFS_INODE_DIRECTORY: dtype = DT_DIR; break;
case TERNFS_INODE_FILE: dtype = DT_REG; break;
case TERNFS_INODE_SYMLINK: dtype = DT_LNK; break;
default: dtype = DT_UNKNOWN; break;
}
ternfs_debug("emitting name=%*pE ino=%016llx dtype=%d hash=%llu", name_len, name, ino, dtype, hash);
if (!dir_emit(dctx, name, name_len, ino, dtype)) {
ternfs_debug("break");
kunmap(ctx->current_page);
break;
}
} else {
ternfs_debug("skipping hash %llu < dctx->pos %lld", hash, dctx->pos);
}
ctx->page_offset += ternfs_dir_entry_size(name_len);
ctx->page_n++;
ternfs_debug(
"next ctx=%p ctx.current_page=%p ctx.dirents=%p ctx.page_offset=%u ctx.page_n=%u page_n=%ld", ctx,
ctx->current_page, ctx->dirents, ctx->page_offset, ctx->page_n, ctx->current_page->index
);
if (ctx->page_n >= ctx->current_page->index) {
kunmap(ctx->current_page);
ctx->page_n = 0;
ctx->page_offset = 0;
if (list_is_last(&ctx->current_page->lru, &ctx->dirents->pages)) {
// we've run out of pages
ternfs_debug("ran out of pages, exiting");
ctx->current_page = list_first_entry(&ctx->dirents->pages, struct page, lru);
dctx->pos++; // otherwise we'll loop forever
break;
} else {
ctx->current_page = list_next_entry(ctx->current_page, lru);
page_addr = kmap(ctx->current_page);
}
}
}
ternfs_debug("done, exit pos %lld", dctx->pos);
return 0;
}
static int ternfs_dir_close(struct inode* inode, struct file* filp) {
struct ternfs_readdir_ctx* ctx = (struct ternfs_readdir_ctx*)filp->private_data;
WARN_ON(!ctx);
if (!ctx) { return -EINVAL; }
trace_eggsfs_vfs_closedir_enter(inode);
ternfs_dir_put_cache(ctx->dirents);
kmem_cache_free(ternfs_readdir_ctx_cachep, ctx);
trace_eggsfs_vfs_closedir_exit(inode, 0);
return 0;
}
static loff_t ternfs_dir_seek(struct file* file, loff_t offset, int whence) {
struct inode* inode = file_inode(file);
inode_lock(inode);
switch (whence) {
case SEEK_SET:
break;
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_END:
offset = -EOPNOTSUPP;
goto out_err;
default:
offset = -EINVAL;
goto out_err;
}
if (offset < 0) {
offset = -EINVAL;
goto out_err;
}
ternfs_debug("setting dir position to %lld", offset);
    offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
out_err:
inode_unlock(inode);
return offset;
}
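
// Directories are never dirtied locally (the dirents cache is read-only,
// rebuilt from the metadata shards), so there's nothing for fsync to write
// back.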
static int ternfs_dir_fsync(struct file* f, loff_t start, loff_t end, int datasync) {
return 0;
}
struct file_operations ternfs_dir_operations = {
.open = ternfs_dir_open,
.iterate_shared = ternfs_dir_read,
.release = ternfs_dir_close,
.llseek = ternfs_dir_seek,
.fsync = ternfs_dir_fsync,
};
int __init ternfs_dir_init(void) {
ternfs_dirents_cachep = kmem_cache_create(
"ternfs_dirents_cache",
sizeof(struct ternfs_dirents),
0,
SLAB_RECLAIM_ACCOUNT,
NULL
);
if (!ternfs_dirents_cachep) {
return -ENOMEM;
}
ternfs_readdir_ctx_cachep = kmem_cache_create(
"ternfs_readdir_ctx_cache",
sizeof(struct ternfs_readdir_ctx),
0,
SLAB_RECLAIM_ACCOUNT,
NULL
);
if (!ternfs_readdir_ctx_cachep) {
kmem_cache_destroy(ternfs_dirents_cachep);
return -ENOMEM;
}
return 0;
}
void __cold ternfs_dir_exit(void) {
ternfs_debug("dir exit");
kmem_cache_destroy(ternfs_readdir_ctx_cachep);
kmem_cache_destroy(ternfs_dirents_cachep);
}