// Files
// ternfs-XTXMarkets/kmod/dir.c
// Francesco Mazzoli ca5debd8e7 Configurable timeouts
// 2023-07-06 19:39:12 +01:00
//
// 463 lines
// 15 KiB
// C

#include "dir.h"
#include <linux/mm.h>
#include <linux/slab.h>
#include "log.h"
#include "err.h"
#include "inode.h"
#include "metadata.h"
#include "trace.h"
// Converts a millisecond count to jiffies using integer arithmetic, so the
// result is truncated towards zero. The argument is parenthesized so that a
// compound expression (e.g. `a + b`) is scaled as a whole.
#define MSECS_TO_JIFFIES(_ms) (((_ms) * HZ) / 1000)
// sysctls
// Directory refresh interval, expressed in jiffies; the default corresponds to
// 250 milliseconds.
// NOTE(review): presumably exposed as a sysctl and used when computing
// `mtime_expiry` -- neither is visible in this file, confirm.
int eggsfs_dir_refresh_time_jiffies = MSECS_TO_JIFFIES(250);
// The entry count of a listing page lives in the second u32 (by address) of
// page->private; the first u32 of the head page is used as the listing's
// refcount (see eggsfs_dir_get_cache / eggsfs_dir_put_cache).
// This layout needs ->private to be at least 8 bytes wide.
#define eggsfs_dir_get_page_n(_page) ({ ((u32*)&(_page)->private)[1]; })
#define eggsfs_dir_set_page_n(_page, _n) ({ ((u32*)&(_page)->private)[1] = (_n); })
// Slab cache backing `struct eggsfs_readdir_ctx` allocations; created in
// eggsfs_dir_init() and destroyed in eggsfs_dir_exit().
static struct kmem_cache* eggsfs_readdir_ctx_cachep;
// Number of bytes one packed directory entry occupies in a listing page:
// a fixed header (8-byte inode number + 1-byte name length) followed by the
// name bytes themselves.
static inline int eggsfs_dir_entry_size(int name_len) {
    const int header_len = 8 /* inode */ + 1 /* name length */;
    return header_len + name_len;
}
// Decodes the packed directory entry starting at `buf`.
//
// Layout (as written by eggsfs_dir_add_entry): little-endian u64 inode number,
// one byte of name length, then that many name bytes (no terminator is stored).
// `*name` is set to point inside `buf`; nothing is copied.
static inline void eggsfs_dir_entry_parse(const char* buf, u64* ino, u8* name_len, const char** name) {
    const char* len_field = buf + 8;

    *ino = get_unaligned_le64(buf);
    *name_len = *(const u8*)len_field;
    *name = len_field + 1;
}
// Per-open-file readdir cursor; eggsfs_dir_open stores it in
// filp->private_data and eggsfs_dir_close frees it.
struct eggsfs_readdir_ctx {
// Page the next entry will be read from; becomes NULL once eggsfs_dir_read
// has walked past the last page.
struct page* current_page;
// Head page of the listing. The reference taken at open time is released
// through this pointer in eggsfs_dir_close.
struct page* pages;
// Byte offset of the next entry inside current_page.
u32 page_offset;
// Number of entries of current_page that have already been emitted.
u32 page_n;
};
// Increments the refcount of the cache in ->private under RCU.
// After this we know the page won't be deallocated until we
// decrease the refcount.
static struct page* eggsfs_dir_get_cache(struct eggsfs_inode* enode) {
struct page* page;
rcu_read_lock();
page = rcu_dereference(enode->dir.pages);
if (page) { atomic_inc((atomic_t*)&page->private); }
rcu_read_unlock();
return page;
}
// Decrements the refcount, and frees up all the pages in the
// cache if we were the last ones. Must be called by
// somebody who called eggsfs_dir_get_cache before.
//
// Note that ->current_page and ->dir_pages are both holding a
// reference, so once you clear those you should call `eggsfs_dir_put_cache`.
static void eggsfs_dir_put_cache(struct page* page) {
if (atomic_dec_and_test((atomic_t*)&page->private)) {
LIST_HEAD(pages);
while (page) {
struct page* next = (struct page*)page->mapping;
page->mapping = NULL;
page->private = 0;
list_add_tail(&page->lru, &pages);
page = next;
}
put_pages_list(&pages);
}
}
// RCU callback queued by eggsfs_dir_drop_cache: recovers the head page from
// its embedded rcu_head and releases the reference the inode used to hold.
static void __eggsfs_dir_put_cache_rcu(struct rcu_head* rcu) {
    eggsfs_dir_put_cache(container_of(rcu, struct page, rcu_head));
}
// Removes the cache from ->dir_pages, and also calls
// `eggsfs_dir_put_cache` to clear the reference.
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode) {
struct page* page;
static_assert(sizeof(struct rcu_head) == sizeof(struct list_head));
if (!atomic64_read((atomic64_t*)&enode->dir.pages)) { return; }
page = (struct page*)atomic64_xchg((atomic64_t*)&enode->dir.pages, (u64)0);
if (page) { call_rcu(&page->rcu_head, __eggsfs_dir_put_cache_rcu); }
}
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime);

// Returns a referenced cached listing whose mtime stamp (->index of the head
// page) matches the inode's current mtime, or NULL. A stale cached listing is
// released and removed from the inode.
static struct page* eggsfs_dir_get_fresh_cache(struct eggsfs_inode* enode) {
    struct page* page = eggsfs_dir_get_cache(enode);
    if (page && page->index != READ_ONCE(enode->mtime)) {
        eggsfs_dir_put_cache(page);
        eggsfs_dir_drop_cache(enode);
        page = NULL;
    }
    return page;
}

// ->open for directories.
//
// Revalidates the directory if its mtime has expired, then obtains the full
// listing -- from the per-inode cache when it is still current, otherwise by
// reading it under `dir.pages_latch` -- and stores a cursor over it in
// filp->private_data.
//
// Returns 0 on success. On failure returns the error from
// eggsfs_dir_revalidate, -ENOMEM, the error from eggsfs_latch_wait_killable,
// or the error produced by eggsfs_dir_read_all. Every exit path emits the
// opendir exit trace event so enter/exit events stay paired.
static int eggsfs_dir_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_readdir_ctx* ctx;
    struct page* page;
    int err;

    trace_eggsfs_vfs_opendir_enter(inode);

    if (get_jiffies_64() >= enode->mtime_expiry) {
        err = eggsfs_dir_revalidate(enode);
        if (err) { goto out_err; }
    }

    ctx = kmem_cache_alloc(eggsfs_readdir_ctx_cachep, GFP_KERNEL);
    if (!ctx) { err = -ENOMEM; goto out_err; }

again: // progress: whoever wins the latch either gets the pages or fails
    page = eggsfs_dir_get_fresh_cache(enode);
    if (!page) {
        // NOTE(review): seqno is passed by name, so eggsfs_latch_try_acquire
        // is presumably a macro that fills it in -- confirm in latch.h.
        int seqno;
        if (!eggsfs_latch_try_acquire(&enode->dir.pages_latch, seqno)) {
            // Somebody else is reading the listing: wait for them, then retry.
            err = eggsfs_latch_wait_killable(&enode->dir.pages_latch, seqno);
            if (err) { goto out_ctx; }
            goto again;
        }
        // If somebody else got to creating the page already, use it,
        // otherwise we'd leak theirs when installing ours.
        page = eggsfs_dir_get_fresh_cache(enode);
        if (!page) {
            u64 dir_mtime = READ_ONCE(enode->mtime);
            page = eggsfs_dir_read_all(filp->f_path.dentry, dir_mtime);
            if (!IS_ERR(page)) {
                // If the mtime matches (which will be the case unless somebody
                // else revalidated between the READ_ONCE above), then the refcount
                // starts at two: one for ->dir.pages, and one for the
                // ->pages of the context below. Otherwise only one, for the
                // context (the pages are private to this open).
                if (likely(READ_ONCE(enode->mtime) == dir_mtime)) {
                    atomic_set((atomic_t*)&page->private, 2);
                    rcu_assign_pointer(enode->dir.pages, page);
                } else {
                    atomic_set((atomic_t*)&page->private, 1);
                }
            }
        }
        eggsfs_latch_release(&enode->dir.pages_latch, seqno);
    }
    if (IS_ERR(page)) { err = PTR_ERR(page); goto out_ctx; }

    ctx->current_page = page;
    ctx->pages = page;
    ctx->page_offset = 0;
    ctx->page_n = 0;
    filp->private_data = ctx;
    trace_eggsfs_vfs_opendir_exit(inode, 0);
    return 0;

out_ctx:
    kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx);
out_err:
    trace_eggsfs_vfs_opendir_exit(inode, err);
    eggsfs_debug("err=%d", err);
    return err;
}
// Primes the dcache with one entry seen while listing `parent`.
//
// - If a dentry for `name` already exists and points at inode `ino`, it is
//   only re-stamped: its d_time is set to `dir_seqno`.
// - If it exists but points elsewhere (or is negative), it is invalidated and
//   the allocation is retried.
// - Otherwise a new dentry is allocated, the inode is obtained with
//   eggsfs_get_inode, and the two are spliced together.
//
// Failures are not reported to the caller; the function simply returns.
static void update_dcache(struct dentry* parent, u64 dir_seqno, const char* name, int name_len, u64 edge_creation_time, u64 ino) {
    struct qstr filename = QSTR_INIT(name, name_len);
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
    struct inode* parent_inode = parent->d_inode;
    struct dentry* dentry;
    struct dentry* alias;
    struct inode* inode;

    if (unlikely(parent_inode == NULL)) { // TODO can this ever happen?
        eggsfs_warn("got NULL in dentry parent (ino %016llx)", ino);
        return;
    }
    eggsfs_debug("parent=%pd name=%*pE ino=%llx", parent, name_len, name, ino);
    filename.hash = full_name_hash(parent, filename.name, filename.len);

    // Loop until we either settle on a matching pre-existing dentry (and
    // return) or hold a fresh in-lookup dentry (and break out).
    dentry = d_lookup(parent, &filename);
    for (;;) {
        struct inode* old_inode;

        if (!dentry) {
            dentry = d_alloc_parallel(parent, &filename, &wq);
            if (IS_ERR(dentry)) { return; }
        }
        if (d_in_lookup(dentry)) { // d_alloc_parallel sets in_lookup
            break;
        }

        // pre-existing entry
        old_inode = d_inode(dentry);
        if (old_inode && old_inode->i_ino == ino) {
            WRITE_ONCE(dentry->d_time, dir_seqno);
            dput(dentry);
            return;
        }
        d_invalidate(dentry);
        dput(dentry);
        dentry = NULL; // go straight to d_alloc_parallel on the next round
    }

    // new entry
    inode = eggsfs_get_inode(parent->d_sb, EGGSFS_I(parent_inode), ino);
    if (!IS_ERR(inode)) {
        EGGSFS_I(inode)->edge_creation_time = edge_creation_time;
    }
    eggsfs_debug("inode=%p is_err=%d", inode, IS_ERR(inode));
    // d_splice_alias propagates error in inode
    WRITE_ONCE(dentry->d_time, dir_seqno);
    alias = d_splice_alias(inode, dentry);
    eggsfs_debug("alias=%p is_err=%d", alias, IS_ERR(alias));
    d_lookup_done(dentry);
    if (alias) {
        // Continue with the alias (or error pointer) instead of our dentry.
        dput(dentry);
        dentry = alias;
    }
    if (IS_ERR(dentry)) {
        eggsfs_debug("dentry err=%ld", PTR_ERR(dentry));
        return;
    }
    dput(dentry);
}
// State carried across readdir callbacks while eggsfs_dir_read_all packs a
// directory listing into a chain of pages.
struct eggsfs_dir_fill_ctx {
// Dentry of the directory being listed; used to prime the dcache for each entry.
struct dentry* parent;
// Directory mtime the listing is read for; stamped as d_time on primed dentries.
u64 dir_mtime;
// NULL or kmap'ed at page_addr
struct page* current_page;
// kmap'd address of current_page
char* page_addr;
// Current offset inside the page we're writing
u32 page_off;
// Number of eggsfs dir entries in the current page
u32 page_n;
};
// Appends one packed entry (inode, name length, name bytes) to the listing
// being built in `ctx`, starting a new page when the current one is full.
//
// A full page is "flushed" by unmapping it, recording its entry count in
// ->private and linking the new page through ->mapping.
//
// Returns 0 on success, -EINVAL for a negative `name_len`, -ENAMETOOLONG when
// `name_len` exceeds EGGSFS_MAX_FILENAME, -ENOMEM when a page cannot be
// allocated, and -EIO if the entry does not fit even in an empty page.
static int eggsfs_dir_add_entry(struct eggsfs_dir_fill_ctx* ctx, const char* name, int name_len, u64 ino) {
    char* entry;
    u32 entry_size;

    // name_len is signed: reject negative values before they reach memcpy.
    if (name_len < 0) { return -EINVAL; }
    if (name_len > EGGSFS_MAX_FILENAME) { return -ENAMETOOLONG; }
    entry_size = eggsfs_dir_entry_size(name_len);

    // Flush current page, and start a new one, linking it in ->mapping
    if (ctx->page_off + entry_size > PAGE_SIZE) {
        struct page* page = alloc_page(GFP_KERNEL);
        if (!page) { return -ENOMEM; }
        kunmap(ctx->current_page);
        ctx->current_page->mapping = (void*)page;
        eggsfs_dir_set_page_n(ctx->current_page, ctx->page_n);
        ctx->current_page = page;
        ctx->page_addr = kmap(page);
        ctx->page_off = 0;
        ctx->page_n = 0;
    }
    // Should be impossible after the flush above; warn rather than overflow.
    if (WARN_ON(ctx->page_off + entry_size > PAGE_SIZE)) {
        return -EIO;
    }

    entry = ctx->page_addr + ctx->page_off;
    put_unaligned_le64(ino, entry); entry += 8;
    *(u8*)entry = (u8)name_len; entry++;
    memcpy(entry, name, name_len);
    ctx->page_off += entry_size;
    ++ctx->page_n;
    return 0;
}
// Per-entry callback for a readdir response. `ptr` is the
// `struct eggsfs_dir_fill_ctx` describing the listing under construction.
//
// Primes the dcache for the entry, then appends it to the listing. Returns
// whatever eggsfs_dir_add_entry returns (0 on success, negative errno
// otherwise). `hash` is only logged.
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 edge_creation_time, u64 ino) {
    struct eggsfs_dir_fill_ctx* fill = (struct eggsfs_dir_fill_ctx*)ptr;
    int err;

    eggsfs_debug("hash=%llx ino=%llx, name_len=%d, name=%*pE", hash, ino, name_len, name_len, name);

    update_dcache(fill->parent, fill->dir_mtime, name, name_len, edge_creation_time, ino);
    err = eggsfs_dir_add_entry(fill, name, name_len, ino);
    return err;
}
// Reads the complete listing of the directory behind `dentry` and packs it
// into a freshly allocated chain of pages (linked through ->mapping).
//
// The head page's ->index is stamped with `dir_mtime`. The refcount stored in
// the head page's ->private is NOT initialized here; the caller does that.
//
// Returns the head page, or an ERR_PTR: -ENOMEM if the first page cannot be
// allocated, otherwise the shard error translated by eggsfs_error_to_linux.
// On error every page allocated so far is released.
//
// NOTE(review): the original comment here read "mut hold" -- presumably the
// caller must hold enode->dir.pages_latch, as eggsfs_dir_open does; confirm.
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime) {
    struct eggsfs_inode* enode = EGGSFS_I(d_inode(dentry));
    struct eggsfs_dir_fill_ctx ctx = {
        .parent = dentry,
        .dir_mtime = dir_mtime,
    };
    struct page* head;
    struct page* next;
    u64 continuation_key = 0;
    int err;

    head = alloc_page(GFP_KERNEL);
    if (!head) { return ERR_PTR(-ENOMEM); }
    ctx.current_page = head;
    ctx.page_addr = kmap(head);
    ctx.page_off = 0;
    ctx.page_n = 0;
    head->index = dir_mtime;

    // Keep asking the shard for more until it hands back a zero continuation key.
    do {
        eggsfs_debug("cont_key=%llx", continuation_key);
        err = eggsfs_shard_readdir(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino,
            continuation_key, &ctx, &continuation_key
        );
        if (err) {
            err = eggsfs_error_to_linux(err);
            goto out_err;
        }
    } while (continuation_key != 0);

    // Seal the last page: unmap it and record how many entries it holds.
    kunmap(ctx.current_page);
    eggsfs_dir_set_page_n(ctx.current_page, ctx.page_n);
    return head;

out_err:
    kunmap(ctx.current_page);
    for (; head; head = next) {
        next = (struct page*)head->mapping;
        head->mapping = NULL;
        put_page(head);
    }
    eggsfs_info("err=%d", err);
    return ERR_PTR(err);
}
// ->iterate / ->iterate_shared for directories: emits "." and "..", then the
// entries of the listing captured at open time, resuming from the cursor kept
// in filp->private_data.
//
// Stops early (leaving the cursor on the entry that did not fit) when
// dir_emit reports that the caller's buffer is full. Returns 0 in all cases
// except a missing context, which yields -EINVAL.
//
// vfs: mutex file.f_pos_lock
// NOTE(review): the original comment also said "exclusive inode.i_rwsem", but
// since this function is registered as ->iterate_shared the VFS presumably
// takes i_rwsem shared -- confirm. The cursor is per-file, so it is
// f_pos_lock that serializes its updates.
static int eggsfs_dir_read(struct file* filp, struct dir_context* dctx) {
    struct eggsfs_readdir_ctx* ctx = filp->private_data;
    char* page_addr;

    if (WARN_ON(!ctx)) { return -EINVAL; }

    eggsfs_debug("ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u", ctx,
        ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n);

    if (!dir_emit_dots(filp, dctx)) { return 0; }
    // Nothing left to emit, or an empty directory (first page holds no entries).
    if (!ctx->current_page || !eggsfs_dir_get_page_n(ctx->current_page)) { return 0; }

    page_addr = kmap(ctx->current_page);
    while (ctx->current_page) {
        int dtype;
        u64 ino;
        u8 name_len;
        const char* name;
        const char* entry = page_addr + ctx->page_offset;

        eggsfs_debug("next page_addr=%p ctx.page_offset=%u entry=%p", page_addr, ctx->page_offset, entry);
        eggsfs_dir_entry_parse(entry, &ino, &name_len, &name);
        eggsfs_debug("name=%*pE ino=0x%016llx", name_len, name, ino);
        switch (eggsfs_inode_type(ino)) {
        case EGGSFS_INODE_DIRECTORY: dtype = DT_DIR; break;
        case EGGSFS_INODE_FILE: dtype = DT_REG; break;
        case EGGSFS_INODE_SYMLINK: dtype = DT_LNK; break;
        default: dtype = DT_UNKNOWN;
        }
        if (!dir_emit(dctx, name, name_len, ino, dtype)) {
            // Caller's buffer is full: keep the cursor on this entry.
            eggsfs_debug("break");
            break;
        }

        ctx->page_offset += eggsfs_dir_entry_size(name_len);
        ++ctx->page_n;
        ++dctx->pos;
        eggsfs_debug(
            "next ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u page_n=%u", ctx,
            ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n, eggsfs_dir_get_page_n(ctx->current_page)
        );

        // Current page exhausted: move on to the next one in the chain.
        if (ctx->page_n >= eggsfs_dir_get_page_n(ctx->current_page)) {
            kunmap(ctx->current_page);
            ctx->current_page = (struct page*)ctx->current_page->mapping;
            eggsfs_debug("next page current_page = %p", ctx->current_page);
            ctx->page_offset = 0;
            ctx->page_n = 0;
            // The last page has no successor; never hand NULL to kmap().
            if (ctx->current_page) {
                page_addr = kmap(ctx->current_page);
            }
        }
    }
    if (ctx->current_page) {
        eggsfs_debug("unmap current_page = %p", ctx->current_page);
        kunmap(ctx->current_page);
    }
    eggsfs_debug("done");
    return 0;
}
#if 0
// Disabled (compiled out) directory seek implementation: rewinds the cursor to
// the first page and then skips forward entry by entry until `offset - 2`
// entries have been passed (the first two positions are "." and "..").
// NOTE(review): before enabling, note that `err` is unused, that only
// SEEK_CUR is accepted although `offset` is then treated as an absolute
// position, and that the open questions below about concurrency are unresolved.
// multiple seeks at the same time?
static loff_t eggsfs_llseek_dir(struct file* filp, loff_t offset, int whence) {
int err;
struct eggsfs_readdir_ctx* ctx = filp->private_data;
loff_t dir_offset;
if (whence != SEEK_CUR) { return -EINVAL; }
// concurrency with readdir?
// TODO: if offset == 0 reacquire pages
ctx->current_page = ctx->pages;
ctx->page_offset = 0;
ctx->page_n = 0;
if (offset > 2) {
dir_offset = offset - 2;
// Skip whole pages while the target lies beyond them.
while (ctx->current_page && eggsfs_dir_get_page_n(ctx->current_page) <= dir_offset) {
dir_offset -= eggsfs_dir_get_page_n(ctx->current_page);
ctx->current_page = (struct page*)ctx->current_page->mapping;
}
if (ctx->current_page) {
char* page_addr = (char*)kmap(ctx->current_page);
// Walk the remaining entries inside the target page.
while (ctx->page_n < dir_offset) {
BUG_ON(ctx->page_offset >= PAGE_SIZE);
u64 ino;
u8 name_len;
const char* name;
eggsfs_dir_entry_parse(page_addr + ctx->page_offset, &ino, &name_len, &name);
ctx->page_offset += eggsfs_dir_entry_size(name_len);
++ctx->page_n;
}
kunmap(ctx->current_page);
}
}
filp->f_pos = offset;
return offset;
}
#endif
// ->release for directories: drops the reference on the listing taken at open
// time and frees the per-file readdir context.
//
// Returns 0, or -EINVAL (after a warning) if the file carries no context.
static int eggsfs_dir_close(struct inode* inode, struct file* filp) {
    struct eggsfs_readdir_ctx* rctx = (struct eggsfs_readdir_ctx*)filp->private_data;

    if (WARN_ON(!rctx)) {
        return -EINVAL;
    }

    trace_eggsfs_vfs_closedir_enter(inode);
    eggsfs_dir_put_cache(rctx->pages);
    kmem_cache_free(eggsfs_readdir_ctx_cachep, rctx);
    trace_eggsfs_vfs_closedir_exit(inode, 0);

    return 0;
}
// File operations for eggsfs directories. No llseek handler is installed
// (the candidate implementation above is compiled out).
// NOTE(review): both ->iterate and ->iterate_shared point at the same
// function; presumably only ->iterate_shared is used by the VFS when both
// are set -- confirm for the targeted kernel versions.
struct file_operations eggsfs_dir_operations = {
.open = eggsfs_dir_open,
.iterate = eggsfs_dir_read,
.iterate_shared = eggsfs_dir_read,
.release = eggsfs_dir_close,
};
// Module init hook for the directory code: creates the slab cache used for
// `struct eggsfs_readdir_ctx`. Returns 0 on success, -ENOMEM if the cache
// cannot be created.
int __init eggsfs_dir_init(void) {
    struct kmem_cache* cache;

    cache = kmem_cache_create(
        "eggsfs_readdir_ctx_cache",
        sizeof(struct eggsfs_readdir_ctx),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        NULL
    );
    eggsfs_readdir_ctx_cachep = cache;

    return cache ? 0 : -ENOMEM;
}
// Module teardown hook for the directory code: destroys the readdir context
// slab cache created by eggsfs_dir_init.
void __cold eggsfs_dir_exit(void) {
kmem_cache_destroy(eggsfs_readdir_ctx_cachep);
}