#include "dir.h"
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/slab.h>
|
|
|
|
#include "log.h"
|
|
#include "err.h"
|
|
#include "inode.h"
|
|
#include "metadata.h"
|
|
#include "trace.h"
|
|
|
|
#define MSECS_TO_JIFFIES(_ms) ((_ms * HZ) / 1000)
|
|
|
|
// sysctls
int eggsfs_dir_refresh_time_jiffies = MSECS_TO_JIFFIES(250);

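// The cached directory listing lives in a chain of pages: each page's ->mapping
// is reused as a pointer to the next page in the chain, the first page's
// ->index records the directory mtime the listing was built against, and
// ->private is split into two u32s: one half is treated as an atomic refcount
// on the whole chain, the other (accessed via the macros below) holds the
// number of directory entries stored in that page.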
#define eggsfs_dir_get_page_n(_page) ({ *((((u32*)&(_page)->private))+1); })
#define eggsfs_dir_set_page_n(_page, _n) ({ *((((u32*)&(_page)->private))+1) = _n; })

static struct kmem_cache* eggsfs_readdir_ctx_cachep;

static inline int eggsfs_dir_entry_size(int name_len) {
    return 8 + 1 + name_len; // inode + len + name
}

static inline void eggsfs_dir_entry_parse(const char* buf, u64* ino, u8* name_len, const char** name) {
    *ino = get_unaligned_le64(buf); buf += 8;
    *name_len = *(u8*)buf; buf++;
    *name = buf; buf += *name_len;
}

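// Per-open readdir cursor, stashed in filp->private_data: ->pages is the head
// of the cached page chain (and holds a reference on it), while ->current_page,
// ->page_offset and ->page_n track the next entry to emit.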
struct eggsfs_readdir_ctx {
    struct page* current_page;
    struct page* pages;
    u32 page_offset;
    u32 page_n;
};

// Increments the refcount of the cache in ->private under RCU.
// After this we know the page won't be deallocated until we
// decrease the refcount.
static struct page* eggsfs_dir_get_cache(struct eggsfs_inode* enode) {
    struct page* page;
    rcu_read_lock();
    page = rcu_dereference(enode->dir.pages);
    if (page) { atomic_inc((atomic_t*)&page->private); }
    rcu_read_unlock();
    return page;
}

// Decrements the refcount, and frees up all the pages in the
// cache if we were the last ones. Must be called by
// somebody who called eggsfs_dir_get_cache before.
//
// Note that ->current_page and ->dir_pages are both holding a
// reference, so once you clear those you should call `eggsfs_dir_put_cache`.
static void eggsfs_dir_put_cache(struct page* page) {
    if (atomic_dec_and_test((atomic_t*)&page->private)) {
        LIST_HEAD(pages);
        while (page) {
            struct page* next = (struct page*)page->mapping;
            page->mapping = NULL;
            page->private = 0;
            list_add_tail(&page->lru, &pages);
            page = next;
        }
        put_pages_list(&pages);
    }
}

static void __eggsfs_dir_put_cache_rcu(struct rcu_head* rcu) {
    struct page* page = container_of(rcu, struct page, rcu_head);
    eggsfs_dir_put_cache(page);
}

// Removes the cache from ->dir_pages, and also calls
// `eggsfs_dir_put_cache` to clear the reference.
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode) {
    struct page* page;
    static_assert(sizeof(struct rcu_head) == sizeof(struct list_head));
    if (!atomic64_read((atomic64_t*)&enode->dir.pages)) { return; }
    page = (struct page*)atomic64_xchg((atomic64_t*)&enode->dir.pages, (u64)0);
    if (page) { call_rcu(&page->rcu_head, __eggsfs_dir_put_cache_rcu); }
}

static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime);

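// Opening a directory builds (or reuses) the cached listing: revalidate if the
// cached mtime has expired, then either pick up a current page chain from
// enode->dir.pages or rebuild it under the pages latch, and hand the file a
// cursor over it via ->private_data.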
static int eggsfs_dir_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_readdir_ctx* ctx;
    struct page* page;
    int err;

    trace_eggsfs_vfs_opendir_enter(inode);

    if (get_jiffies_64() >= enode->mtime_expiry) {
        err = eggsfs_dir_revalidate(enode);
        if (err) { return err; }
    }

    ctx = kmem_cache_alloc(eggsfs_readdir_ctx_cachep, GFP_KERNEL);
    if (!ctx) { return -ENOMEM; }

again: // progress: whoever wins the lock either gets pages or fails
    page = eggsfs_dir_get_cache(enode);
    if (page && page->index != READ_ONCE(enode->mtime)) {
        eggsfs_dir_put_cache(page);
        eggsfs_dir_drop_cache(enode);
        page = NULL;
    }
    if (!page) {
        int seqno;
        if (!eggsfs_latch_try_acquire(&enode->dir.pages_latch, seqno)) {
            err = eggsfs_latch_wait_killable(&enode->dir.pages_latch, seqno);
            if (err) { goto out_ctx; }
            goto again;
        }

        // If somebody else got to creating the page already, drop it,
        // otherwise we'll leak.
        page = eggsfs_dir_get_cache(enode);
        if (page && page->index != READ_ONCE(enode->mtime)) {
            eggsfs_dir_put_cache(page);
            eggsfs_dir_drop_cache(enode);
            page = NULL;
        }

        if (!page) {
            u64 dir_mtime = READ_ONCE(enode->mtime);
            page = eggsfs_dir_read_all(filp->f_path.dentry, dir_mtime);
            if (!IS_ERR(page)) {
                // If the mtime matches (which will be the case unless somebody
                // else revalidated since the READ_ONCE above), then the refcount
                // starts at two: one for the ->dir_pages, and one for the
                // ->current_page below. Otherwise only for ->current_page (basically
                // these pages stay private to this read).
                if (likely(READ_ONCE(enode->mtime) == dir_mtime)) {
                    atomic_set((atomic_t*)&page->private, 2);
                    rcu_assign_pointer(enode->dir.pages, page);
                } else {
                    atomic_set((atomic_t*)&page->private, 1);
                }
            }
        }
        eggsfs_latch_release(&enode->dir.pages_latch, seqno);
    }
    if (IS_ERR(page)) { err = PTR_ERR(page); goto out_ctx; }

    ctx->current_page = page;
    ctx->pages = page;
    ctx->page_offset = 0;
    ctx->page_n = 0;

    filp->private_data = ctx;

    trace_eggsfs_vfs_opendir_exit(inode, 0);
    return 0;

out_ctx:
    kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx);
    trace_eggsfs_vfs_opendir_exit(inode, err);
    eggsfs_debug("err=%d", err);
    return err;
}

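// Called for each entry received while (re)building the listing: looks the name
// up in (or adds it to) the dcache under `parent` and records the directory
// mtime we read against in dentry->d_time.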
static void update_dcache(struct dentry* parent, u64 dir_seqno, const char* name, int name_len, u64 edge_creation_time, u64 ino) {
    struct qstr filename = QSTR_INIT(name, name_len);
    struct dentry* dentry;
    struct dentry* alias;
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
    struct inode* parent_inode = parent->d_inode;
    if (unlikely(parent_inode == NULL)) { // TODO can this ever happen?
        eggsfs_warn("got NULL in dentry parent (ino %016llx)", ino);
        return;
    }

    eggsfs_debug("parent=%pd name=%*pE ino=%llx", parent, name_len, name, ino);

    filename.hash = full_name_hash(parent, filename.name, filename.len);

    dentry = d_lookup(parent, &filename);
    if (!dentry) {
again:
        dentry = d_alloc_parallel(parent, &filename, &wq);
        if (IS_ERR(dentry)) { return; }
    }
    if (!d_in_lookup(dentry)) { // d_alloc_parallel sets in_lookup
        // pre-existing entry
        struct inode* old_inode = d_inode(dentry);
        if (old_inode && old_inode->i_ino == ino) {
            WRITE_ONCE(dentry->d_time, dir_seqno);
            goto out;
        }
        d_invalidate(dentry);
        dput(dentry);
        goto again;
    } else {
        // new entry
        struct inode* inode = eggsfs_get_inode(parent->d_sb, EGGSFS_I(parent_inode), ino);
        if (!IS_ERR(inode)) {
            struct eggsfs_inode* enode = EGGSFS_I(inode);
            enode->edge_creation_time = edge_creation_time;
        }
        eggsfs_debug("inode=%p is_err=%d", inode, IS_ERR(inode));
        // d_splice_alias propagates error in inode
        WRITE_ONCE(dentry->d_time, dir_seqno);
        alias = d_splice_alias(inode, dentry);
        eggsfs_debug("alias=%p is_err=%d", alias, IS_ERR(alias));
        d_lookup_done(dentry);
        if (alias) {
            dput(dentry);
            dentry = alias;
        }
        if (IS_ERR(dentry)) {
            eggsfs_debug("dentry err=%ld", PTR_ERR(dentry));
            return;
        }
    }

out:
    dput(dentry);
}

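// State threaded through eggsfs_shard_readdir's entry callback while the
// listing is serialized into the page chain and the dcache is primed.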
struct eggsfs_dir_fill_ctx {
    struct dentry* parent;
    u64 dir_mtime;

    // NULL or kmap'ed at page_addr
    struct page* current_page;
    // kmap'd address of current_page
    char* page_addr;
    // Current offset inside the page we're writing
    u32 page_off;
    // Number of eggsfs dir entries in the current page
    u32 page_n;
};

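// Appends one serialized entry (le64 inode, u8 name length, name bytes) to the
// current page, starting a new page and linking it into the chain when the
// entry would not fit.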
static int eggsfs_dir_add_entry(struct eggsfs_dir_fill_ctx* ctx, const char* name, int name_len, u64 ino) {
    if (name_len > EGGSFS_MAX_FILENAME) { return -ENAMETOOLONG; }
    // Flush current page, and start a new one, linking it in ->mapping
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        struct page* page = alloc_page(GFP_KERNEL);
        if (!page) { return -ENOMEM; }

        kunmap(ctx->current_page);
        ctx->current_page->mapping = (void*)page;
        eggsfs_dir_set_page_n(ctx->current_page, ctx->page_n);

        ctx->current_page = page;
        ctx->page_addr = kmap(page);
        ctx->page_off = 0;
        ctx->page_n = 0;
    }
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        WARN_ON(ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE);
        return -EIO;
    }

    char* entry = ctx->page_addr + ctx->page_off;
    put_unaligned_le64(ino, entry); entry += 8;
    *(u8*)entry = (u8)name_len; entry++;
    memcpy(entry, name, name_len);
    ctx->page_off += eggsfs_dir_entry_size(name_len);
    ++ctx->page_n;
    return 0;
}

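// Entry callback used while filling the cache: primes the dcache for the entry
// and appends it to the page chain.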
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 edge_creation_time, u64 ino) {
    struct eggsfs_dir_fill_ctx* ctx = ptr;
    eggsfs_debug("hash=%llx ino=%llx, name_len=%d, name=%*pE", hash, ino, name_len, name_len, name);
    update_dcache(ctx->parent, ctx->dir_mtime, name, name_len, edge_creation_time, ino);
    return eggsfs_dir_add_entry(ctx, name, name_len, ino);
}

// Must hold enode->dir.pages_latch (see eggsfs_dir_open).
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime) {
    bool eof = false;
    int err;
    u64 continuation_key = 0;
    struct inode* inode = d_inode(dentry);
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_dir_fill_ctx ctx = {
        .parent = dentry,
        .dir_mtime = dir_mtime,
    };
    struct page* first_page;

    first_page = alloc_page(GFP_KERNEL);
    if (!first_page) { return ERR_PTR(-ENOMEM); }
    ctx.current_page = first_page;
    ctx.page_addr = kmap(first_page);
    ctx.page_off = 0;
    ctx.page_n = 0;
    first_page->index = dir_mtime;

    while (!eof) {
        eggsfs_debug("cont_key=%llx", continuation_key);
        err = eggsfs_shard_readdir((struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, continuation_key, &ctx, &continuation_key);
        if (err) { err = eggsfs_error_to_linux(err); goto out_err; }
        eof = continuation_key == 0;
    }

    kunmap(ctx.current_page);
    eggsfs_dir_set_page_n(ctx.current_page, ctx.page_n);

    return first_page;

out_err:
    kunmap(ctx.current_page);
    while (first_page) {
        struct page* next = (struct page*)first_page->mapping;
        first_page->mapping = NULL;
        put_page(first_page);
        first_page = next;
    }
    eggsfs_info("err=%d", err);
    return ERR_PTR(err);
}

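// Emits cached entries to the VFS, resuming from the per-open cursor in
// filp->private_data; position within the cache is tracked by the cursor
// rather than recovered from f_pos.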
// vfs: exclusive inode.i_rwsem
// vfs: mutex file.f_pos_lock
static int eggsfs_dir_read(struct file* filp, struct dir_context* dctx) {
    struct eggsfs_readdir_ctx* ctx = filp->private_data;
    char* page_addr;

    WARN_ON(!ctx);
    if (!ctx) { return -EINVAL; }

    eggsfs_debug("ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u", ctx,
        ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n);

    if (!dir_emit_dots(filp, dctx)) { return 0; }
    if (!ctx->current_page || !eggsfs_dir_get_page_n(ctx->current_page)) { return 0; }

    page_addr = kmap(ctx->current_page);

    while (ctx->current_page) {
        int dtype;
        const char* entry = page_addr + ctx->page_offset;
        eggsfs_debug("next page_addr=%p ctx.page_offset=%u entry=%p", page_addr, ctx->page_offset, entry);

        u64 ino;
        u8 name_len;
        const char* name;
        eggsfs_dir_entry_parse(entry, &ino, &name_len, &name);
        eggsfs_debug("name=%*pE ino=0x%016llx", name_len, name, ino);

        switch (eggsfs_inode_type(ino)) {
        case EGGSFS_INODE_DIRECTORY: dtype = DT_DIR; break;
        case EGGSFS_INODE_FILE: dtype = DT_REG; break;
        case EGGSFS_INODE_SYMLINK: dtype = DT_LNK; break;
        default: dtype = DT_UNKNOWN;
        }

        if (!dir_emit(dctx, name, name_len, ino, dtype)) {
            eggsfs_debug("break");
            break;
        }

        ctx->page_offset += eggsfs_dir_entry_size(name_len);
        ++ctx->page_n;
        ++dctx->pos;

        eggsfs_debug(
            "next ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u page_n=%u", ctx,
            ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n, eggsfs_dir_get_page_n(ctx->current_page)
        );

        if (ctx->page_n >= eggsfs_dir_get_page_n(ctx->current_page)) {
            kunmap(ctx->current_page);
            ctx->current_page = (struct page*)ctx->current_page->mapping;
            eggsfs_debug("next page current_page = %p", ctx->current_page);
            ctx->page_offset = 0;
            ctx->page_n = 0;
            // ->mapping of the last page is NULL, so only map the next page if
            // there is one; the loop condition then terminates the walk.
            if (ctx->current_page) { page_addr = kmap(ctx->current_page); }
        }
    }

    if (ctx->current_page) {
        eggsfs_debug("unmap current_page = %p", ctx->current_page);
        kunmap(ctx->current_page);
    }

    eggsfs_debug("done");
    return 0;
}

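// Disabled sketch of directory seeking: only SEEK_CUR is handled, and the
// concurrency questions noted within are unresolved.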
#if 0
// multiple seeks at the same time?
static loff_t eggsfs_llseek_dir(struct file* filp, loff_t offset, int whence) {
    int err;
    struct eggsfs_readdir_ctx* ctx = filp->private_data;
    loff_t dir_offset;

    if (whence != SEEK_CUR) { return -EINVAL; }
    // concurrency with readdir?
    // TODO: if offset == 0 reacquire pages

    ctx->current_page = ctx->pages;
    ctx->page_offset = 0;
    ctx->page_n = 0;
    if (offset > 2) {
        dir_offset = offset - 2;
        while (ctx->current_page && eggsfs_dir_get_page_n(ctx->current_page) <= dir_offset) {
            dir_offset -= eggsfs_dir_get_page_n(ctx->current_page);
            ctx->current_page = (struct page*)ctx->current_page->mapping;
        }
        if (ctx->current_page) {
            char* page_addr = (char*)kmap(ctx->current_page);
            while (ctx->page_n < dir_offset) {
                BUG_ON(ctx->page_offset >= PAGE_SIZE);
                u64 ino;
                u8 name_len;
                const char* name;
                eggsfs_dir_entry_parse(page_addr + ctx->page_offset, &ino, &name_len, &name);
                ctx->page_offset += eggsfs_dir_entry_size(name_len);
                ++ctx->page_n;
            }
            kunmap(ctx->current_page);
        }
    }

    filp->f_pos = offset;

    return offset;
}
#endif

static int eggsfs_dir_close(struct inode* inode, struct file* filp) {
    struct eggsfs_readdir_ctx* ctx = (struct eggsfs_readdir_ctx*)filp->private_data;

    WARN_ON(!ctx);
    if (!ctx) { return -EINVAL; }

    trace_eggsfs_vfs_closedir_enter(inode);
    eggsfs_dir_put_cache(ctx->pages);
    kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx);
    trace_eggsfs_vfs_closedir_exit(inode, 0);
    return 0;
}

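// Directory file_operations: no ->llseek is wired up (see the compiled-out
// eggsfs_llseek_dir sketch above).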
struct file_operations eggsfs_dir_operations = {
    .open = eggsfs_dir_open,
    .iterate = eggsfs_dir_read,
    .iterate_shared = eggsfs_dir_read,
    .release = eggsfs_dir_close,
};

int __init eggsfs_dir_init(void) {
    eggsfs_readdir_ctx_cachep = kmem_cache_create(
        "eggsfs_readdir_ctx_cache",
        sizeof(struct eggsfs_readdir_ctx),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        NULL
    );
    if (!eggsfs_readdir_ctx_cachep) { return -ENOMEM; }
    return 0;
}

void __cold eggsfs_dir_exit(void) {
    kmem_cache_destroy(eggsfs_readdir_ctx_cachep);
}