mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-01-06 11:00:10 -06:00
465 lines
16 KiB
C
465 lines
16 KiB
C
#include "inode.h"
|
|
|
|
#include <linux/sched/mm.h>
|
|
|
|
#include "log.h"
|
|
#include "dir.h"
|
|
#include "metadata.h"
|
|
#include "dentry.h"
|
|
#include "trace.h"
|
|
#include "err.h"
|
|
#include "file.h"
|
|
#include "wq.h"
|
|
#include "span.h"
|
|
|
|
// Slab cache backing every struct eggsfs_inode. Created by eggsfs_inode_init(),
// used by eggsfs_inode_alloc()/eggsfs_inode_free(), destroyed by
// eggsfs_inode_exit().
static struct kmem_cache* eggsfs_inode_cachep;
|
|
|
|
struct inode* eggsfs_inode_alloc(struct super_block* sb) {
|
|
struct eggsfs_inode* enode;
|
|
|
|
eggsfs_debug("sb=%p", sb);
|
|
|
|
enode = (struct eggsfs_inode*)kmem_cache_alloc(eggsfs_inode_cachep, GFP_NOFS);
|
|
if (!enode) {
|
|
return NULL;
|
|
}
|
|
|
|
enode->mtime = 0;
|
|
enode->mtime_expiry = 0;
|
|
enode->edge_creation_time = 0;
|
|
eggsfs_latch_init(&enode->getattr_update_latch);
|
|
|
|
eggsfs_debug("done enode=%p", enode);
|
|
return &enode->inode;
|
|
}
|
|
|
|
void eggsfs_inode_evict(struct inode* inode) {
|
|
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
|
eggsfs_debug("enode=%p", enode);
|
|
if (S_ISDIR(inode->i_mode)) {
|
|
eggsfs_dir_drop_cache(enode);
|
|
} else if (S_ISREG(inode->i_mode)) {
|
|
eggsfs_drop_file_spans(enode);
|
|
}
|
|
truncate_inode_pages(&inode->i_data, 0);
|
|
clear_inode(inode);
|
|
}
|
|
|
|
void eggsfs_inode_free(struct inode* inode) {
|
|
// We need to clear the spans
|
|
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
|
eggsfs_debug("enode=%p", enode);
|
|
kmem_cache_free(eggsfs_inode_cachep, enode);
|
|
}
|
|
|
|
static void eggsfs_inode_init_once(void* ptr) {
|
|
struct eggsfs_inode* enode = (struct eggsfs_inode*)ptr;
|
|
|
|
inode_init_once(&enode->inode);
|
|
}
|
|
|
|
// Creates the slab cache used for eggsfs inodes.
// Returns 0 on success, -ENOMEM if the cache could not be created.
int __init eggsfs_inode_init(void) {
    struct kmem_cache* cache = kmem_cache_create(
        "eggsfs_inode_cache",
        sizeof(struct eggsfs_inode),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        eggsfs_inode_init_once
    );

    eggsfs_inode_cachep = cache;
    return cache ? 0 : -ENOMEM;
}
|
|
|
|
// Tears down the inode slab cache created by eggsfs_inode_init.
void __cold eggsfs_inode_exit(void) {
    eggsfs_debug("inode exit");

    // Let every pending RCU callback run before the cache goes away
    // (presumably inodes can still be queued for an RCU-delayed free --
    // confirm against the super_operations wiring).
    rcu_barrier();

    kmem_cache_destroy(eggsfs_inode_cachep);
}
|
|
|
|
|
|
int eggsfs_do_getattr(struct eggsfs_inode* enode) {
|
|
int err;
|
|
s64 seqno;
|
|
|
|
again: // progress: whoever wins the lock won't try again
|
|
eggsfs_debug("enode=%p id=0x%016lx mtime=%lld mtime_expiry=%lld", enode, enode->inode.i_ino, enode->mtime, enode->mtime_expiry);
|
|
|
|
if (eggsfs_latch_try_acquire(&enode->getattr_update_latch, seqno)) {
|
|
u64 ts = get_jiffies_64();
|
|
if (smp_load_acquire(&enode->mtime_expiry) > ts) { err = 0; goto out; }
|
|
|
|
u64 mtime;
|
|
u64 expiry;
|
|
bool has_atime = false;
|
|
u64 atime;
|
|
if (S_ISDIR(enode->inode.i_mode)) {
|
|
u64 owner;
|
|
struct eggsfs_policy_body block_policy;
|
|
struct eggsfs_policy_body span_policy;
|
|
struct eggsfs_policy_body stripe_policy;
|
|
err = eggsfs_shard_getattr_dir(
|
|
(struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
|
|
enode->inode.i_ino,
|
|
&mtime,
|
|
&owner,
|
|
&block_policy,
|
|
&span_policy,
|
|
&stripe_policy
|
|
);
|
|
if (err == 0) {
|
|
if (block_policy.len) {
|
|
enode->block_policy = eggsfs_upsert_policy(enode->inode.i_ino, BLOCK_POLICY_TAG, block_policy.body, block_policy.len);
|
|
}
|
|
if (span_policy.len) {
|
|
enode->span_policy = eggsfs_upsert_policy(enode->inode.i_ino, SPAN_POLICY_TAG, span_policy.body, span_policy.len);
|
|
}
|
|
if (stripe_policy.len) {
|
|
enode->stripe_policy = eggsfs_upsert_policy(enode->inode.i_ino, STRIPE_POLICY_TAG, stripe_policy.body, stripe_policy.len);
|
|
}
|
|
expiry = ts + eggsfs_dir_refresh_time_jiffies;
|
|
}
|
|
} else {
|
|
// Make it so that this function is only called once. Afterwards
|
|
// we always keep this up to date internally, until the file is
|
|
// linked in which case it never changes size nor mtime TODO
|
|
// this will change soon as we add utime.
|
|
if (enode->file.status == EGGSFS_FILE_STATUS_READING || enode->file.status == EGGSFS_FILE_STATUS_NONE) {
|
|
eggsfs_debug("updating getattr for reading file");
|
|
u64 size;
|
|
err = eggsfs_shard_getattr_file(
|
|
(struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
|
|
enode->inode.i_ino,
|
|
&mtime,
|
|
&atime,
|
|
&size
|
|
);
|
|
has_atime = true;
|
|
if (err == EGGSFS_ERR_FILE_NOT_FOUND && enode->file.status == EGGSFS_FILE_STATUS_NONE) { // probably just created
|
|
enode->inode.i_size = 0;
|
|
expiry = 0;
|
|
mtime = 0;
|
|
} else if (err == 0) {
|
|
enode->inode.i_size = size;
|
|
expiry = ts + eggsfs_file_refresh_time_jiffies;
|
|
}
|
|
} else {
|
|
BUG_ON(enode->file.status != EGGSFS_FILE_STATUS_WRITING);
|
|
expiry = ~(uint64_t)0; // we take care of this from now on
|
|
enode->inode.i_size = 0;
|
|
mtime = 0;
|
|
}
|
|
}
|
|
if (err) { err = eggsfs_error_to_linux(err); goto out; }
|
|
else {
|
|
WRITE_ONCE(enode->mtime, mtime);
|
|
eggsfs_debug("got mtime %llu", mtime);
|
|
enode->inode.i_mtime.tv_sec = mtime / 1000000000;
|
|
enode->inode.i_mtime.tv_nsec = mtime % 1000000000;
|
|
if (has_atime) {
|
|
eggsfs_debug("got atime %llu", mtime);
|
|
enode->inode.i_atime.tv_sec = atime / 1000000000;
|
|
enode->inode.i_atime.tv_nsec = atime % 1000000000;
|
|
}
|
|
}
|
|
|
|
smp_store_release(&enode->mtime_expiry, expiry);
|
|
|
|
out:
|
|
WRITE_ONCE(enode->getattr_err, err);
|
|
eggsfs_latch_release(&enode->getattr_update_latch, seqno);
|
|
eggsfs_debug("out mtime=%llu mtime_expiry=%llu", enode->mtime, enode->mtime_expiry);
|
|
return err;
|
|
} else {
|
|
err = eggsfs_latch_wait_killable(&enode->getattr_update_latch, seqno);
|
|
if (err) { return err; }
|
|
if (READ_ONCE(enode->getattr_err)) { goto again; }
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
static struct eggsfs_inode* eggsfs_create_internal(struct inode* parent, int itype, struct dentry* dentry) {
|
|
int err;
|
|
|
|
BUG_ON(dentry->d_inode);
|
|
|
|
if (current->group_leader->mm == NULL) {
|
|
// internal-repo/blob/main/docs/kmod-file-tracking.md
|
|
eggsfs_warn("current->group_leader->mm = NULL, called from kernel thread?");
|
|
return(ERR_PTR(-EIO));
|
|
}
|
|
|
|
struct eggsfs_inode* parent_enode = EGGSFS_I(parent);
|
|
|
|
if (dentry->d_name.len > EGGSFS_MAX_FILENAME) { return ERR_PTR(-ENAMETOOLONG); }
|
|
|
|
eggsfs_debug("creating %*s in %016lx", dentry->d_name.len, dentry->d_name.name, parent->i_ino);
|
|
|
|
u64 ino, cookie;
|
|
err = eggsfs_error_to_linux(eggsfs_shard_create_file(
|
|
(struct eggsfs_fs_info*)parent->i_sb->s_fs_info, eggsfs_inode_shard(parent->i_ino),
|
|
itype, dentry->d_name.name, dentry->d_name.len, &ino, &cookie
|
|
));
|
|
if (err) { return ERR_PTR(eggsfs_error_to_linux(err)); }
|
|
// make this dentry stick around?
|
|
|
|
struct inode* inode = eggsfs_get_inode(parent->i_sb, parent_enode, ino); // new_inode
|
|
if (IS_ERR(inode)) { return ERR_PTR(PTR_ERR(inode)); }
|
|
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
|
|
|
enode->file.status = EGGSFS_FILE_STATUS_WRITING;
|
|
|
|
// Initialize all the transient specific fields
|
|
enode->file.cookie = cookie;
|
|
atomic_set(&enode->file.transient_err, 0);
|
|
enode->file.writing_span = NULL;
|
|
sema_init(&enode->file.flushing_span_sema, 1); // 1 = free to flush
|
|
enode->file.owner = current->group_leader;
|
|
enode->file.mm = current->group_leader->mm;
|
|
// We might need the mm beyond the file lifetime, to clear up block write pages MM_FILEPAGES
|
|
mmgrab(enode->file.mm);
|
|
|
|
return enode;
|
|
}
|
|
|
|
// inode_operations.create: creates a transient regular file under `parent`
// and attaches it to `dentry`. `mode` and `excl` are not used.
// Returns 0 or the error from eggsfs_create_internal.
static int eggsfs_create(struct inode* parent, struct dentry* dentry, umode_t mode, bool excl) {
    struct eggsfs_inode* created = eggsfs_create_internal(parent, EGGSFS_INODE_FILE, dentry);
    if (IS_ERR(created)) {
        return PTR_ERR(created);
    }

    struct inode* inode = &created->inode;

    // The file is not valid until it has been filled in, so the dentry is
    // invalidated right after being instantiated.
    d_instantiate(dentry, inode);
    d_invalidate(dentry);

    return 0;
}
|
|
|
|
// vfs: refcount of path->dentry
|
|
static int eggsfs_getattr(const struct path* path, struct kstat* stat, u32 request_mask, unsigned int query_flags) {
|
|
struct inode* inode = d_inode(path->dentry);
|
|
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
|
|
|
trace_eggsfs_vfs_getattr_enter(inode);
|
|
|
|
// >= so that eggsfs_dir_refresh_time=0 causes revalidation at every call to this function
|
|
if (get_jiffies_64() >= smp_load_acquire(&enode->mtime_expiry)) {
|
|
int err;
|
|
|
|
// FIXME: symlinks
|
|
if (!(request_mask & STATX_MTIME) && (S_ISDIR(inode->i_mode) || !(request_mask & STATX_SIZE))) {
|
|
goto done;
|
|
}
|
|
|
|
trace_eggsfs_vfs_getattr_lock(inode);
|
|
|
|
// dentry refcount also protects the inode (e.g. d_delete will not turn used dentry into a negative one),
|
|
// so no need to grab anything before we start waiting for stuff
|
|
err = eggsfs_do_getattr(enode);
|
|
if (err) {
|
|
trace_eggsfs_vfs_getattr_exit(inode, err);
|
|
return err;
|
|
}
|
|
}
|
|
|
|
done:
|
|
generic_fillattr(inode, stat);
|
|
trace_eggsfs_vfs_getattr_exit(inode, 0);
|
|
return 0;
|
|
}
|
|
|
|
static int eggsfs_setattr(struct dentry* dentry, struct iattr* attr) {
|
|
// ATTR_TIMES_SET/ATTR_TOUCH is just to distinguish between a touch
|
|
// and an explicit time set
|
|
// Note that `utimes_common` unconditionally sets ATTR_CTIME, but we
|
|
// can't do much with it.
|
|
if (attr->ia_valid & ~(ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_TIMES_SET|ATTR_CTIME|ATTR_TOUCH)) {
|
|
eggsfs_debug("skipping setattr, ia_valid=%08x", attr->ia_valid);
|
|
return -EPERM;
|
|
}
|
|
|
|
uint64_t now = ktime_get_real_ns();
|
|
|
|
uint64_t atime = 0;
|
|
if (attr->ia_valid & ATTR_ATIME) {
|
|
if (attr->ia_valid & ATTR_ATIME_SET) {
|
|
atime = (attr->ia_atime.tv_sec * 1000000000ll) + attr->ia_atime.tv_nsec;
|
|
} else {
|
|
atime = now;
|
|
}
|
|
eggsfs_debug("setting atime to %llu", atime);
|
|
atime |= 1ull << 63;
|
|
}
|
|
|
|
uint64_t mtime = 0;
|
|
if (attr->ia_valid & ATTR_MTIME) {
|
|
if (attr->ia_valid & ATTR_MTIME_SET) {
|
|
mtime = (attr->ia_mtime.tv_sec * 1000000000ll) + attr->ia_mtime.tv_nsec;
|
|
} else {
|
|
mtime = now;
|
|
}
|
|
eggsfs_debug("setting mtime to %llu", mtime);
|
|
mtime |= 1ull << 63;
|
|
}
|
|
|
|
int err = eggsfs_shard_set_time((struct eggsfs_fs_info*)dentry->d_inode->i_sb->s_fs_info, dentry->d_inode->i_ino, mtime, atime);
|
|
if (err) { return err; }
|
|
|
|
// could copy out attributes instead?
|
|
smp_store_release(&EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int eggsfs_symlink(struct inode* dir, struct dentry* dentry, const char* path) {
|
|
struct eggsfs_inode* enode = eggsfs_create_internal(dir, EGGSFS_INODE_SYMLINK, dentry);
|
|
if (IS_ERR(enode)) { return PTR_ERR(enode); }
|
|
|
|
enode->file.status = EGGSFS_FILE_STATUS_WRITING; // needed for flush to flush below
|
|
|
|
size_t len = strlen(path);
|
|
|
|
// We now need to write out the symlink contents...
|
|
loff_t ppos = 0;
|
|
struct kvec vec;
|
|
struct iov_iter from;
|
|
vec.iov_base = (void*)path;
|
|
vec.iov_len = len;
|
|
iov_iter_kvec(&from, READ, &vec, 1, vec.iov_len);
|
|
inode_lock(&enode->inode);
|
|
int err = eggsfs_file_write(enode, 0, &ppos, &from);
|
|
inode_unlock(&enode->inode);
|
|
if (err < 0) { return err; }
|
|
// ...and flush them
|
|
err = eggsfs_file_flush(enode, dentry);
|
|
if (err < 0) { return err; }
|
|
|
|
// Now we link the dentry in
|
|
d_instantiate(dentry, &enode->inode);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const char* eggsfs_get_link(struct dentry* dentry, struct inode* inode, struct delayed_call* destructor) {
|
|
// Can't be bothered to think about RCU
|
|
if (dentry == NULL) { return ERR_PTR(-ECHILD); }
|
|
|
|
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
|
char* buf = eggsfs_read_link(enode);
|
|
|
|
if (IS_ERR(buf)) { return buf; }
|
|
|
|
destructor->fn = eggsfs_link_destructor;
|
|
destructor->arg = buf;
|
|
|
|
return buf;
|
|
}
|
|
|
|
static const struct inode_operations eggsfs_dir_inode_ops = {
|
|
.create = eggsfs_create,
|
|
.lookup = eggsfs_lookup,
|
|
.unlink = eggsfs_unlink,
|
|
.mkdir = eggsfs_mkdir,
|
|
.rmdir = eggsfs_rmdir,
|
|
.rename = eggsfs_rename,
|
|
.getattr = eggsfs_getattr,
|
|
.symlink = eggsfs_symlink,
|
|
};
|
|
|
|
static const struct inode_operations eggsfs_file_inode_ops = {
|
|
.getattr = eggsfs_getattr,
|
|
.setattr = eggsfs_setattr,
|
|
};
|
|
|
|
static const struct inode_operations eggsfs_symlink_inode_ops = {
|
|
.getattr = eggsfs_getattr,
|
|
.setattr = eggsfs_setattr,
|
|
.get_link = eggsfs_get_link,
|
|
};
|
|
|
|
// File operations for directories, defined outside of this file; installed
// as i_fop on directory inodes by eggsfs_get_inode.
extern struct file_operations eggsfs_dir_operations;
|
|
|
|
// Returns the inode with id `ino` on `sb`, initializing it if it was not
// already known to the VFS.
//
// A freshly created inode gets its type from the inode id, fixed
// permissions (0777 for directories, 0666 otherwise), uid/gid 1000, the
// matching operation tables, and its directory- or file-specific state.
// Its block/span/stripe policies are copied from `parent`; when `parent` is
// NULL they are copied from the root inode, or set to NULL if `ino` is the
// root itself.
//
// Returns ERR_PTR(-ENOMEM) if the inode cannot be obtained.
struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* parent, u64 ino) {
    trace_eggsfs_get_inode_enter(ino);

    struct inode* inode = iget_locked(sb, ino); // new_inode when possible?
    if (inode == NULL) {
        trace_eggsfs_get_inode_exit(ino, NULL, false, 0);
        return ERR_PTR(-ENOMEM);
    }

    bool new = inode->i_state & I_NEW;
    if (!new) {
        // Already initialized by whoever created it, nothing to do.
        trace_eggsfs_get_inode_exit(ino, inode, new, 0);
        return inode;
    }

    struct eggsfs_inode* enode = EGGSFS_I(inode);

    // Type, permissions and inode operations all follow from the inode id.
    switch (eggsfs_inode_type(ino)) {
    case EGGSFS_INODE_DIRECTORY:
        inode->i_mode = S_IFDIR | 0777;
        inode->i_op = &eggsfs_dir_inode_ops;
        break;
    case EGGSFS_INODE_FILE:
        inode->i_mode = S_IFREG | 0666;
        inode->i_op = &eggsfs_file_inode_ops;
        break;
    case EGGSFS_INODE_SYMLINK:
        inode->i_mode = S_IFLNK | 0666;
        inode->i_op = &eggsfs_symlink_inode_ops;
        break;
    default:
        BUG();
    }

    inode->i_blocks = 0;
    inode->i_size = 0;
    inode->i_uid = make_kuid(&init_user_ns, 1000);
    inode->i_gid = make_kgid(&init_user_ns, 1000);

    if (S_ISDIR(inode->i_mode)) {
        inode->i_fop = &eggsfs_dir_operations;

        rcu_assign_pointer(enode->dir.pages, NULL);
        eggsfs_latch_init(&enode->dir.pages_latch);
    } else {
        // Regular files and symlinks share the file machinery.
        inode->i_fop = &eggsfs_file_operations;

        enode->file.status = EGGSFS_FILE_STATUS_NONE;

        // Init normal file stuff -- that's always there. The transient
        // file stuff is only filled in if needed.
        enode->file.spans = RB_ROOT;
        init_rwsem(&enode->file.spans_lock);
        atomic64_set(&enode->file.prefetch_section, 0);
        atomic_set(&enode->file.in_flight, 0);
        init_waitqueue_head(&enode->file.in_flight_wq);
    }

    // Policies are inherited from the parent when we have one, from the
    // root otherwise. Only the root itself has nowhere to inherit from.
    struct eggsfs_inode* policy_source = parent;
    if (policy_source == NULL && ino != EGGSFS_ROOT_INODE) {
        policy_source = EGGSFS_I(sb->s_root->d_inode);
    }
    if (policy_source != NULL) {
        enode->block_policy = policy_source->block_policy;
        enode->span_policy = policy_source->span_policy;
        enode->stripe_policy = policy_source->stripe_policy;
    } else {
        enode->block_policy = NULL;
        enode->span_policy = NULL;
        enode->stripe_policy = NULL;
    }

    unlock_new_inode(inode);

    trace_eggsfs_get_inode_exit(ino, inode, new, 0);
    return inode;
}
|
|
|
|
void eggsfs_wait_in_flight(struct eggsfs_inode* enode) {
|
|
if (atomic_read(&enode->file.in_flight) == 0) { return; }
|
|
|
|
long res = wait_event_timeout(enode->file.in_flight_wq, atomic_read(&enode->file.in_flight) == 0, 10 * HZ);
|
|
if (res > 0) { return; }
|
|
eggsfs_warn("waited for 10 seconds for in flight requests for inode %016lx, either some requests are stuck or this is a bug, will wait for a minute more", enode->inode.i_ino);
|
|
|
|
res = wait_event_timeout(enode->file.in_flight_wq, atomic_read(&enode->file.in_flight) == 0, 60 * HZ);
|
|
if (res > 0) { return; }
|
|
eggsfs_warn("waited for 60 seconds for in flight requests for inode %016lx, either some requests are stuck or this is a bug", enode->inode.i_ino);
|
|
}
|