Continue tightening various ownership structures

Also, start renaming static stuff to take the `eggsfs` prefix out, since I get
tired of typing it. Various other tweaks, too.
This commit is contained in:
Francesco Mazzoli
2023-06-09 11:34:46 +00:00
parent 72631dca4d
commit 583b53a111
21 changed files with 749 additions and 404 deletions

1
kmod/bincode.c Normal file
View File

@@ -0,0 +1 @@
#include "bincodegen.c"

View File

@@ -150,4 +150,6 @@ static const u8 EGGSFS_FLASH_STORAGE = 3;
#define EGGSFS_BLOCK_SERVICE_DONT_READ (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_READ | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED)
#define EGGSFS_BLOCK_SERVICE_DONT_WRITE (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_WRITE | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED)
#define EGGSFS_MAX_STRIPES 15
#endif

109
kmod/bincodegen.c Normal file
View File

@@ -0,0 +1,109 @@
// Maps an eggsfs protocol error code to its symbolic name.
//
// Known codes form the contiguous range [10, 66]; anything outside that
// range yields "UNKNOWN". The returned string is a static constant and must
// not be freed or modified.
const char* eggsfs_err_str(int err) {
    enum { FIRST_ERR = 10, LAST_ERR = 66 };
    static const char* const names[LAST_ERR - FIRST_ERR + 1] = {
        [10 - FIRST_ERR] = "INTERNAL_ERROR",
        [11 - FIRST_ERR] = "FATAL_ERROR",
        [12 - FIRST_ERR] = "TIMEOUT",
        [13 - FIRST_ERR] = "MALFORMED_REQUEST",
        [14 - FIRST_ERR] = "MALFORMED_RESPONSE",
        [15 - FIRST_ERR] = "NOT_AUTHORISED",
        [16 - FIRST_ERR] = "UNRECOGNIZED_REQUEST",
        [17 - FIRST_ERR] = "FILE_NOT_FOUND",
        [18 - FIRST_ERR] = "DIRECTORY_NOT_FOUND",
        [19 - FIRST_ERR] = "NAME_NOT_FOUND",
        [20 - FIRST_ERR] = "EDGE_NOT_FOUND",
        [21 - FIRST_ERR] = "EDGE_IS_LOCKED",
        [22 - FIRST_ERR] = "TYPE_IS_DIRECTORY",
        [23 - FIRST_ERR] = "TYPE_IS_NOT_DIRECTORY",
        [24 - FIRST_ERR] = "BAD_COOKIE",
        [25 - FIRST_ERR] = "INCONSISTENT_STORAGE_CLASS_PARITY",
        [26 - FIRST_ERR] = "LAST_SPAN_STATE_NOT_CLEAN",
        [27 - FIRST_ERR] = "COULD_NOT_PICK_BLOCK_SERVICES",
        [28 - FIRST_ERR] = "BAD_SPAN_BODY",
        [29 - FIRST_ERR] = "SPAN_NOT_FOUND",
        [30 - FIRST_ERR] = "BLOCK_SERVICE_NOT_FOUND",
        [31 - FIRST_ERR] = "CANNOT_CERTIFY_BLOCKLESS_SPAN",
        [32 - FIRST_ERR] = "BAD_NUMBER_OF_BLOCKS_PROOFS",
        [33 - FIRST_ERR] = "BAD_BLOCK_PROOF",
        [34 - FIRST_ERR] = "CANNOT_OVERRIDE_NAME",
        [35 - FIRST_ERR] = "NAME_IS_LOCKED",
        [36 - FIRST_ERR] = "MTIME_IS_TOO_RECENT",
        [37 - FIRST_ERR] = "MISMATCHING_TARGET",
        [38 - FIRST_ERR] = "MISMATCHING_OWNER",
        [39 - FIRST_ERR] = "MISMATCHING_CREATION_TIME",
        [40 - FIRST_ERR] = "DIRECTORY_NOT_EMPTY",
        [41 - FIRST_ERR] = "FILE_IS_TRANSIENT",
        [42 - FIRST_ERR] = "OLD_DIRECTORY_NOT_FOUND",
        [43 - FIRST_ERR] = "NEW_DIRECTORY_NOT_FOUND",
        [44 - FIRST_ERR] = "LOOP_IN_DIRECTORY_RENAME",
        [45 - FIRST_ERR] = "DIRECTORY_HAS_OWNER",
        [46 - FIRST_ERR] = "FILE_IS_NOT_TRANSIENT",
        [47 - FIRST_ERR] = "FILE_NOT_EMPTY",
        [48 - FIRST_ERR] = "CANNOT_REMOVE_ROOT_DIRECTORY",
        [49 - FIRST_ERR] = "FILE_EMPTY",
        [50 - FIRST_ERR] = "CANNOT_REMOVE_DIRTY_SPAN",
        [51 - FIRST_ERR] = "BAD_SHARD",
        [52 - FIRST_ERR] = "BAD_NAME",
        [53 - FIRST_ERR] = "MORE_RECENT_SNAPSHOT_EDGE",
        [54 - FIRST_ERR] = "MORE_RECENT_CURRENT_EDGE",
        [55 - FIRST_ERR] = "BAD_DIRECTORY_INFO",
        [56 - FIRST_ERR] = "DEADLINE_NOT_PASSED",
        [57 - FIRST_ERR] = "SAME_SOURCE_AND_DESTINATION",
        [58 - FIRST_ERR] = "SAME_DIRECTORIES",
        [59 - FIRST_ERR] = "SAME_SHARD",
        [60 - FIRST_ERR] = "BAD_PROTOCOL_VERSION",
        [61 - FIRST_ERR] = "BAD_CERTIFICATE",
        [62 - FIRST_ERR] = "BLOCK_TOO_RECENT_FOR_DELETION",
        [63 - FIRST_ERR] = "BLOCK_FETCH_OUT_OF_BOUNDS",
        [64 - FIRST_ERR] = "BAD_BLOCK_CRC",
        [65 - FIRST_ERR] = "BLOCK_TOO_BIG",
        [66 - FIRST_ERR] = "BLOCK_NOT_FOUND",
    };

    // Range-check before subtracting so that no out-of-range index is formed.
    if (err < FIRST_ERR || err > LAST_ERR) { return "UNKNOWN"; }
    return names[err - FIRST_ERR];
}
// Maps a shard request kind to its symbolic name, or "UNKNOWN" when the
// kind is not one of the values listed below. The returned string is a
// static constant.
const char* eggsfs_shard_kind_str(int kind) {
    static const struct {
        int code;
        const char* name;
    } kinds[] = {
        { 1, "LOOKUP" },
        { 2, "STAT_FILE" },
        { 4, "STAT_DIRECTORY" },
        { 5, "READ_DIR" },
        { 6, "CONSTRUCT_FILE" },
        { 7, "ADD_SPAN_INITIATE" },
        { 8, "ADD_SPAN_CERTIFY" },
        { 9, "LINK_FILE" },
        { 10, "SOFT_UNLINK_FILE" },
        { 11, "FILE_SPANS" },
        { 12, "SAME_DIRECTORY_RENAME" },
        { 16, "ADD_INLINE_SPAN" },
        { 14, "SNAPSHOT_LOOKUP" },
    };
    unsigned int ix;

    // The set of codes is sparse, so scan the table instead of indexing it.
    for (ix = 0; ix < sizeof(kinds) / sizeof(kinds[0]); ix++) {
        if (kinds[ix].code == kind) { return kinds[ix].name; }
    }
    return "UNKNOWN";
}
// Maps a CDC request kind (1..4) to its symbolic name; any other value
// yields "UNKNOWN". The returned string is a static constant.
const char* eggsfs_cdc_kind_str(int kind) {
    static const char* const names[] = {
        "MAKE_DIRECTORY",        // 1
        "RENAME_FILE",           // 2
        "SOFT_UNLINK_DIRECTORY", // 3
        "RENAME_DIRECTORY",      // 4
    };

    if (kind < 1 || kind > 4) { return "UNKNOWN"; }
    return names[kind - 1];
}
// Maps a shuckle request kind to its symbolic name, or "UNKNOWN" when the
// kind is not recognised. The returned string is a static constant.
const char* eggsfs_shuckle_kind_str(int kind) {
    if (kind == 3) { return "SHARDS"; }
    if (kind == 7) { return "CDC"; }
    if (kind == 8) { return "INFO"; }
    return "UNKNOWN";
}
// Maps a block service request kind to its symbolic name, or "UNKNOWN"
// when the kind is not recognised. The returned string is a static constant.
const char* eggsfs_blocks_kind_str(int kind) {
    const char* name = "UNKNOWN";

    if (kind == 2) {
        name = "FETCH_BLOCK";
    } else if (kind == 3) {
        name = "WRITE_BLOCK";
    }
    return name;
}

View File

@@ -1,4 +1,4 @@
#include "namei.h"
#include "dentry.h"
#include "log.h"
#include "dir.h"

View File

@@ -1,5 +1,5 @@
#ifndef _EGGSFS_NAMEI_H
#define _EGGSFS_NAMEI_H
#ifndef _EGGSFS_DENTRY_H
#define _EGGSFS_DENTRY_H
#include <linux/fs.h>
@@ -7,6 +7,8 @@
EGGSFS_DECLARE_COUNTER(eggsfs_stat_dir_revalidations);
extern struct dentry_operations eggsfs_dentry_ops;
struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags);
int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode);
int eggsfs_rmdir(struct inode* dir, struct dentry* dentry);
@@ -17,6 +19,4 @@ int eggsfs_rename(
unsigned int flags
);
extern struct dentry_operations eggsfs_dentry_ops;
#endif

View File

@@ -15,6 +15,8 @@ int eggsfs_dir_refresh_time; // in jiffies
#define eggsfs_dir_get_page_n(_page) ({ *((((u32*)&(_page)->private))+1); })
#define eggsfs_dir_set_page_n(_page, _n) ({ *((((u32*)&(_page)->private))+1) = _n; })
static struct kmem_cache* eggsfs_readdir_ctx_cachep;
static inline int eggsfs_dir_entry_size(int name_len) {
return 8 + 1 + name_len; // inode + len + name
}
@@ -94,7 +96,7 @@ static int eggsfs_dir_open(struct inode* inode, struct file* filp) {
if (err) { return err; }
}
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
ctx = kmem_cache_alloc(eggsfs_readdir_ctx_cachep, GFP_KERNEL);
if (!ctx) { return -ENOMEM; }
again: // progress: whoever wins the lock either get pages or fails
@@ -153,7 +155,7 @@ again: // progress: whoever wins the lock either get pages or fails
return 0;
out_ctx:
kfree(ctx);
kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx);
trace_eggsfs_vfs_opendir_exit(inode, err);
eggsfs_debug("err=%d", err);
return err;
@@ -430,7 +432,7 @@ static int eggsfs_dir_close(struct inode* inode, struct file* filp) {
trace_eggsfs_vfs_closedir_enter(inode);
eggsfs_dir_put_cache(ctx->pages);
kfree(ctx);
kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx);
trace_eggsfs_vfs_closedir_exit(inode, 0);
return 0;
}
@@ -441,3 +443,19 @@ struct file_operations eggsfs_dir_operations = {
.iterate_shared = eggsfs_dir_read,
.release = eggsfs_dir_close,
};
// Creates the slab cache used to allocate `struct eggsfs_readdir_ctx`
// objects and stores it in `eggsfs_readdir_ctx_cachep`.
//
// Returns 0 on success, -ENOMEM if the cache could not be created.
int __init eggsfs_dir_init(void) {
    struct kmem_cache* cache = kmem_cache_create(
        "eggsfs_readdir_ctx_cache",
        sizeof(struct eggsfs_readdir_ctx),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        NULL
    );

    // The global is assigned unconditionally, so it is NULL after a failure.
    eggsfs_readdir_ctx_cachep = cache;
    return cache == NULL ? -ENOMEM : 0;
}
// Destroys the readdir context slab cache (`eggsfs_readdir_ctx_cachep`).
void __cold eggsfs_dir_exit(void) {
kmem_cache_destroy(eggsfs_readdir_ctx_cachep);
}

View File

@@ -37,5 +37,8 @@ static inline int eggsfs_dir_needs_reval(struct eggsfs_inode* dir, struct dentry
return ret;
}
int __init eggsfs_dir_init(void);
void __cold eggsfs_dir_exit(void);
#endif

View File

@@ -1,5 +1,6 @@
#include "err.h"
#include "log.h"
#include "bincode.h"
bool eggsfs_unexpected_error(int err) {
switch (err) {
@@ -14,7 +15,7 @@ bool eggsfs_unexpected_error(int err) {
case EGGSFS_ERR_FILE_NOT_FOUND: return false;
case EGGSFS_ERR_DIRECTORY_NOT_FOUND: return false;
case EGGSFS_ERR_NAME_NOT_FOUND: return false;
case EGGSFS_ERR_EDGE_NOT_FOUND: return true;
case EGGSFS_ERR_EDGE_NOT_FOUND: return false;
case EGGSFS_ERR_EDGE_IS_LOCKED: return true;
case EGGSFS_ERR_TYPE_IS_DIRECTORY: return true;
case EGGSFS_ERR_TYPE_IS_NOT_DIRECTORY: return true;
@@ -33,7 +34,7 @@ bool eggsfs_unexpected_error(int err) {
case EGGSFS_ERR_MTIME_IS_TOO_RECENT: return true;
case EGGSFS_ERR_MISMATCHING_TARGET: return true;
case EGGSFS_ERR_MISMATCHING_OWNER: return true;
case EGGSFS_ERR_MISMATCHING_CREATION_TIME: return true;
case EGGSFS_ERR_MISMATCHING_CREATION_TIME: return false;
case EGGSFS_ERR_DIRECTORY_NOT_EMPTY: return true;
case EGGSFS_ERR_FILE_IS_TRANSIENT: return true;
case EGGSFS_ERR_OLD_DIRECTORY_NOT_FOUND: return true;
@@ -62,6 +63,7 @@ bool eggsfs_unexpected_error(int err) {
case EGGSFS_ERR_BLOCK_TOO_BIG: return true;
case EGGSFS_ERR_BLOCK_NOT_FOUND: return true;
case -ERESTARTSYS: return false;
case -ETIMEDOUT: return false;
default: return true;
}
}
@@ -70,7 +72,7 @@ bool eggsfs_unexpected_error(int err) {
int eggsfs_error_to_linux(int err) {
bool unexpected = eggsfs_unexpected_error(err);
if (unexpected) {
eggsfs_warn("unexpected eggsfs error %d", err);
eggsfs_warn("unexpected eggsfs error %s (%d)", eggsfs_err_str(err), err);
}
WARN_ON(unexpected);
if (err == 0) { return 0; }

View File

@@ -15,6 +15,9 @@
#include "trace.h"
#include "span.h"
#include "wq.h"
#include "bincode.h"
static struct kmem_cache* eggsfs_transient_span_cachep;
struct eggsfs_transient_span {
// The transient span holds a reference to this. As long as the
@@ -65,20 +68,26 @@ static int eggsfs_file_open(struct inode* inode, struct file* filp) {
return 0;
}
// starts out with refcount = 1
static struct eggsfs_transient_span* eggsfs_new_transient_span(struct eggsfs_inode* enode, u64 offset) {
struct eggsfs_transient_span* span = kzalloc(sizeof(struct eggsfs_transient_span), GFP_KERNEL);
if (span == NULL) { return span; }
span->enode = enode;
static void eggsfs_init_transient_span(void* p) {
struct eggsfs_transient_span* span = (struct eggsfs_transient_span*)p;
INIT_LIST_HEAD(&span->pages);
int i;
for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) {
INIT_LIST_HEAD(&span->blocks[i]);
}
ihold(&enode->inode);
spin_lock_init(&span->blocks_proofs_lock);
}
// starts out with refcount = 1
static struct eggsfs_transient_span* eggsfs_new_transient_span(struct eggsfs_inode* enode, u64 offset) {
struct eggsfs_transient_span* span = kmem_cache_alloc(eggsfs_transient_span_cachep, GFP_KERNEL);
if (span == NULL) { return span; }
span->enode = enode;
eggsfs_in_flight_begin(enode);
atomic_set(&span->refcount, 1);
span->offset = offset;
span->written = 0;
memset(span->blocks_proofs, 0, sizeof(span->blocks_proofs));
return span;
}
@@ -88,6 +97,7 @@ static void eggsfs_hold_transient_span(struct eggsfs_transient_span* span) {
static void eggsfs_put_transient_span(struct eggsfs_transient_span* span) {
if (atomic_dec_return(&span->refcount) == 0) {
BUG_ON(spin_is_locked(&span->blocks_proofs_lock));
// free pages, adjust OOM score
int num_pages = 0;
#define FREE_PAGES(__pages) \
@@ -104,9 +114,9 @@ static void eggsfs_put_transient_span(struct eggsfs_transient_span* span) {
}
#undef FREE_PAGES
add_mm_counter(span->enode->file.mm, MM_FILEPAGES, -num_pages);
iput(&span->enode->inode);
eggsfs_in_flight_end(span->enode);
// Free the span itself
kfree(span);
kmem_cache_free(eggsfs_transient_span_cachep, span);
}
}
@@ -348,8 +358,6 @@ static int eggsfs_start_flushing(struct eggsfs_inode* enode, bool non_blocking)
// Here we might block even if we're non blocking, since we don't have
// async metadata requests yet, which is a bit unfortunate.
u32* cell_crcs = NULL;
if (span->storage_class == EGGSFS_EMPTY_STORAGE) {
up(&enode->file.flushing_span_sema); // nothing to do
} else if (span->storage_class == EGGSFS_INLINE_STORAGE) {
@@ -370,8 +378,8 @@ static int eggsfs_start_flushing(struct eggsfs_inode* enode, bool non_blocking)
int D = eggsfs_data_blocks(span->parity);
int B = eggsfs_blocks(span->parity);
int S = span->stripes;
cell_crcs = kzalloc(sizeof(uint32_t)*B*span->stripes, GFP_KERNEL);
if (cell_crcs == NULL) { err = -ENOMEM; goto out_err_keep_writing; }
uint32_t cell_crcs[EGGSFS_MAX_BLOCKS*EGGSFS_MAX_STRIPES];
memset(cell_crcs, 0, sizeof(cell_crcs));
eggsfs_debug("size=%d, D=%d, B=%d, S=%d", span->written, D, B, S);
int i, j;
@@ -486,7 +494,6 @@ out:
// we don't need this anymore (the requests might though)
eggsfs_put_transient_span(span);
enode->file.writing_span = NULL;
if (cell_crcs) { kfree(cell_crcs); }
return err;
out_err_fpu:
@@ -496,7 +503,6 @@ out_err_fpu:
// When we got an error before even starting to flush
out_err_keep_writing:
up(&enode->file.flushing_span_sema); // nothing is flushing
if (cell_crcs) { kfree(cell_crcs); }
return err;
}
@@ -607,22 +613,20 @@ static ssize_t eggsfs_file_write_iter(struct kiocb* iocb, struct iov_iter* from)
int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry) {
inode_lock(&enode->inode);
int err = 0;
// Not writing, so there's nothing to do: files are immutable
if (enode->file.status != EGGSFS_FILE_STATUS_WRITING) {
eggsfs_debug("status=%d, won't flush", enode->file.status);
inode_unlock(&enode->inode);
return 0;
goto out_early;
}
// We are in another process, skip
if (enode->file.owner != current->group_leader) {
eggsfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader);
inode_unlock(&enode->inode);
return 0;
goto out_early;
}
int err = 0;
// If the owner doesn't have an mm anymore, it means that the file
// is being torn down after the process has been terminated. In that case
// we shouldn't even link the file.
@@ -668,7 +672,9 @@ out:
}
enode->file.writing_span = NULL;
}
out_early:
inode_unlock(&enode->inode);
eggsfs_wait_in_flight(enode);
return err;
}
@@ -690,7 +696,7 @@ static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
eggsfs_debug("start of read loop, *ppos=%llu", *ppos);
struct eggsfs_span* span = NULL;
while (*ppos < inode->i_size && iov_iter_count(to)) {
if (span) { eggsfs_span_put(span, true); }
if (span) { eggsfs_put_span(span, true); }
span = eggsfs_get_span(enode, *ppos);
if (IS_ERR(span)) {
written = PTR_ERR(span);
@@ -759,7 +765,7 @@ static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
eggsfs_debug("before loop end, remaining %lu", iov_iter_count(to));
}
out:
if (span) { eggsfs_span_put(span, true); }
if (span) { eggsfs_put_span(span, true); }
eggsfs_debug("out of the loop, written=%ld", written);
if (unlikely(written < 0)) {
eggsfs_debug("reading failed, err=%ld", written);
@@ -767,6 +773,60 @@ out:
return written;
}
// Releases a link buffer by handing `buf` to kfree(). The `void*` signature
// lets it be installed as a destructor callback.
// NOTE(review): presumably `buf` is always a buffer obtained from
// eggsfs_write_link() -- confirm against callers.
void eggsfs_link_destructor(void* buf) {
kfree(buf);
}
// Reads the target of the symlink `enode` into a freshly allocated,
// NUL-terminated buffer.
//
// Returns the buffer on success, or an ERR_PTR() on failure. A successful
// result is always heap-allocated (even for an empty link), so the caller
// owns it and releases it with kfree(), e.g. via eggsfs_link_destructor().
//
// The link body must fit in a single page and in a single span; both are
// enforced with BUG_ON.
char* eggsfs_write_link(struct eggsfs_inode* enode) {
    eggsfs_debug("ino=%016lx", enode->inode.i_ino);

    BUG_ON(eggsfs_inode_type(enode->inode.i_ino) != EGGSFS_INODE_SYMLINK);

    // size might not be filled in
    int err = eggsfs_do_getattr(enode);
    if (err) { return ERR_PTR(err); }

    BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity...
    size_t size = enode->inode.i_size;

    eggsfs_debug("size=%lu", size);

    struct eggsfs_span* span = eggsfs_get_span(enode, 0);
    if (span == NULL) {
        eggsfs_debug("got no span, empty file?");
        // Callers kfree() whatever we return, so never hand out a string
        // literal here: allocate a real (empty) string instead.
        char* empty = kmalloc(1, GFP_KERNEL);
        if (empty == NULL) { return ERR_PTR(-ENOMEM); }
        empty[0] = '\0';
        return empty;
    }
    if (IS_ERR(span)) { return ERR_CAST(span); }

    BUG_ON(span->end-span->start != size); // again, for simplicity

    char* buf = kmalloc(size+1, GFP_KERNEL);
    if (buf == NULL) { err = -ENOMEM; goto out_err; }

    if (span->storage_class == EGGSFS_INLINE_STORAGE) {
        memcpy(buf, EGGSFS_INLINE_SPAN(span)->body, size);
    } else {
        struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
        struct page* page = eggsfs_get_span_page(block_span, 0);
        BUG_ON(page == NULL);
        if (IS_ERR(page)) { err = PTR_ERR(page); goto out_err; }
        char* page_buf = kmap(page);
        memcpy(buf, page_buf, size);
        kunmap(page);
    }
    buf[size] = '\0';
    eggsfs_put_span(span, true);

    eggsfs_debug("link %*pE", (int)size, buf);

    return buf;

out_err:
    eggsfs_debug("get_link err=%d", err);
    // `buf` is NULL when the allocation itself failed (kfree(NULL) is a
    // no-op), and a live buffer when fetching the page failed.
    kfree(buf);
    eggsfs_put_span(span, false);
    return ERR_PTR(err);
}
const struct file_operations eggsfs_file_operations = {
.open = eggsfs_file_open,
.read_iter = eggsfs_file_read_iter,
@@ -774,3 +834,20 @@ const struct file_operations eggsfs_file_operations = {
.flush = eggsfs_file_flush_internal,
.llseek = generic_file_llseek,
};
// Creates the slab cache used to allocate `struct eggsfs_transient_span`
// objects, with `eggsfs_init_transient_span` registered as the cache's
// constructor callback, and stores it in `eggsfs_transient_span_cachep`.
//
// Returns 0 on success, -ENOMEM if the cache could not be created.
int __init eggsfs_file_init(void) {
    struct kmem_cache* cache = kmem_cache_create(
        "eggsfs_transient_span_cache",
        sizeof(struct eggsfs_transient_span),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        &eggsfs_init_transient_span
    );

    // The global is assigned unconditionally, so it is NULL after a failure.
    eggsfs_transient_span_cachep = cache;
    return cache == NULL ? -ENOMEM : 0;
}
// Destroys the transient span slab cache (`eggsfs_transient_span_cachep`).
void __cold eggsfs_file_exit(void) {
// TODO: handle case where there still are requests in flight.
kmem_cache_destroy(eggsfs_transient_span_cachep);
}

View File

@@ -1,15 +1,13 @@
#ifndef _EGGSFS_FILESIMPLE_H
#define _EGGSFS_FILESIMPLE_H
#include <linux/list.h>
#include <linux/workqueue.h>
#ifndef _EGGSFS_FILE_H
#define _EGGSFS_FILE_H
#include "inode.h"
// Must be called with `spans_lock` taken.
struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans);
ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from);
int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry);
void eggsfs_flush_transient_spans(struct work_struct* work);
void eggsfs_link_destructor(void*);
char* eggsfs_write_link(struct eggsfs_inode* enode);
int eggsfs_shard_add_span_initiate_block_cb(
void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate
@@ -17,7 +15,7 @@ int eggsfs_shard_add_span_initiate_block_cb(
extern const struct file_operations eggsfs_file_operations;
ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from);
int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry);
int __init eggsfs_file_init(void);
void __cold eggsfs_file_exit(void);
#endif

View File

@@ -5,7 +5,7 @@
#include "log.h"
#include "dir.h"
#include "metadata.h"
#include "namei.h"
#include "dentry.h"
#include "trace.h"
#include "err.h"
#include "file.h"
@@ -39,15 +39,6 @@ void eggsfs_inode_evict(struct inode* inode) {
if (S_ISDIR(inode->i_mode)) {
eggsfs_dir_drop_cache(enode);
} else if (S_ISREG(inode->i_mode)) {
// Make sure no prefetching is taking place (otherwise the
// dropping of spans below would fail).
int prefetches = atomic_read(&enode->file.prefetches);
if (prefetches) {
wait_event(enode->file.prefetches_wq, atomic_read(&enode->file.prefetches) == 0);
}
// TODO verify that there are no flushing spans -- it should always be
// the case given that we get references to the inode
// Free span cache
eggsfs_drop_file_spans(enode);
}
truncate_inode_pages(&inode->i_data, 0);
@@ -266,66 +257,19 @@ static int eggsfs_symlink(struct inode* dir, struct dentry* dentry, const char*
return 0;
}
static void eggsfs_get_link_destructor(void* buf) {
kfree(buf);
}
static const char* eggsfs_get_link(struct dentry* dentry, struct inode* inode, struct delayed_call* destructor) {
eggsfs_debug("ino=%016lx", inode->i_ino);
struct eggsfs_inode* enode = EGGSFS_I(inode);
BUG_ON(eggsfs_inode_type(enode->inode.i_ino) != EGGSFS_INODE_SYMLINK);
// Can't be bothered to think about RCU
if (dentry == NULL) { return ERR_PTR(-ECHILD); }
// size might not be filled in
int err = eggsfs_do_getattr(enode);
if (err) { return ERR_PTR(err); }
struct eggsfs_inode* enode = EGGSFS_I(inode);
char* buf = eggsfs_write_link(enode);
BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity...
size_t size = enode->inode.i_size;
if (IS_ERR(buf)) { return buf; }
eggsfs_debug("size=%lu", size);
struct eggsfs_span* span = eggsfs_get_span(enode, 0);
if (span == NULL) {
eggsfs_debug("got no span, empty file?");
return "";
}
if (IS_ERR(span)) { return ERR_CAST(span); }
BUG_ON(span->end-span->start != size); // again, for simplicity
char* buf = kmalloc(size+1, GFP_KERNEL);
if (buf == NULL) { err = -ENOMEM; goto out_err; }
destructor->fn = eggsfs_get_link_destructor;
destructor->fn = eggsfs_link_destructor;
destructor->arg = buf;
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
memcpy(buf, EGGSFS_INLINE_SPAN(span)->body, size);
} else {
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
struct page* page = eggsfs_get_span_page(block_span, 0);
BUG_ON(page == NULL);
if (IS_ERR(page)) { err = PTR_ERR(page); goto out_err; }
char* page_buf = kmap(page);
memcpy(buf, page_buf, size);
kunmap(page);
}
buf[size] = '\0';
eggsfs_span_put(span, true);
eggsfs_debug("link %*pE", (int)size, buf);
return buf;
out_err:
eggsfs_debug("get_link err=%d", err);
eggsfs_span_put(span, false);
return ERR_PTR(err);
}
static const struct inode_operations eggsfs_dir_inode_ops = {
@@ -389,12 +333,14 @@ struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* pare
inode->i_fop = &eggsfs_file_operations;
enode->file.status = EGGSFS_FILE_STATUS_NONE;
// Init normal file stuff -- that's always there.
// Init normal file stuff -- that's always there. The transient
// file stuff is only filled in if needed.
enode->file.spans = RB_ROOT;
init_rwsem(&enode->file.spans_lock);
atomic64_set(&enode->file.prefetch_section, 0);
atomic_set(&enode->file.prefetches, 0);
init_waitqueue_head(&enode->file.prefetches_wq);
atomic_set(&enode->file.in_flight, 0);
init_waitqueue_head(&enode->file.in_flight_wq);
}
// FIXME
@@ -418,3 +364,12 @@ struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* pare
return inode;
}
void eggsfs_wait_in_flight(struct eggsfs_inode* enode) {
if (atomic_read(&enode->file.in_flight) == 0) { return; }
// Wait for 10 secs, then give up
long res = wait_event_timeout(enode->file.in_flight_wq, atomic_read(&enode->file.in_flight) == 0, 10 * HZ);
if (res > 0) { return; }
eggsfs_warn("waited for 10 seconds for in flight requests for inode %016lx, either some requests are stuck or this is a bug.", enode->inode.i_ino);
}

View File

@@ -69,23 +69,31 @@ struct eggsfs_inode_file {
struct rb_root spans;
struct rw_semaphore spans_lock;
atomic_t prefetches; // how many prefetches are ongoing
wait_queue_head_t prefetches_wq; // waited on by evictor
// low 32 bits: section start. high 32 bits: section end. Rounded up to
// PAGE_SIZE, since we know each stripe is at PAGE_SIZE boundary anyway.
// This means that prefetch will be somewhat broken for files > 16TiB.
// This is what we use in our prefetch heuristic.
atomic64_t prefetch_section;
atomic_t prefetches_in_flight;
wait_queue_head_t prefetches_in_flight_wq;
// Transient file stuff. Only initialized on file creation, otherwise it's garbage.
// On top of sometimes holding references to inodes, we also have this
// mechanism to try to make sure that every in-flight operation is done
// before the file is closed. This should ensure that we don't leak inodes
// after unmounting.
atomic_t in_flight;
wait_queue_head_t in_flight_wq;
// Transient file stuff. Only initialized on file creation (rather than opening),
// otherwise it's garbage.
// Could be factored out to separate data structure since it's completely useless
// when writing.
// when reading.
u64 cookie;
// If we've encountered an error such that we want to stop writing to this file
// forever.
atomic_t transient_err;
// Span we're currently writing to. If NULL we're yet to write anything.
// Span we're currently writing to. Might be NULL.
struct eggsfs_transient_span* writing_span;
// Whether we're currently flushing a span (block write + add span certify)
struct semaphore flushing_span_sema;
@@ -137,7 +145,6 @@ struct eggsfs_inode {
};
#define EGGSFS_I(ptr) container_of(ptr, struct eggsfs_inode, inode)
#define EGGSFS_INODE_NEED_GETATTR 1
#define EGGSFS_INODE_DIRECTORY 1
#define EGGSFS_INODE_FILE 2
@@ -153,16 +160,30 @@ static inline u32 eggsfs_inode_shard(u64 ino) {
struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* parent, u64 ino);
// super ops
struct inode* eggsfs_inode_alloc(struct super_block* sb);
void eggsfs_inode_evict(struct inode* inode);
void eggsfs_inode_free(struct inode* inode);
int __init eggsfs_inode_init(void);
void __cold eggsfs_inode_exit(void);
// inode ops
int eggsfs_do_getattr(struct eggsfs_inode* enode);
// Marks the start of an in-flight operation on `enode`: takes a reference
// on the inode with ihold() and increments `file.in_flight`. Intended to be
// balanced by a later eggsfs_in_flight_end().
static inline void eggsfs_in_flight_begin(struct eggsfs_inode* enode) {
ihold(&enode->inode);
atomic_inc(&enode->file.in_flight);
smp_mb__after_atomic(); // memory barrier following the increment
}
// Marks the end of an in-flight operation on `enode`: decrements
// `file.in_flight`, wakes every waiter on `file.in_flight_wq` when the
// counter reaches zero, and finally drops an inode reference with iput().
static inline void eggsfs_in_flight_end(struct eggsfs_inode* enode) {
smp_mb__before_atomic(); // memory barrier preceding the decrement
int remaining = atomic_dec_return(&enode->file.in_flight);
// Only the call that brings the counter to zero wakes the waiters.
if (remaining == 0) { wake_up_all(&enode->file.in_flight_wq); }
// The inode reference is released only after the counter update and wake-up.
iput(&enode->inode);
}
void eggsfs_wait_in_flight(struct eggsfs_inode* enode);
int __init eggsfs_inode_init(void);
void __cold eggsfs_inode_exit(void);
#endif

View File

@@ -15,6 +15,8 @@
#include "rs.h"
#include "log.h"
#include "span.h"
#include "dir.h"
#include "file.h"
MODULE_LICENSE("GPL");
@@ -51,9 +53,18 @@ static int __init eggsfs_init(void) {
err = eggsfs_rs_init();
if (err) { goto out_rs; }
err = eggsfs_dir_init();
if (err) { goto out_dir; }
err = eggsfs_file_init();
if (err) { goto out_file; }
return 0;
out_file:
eggsfs_dir_exit();
out_dir:
eggsfs_rs_exit();
out_rs:
eggsfs_fs_exit();
out_fs:
@@ -72,17 +83,20 @@ out_sysfs:
}
static void __exit eggsfs_exit(void) {
eggsfs_file_exit();
eggsfs_dir_exit();
eggsfs_fs_exit();
eggsfs_inode_exit();
eggsfs_block_exit();
eggsfs_sysctl_exit();
eggsfs_sysfs_exit();
destroy_workqueue(eggsfs_wq);
// tracepoint_synchronize_unregister() must be called before the end of
// the module exit function to make sure there is no caller left using
// the probe. This, and the fact that preemption is disabled around the
// probe call, make sure that probe removal and module unload are safe.
tracepoint_synchronize_unregister();
destroy_workqueue(eggsfs_wq);
}
module_init(eggsfs_init);

View File

@@ -592,23 +592,32 @@ int eggsfs_shard_add_span_initiate(
// Not worth writing a bincodegen stuff for this variable sized request, of which we have
// only two instance.
int B = eggsfs_blocks(parity);
size_t msg_size =
8 + // FileId
8 + // Cookie
8 + // ByteOffset
4 + // Size
4 + // CRC
1 + // StorageClass
2 + // Blacklist (just length)
1 + // Parity
1 + // Stripes
4 + // CellSize
2 + (4*stripes*B); // Crcs
char* req = kmalloc(EGGSFS_SHARD_HEADER_SIZE + msg_size, GFP_KERNEL);
if (B > EGGSFS_MAX_BLOCKS) {
return -EINVAL;
}
#define MSG_SIZE(__B, __stripes) ( \
8 + /* FileId */ \
8 + /* Cookie */ \
8 + /* ByteOffset */ \
4 + /* Size */ \
4 + /* CRC */ \
1 + /* StorageClass */ \
2 + /* Blacklist (just length) */ \
1 + /* Parity */ \
1 + /* Stripes */ \
4 + /* CellSize */ \
2 + (4*__stripes*__B) /* CRCs */ \
)
size_t msg_size = MSG_SIZE(stripes, B);
char req[EGGSFS_SHARD_HEADER_SIZE + MSG_SIZE(EGGSFS_MAX_STRIPES, EGGSFS_MAX_BLOCKS)];
if (req == NULL) {
return -ENOMEM;
}
#undef MSG_SIZE
{
PREPARE_SHARD_REQ_CTX_INNER(msg_size);
eggsfs_add_span_initiate_req_put_start(&ctx, start);
@@ -629,7 +638,6 @@ int eggsfs_shard_add_span_initiate(
}
eggsfs_add_span_initiate_req_put_end(&ctx, req_crcs, end);
skb = eggsfs_send_shard_req(info, eggsfs_inode_shard(file), req_id, &ctx, &attempts);
kfree(req);
if (IS_ERR(skb)) { return PTR_ERR(skb); }
}
@@ -685,19 +693,24 @@ int eggsfs_shard_add_span_certify(
u64 req_id = alloc_request_id();
u8 kind = EGGSFS_SHARD_ADD_SPAN_CERTIFY;
// Not worth writing a bincodegen stuff for this variable sized request, of which we have
// only two instance.
int B = eggsfs_blocks(parity);
size_t msg_size =
8 + // FileId
8 + // Cookie
8 + // ByteOffset
2 + (8*2*B); // Proofs
char* req = kmalloc(EGGSFS_SHARD_HEADER_SIZE + msg_size, GFP_KERNEL);
if (req == NULL) {
return -ENOMEM;
if (unlikely(B > EGGSFS_MAX_BLOCKS)) {
return -EINVAL;
}
// Not worth writing a bincodegen stuff for this variable sized request, of which we have
// only two instances.
#define MSG_SIZE(__B) ( \
8 + /* FileId */ \
8 + /* Cookie */ \
8 + /* ByteOffset */ \
2 + (8*2*__B) /* Proofs */ \
)
char req[EGGSFS_SHARD_HEADER_SIZE + MSG_SIZE(EGGSFS_MAX_BLOCKS)];
size_t msg_size = MSG_SIZE(B);
#undef MSG_SIZE
{
PREPARE_SHARD_REQ_CTX_INNER(msg_size);
eggsfs_add_span_certify_req_put_start(&ctx, start);
@@ -714,7 +727,6 @@ int eggsfs_shard_add_span_certify(
}
eggsfs_add_span_certify_req_put_end(&ctx, req_proofs, end);
skb = eggsfs_send_shard_req(info, eggsfs_inode_shard(file), req_id, &ctx, &attempts);
kfree(req);
if (IS_ERR(skb)) { return PTR_ERR(skb); }
}

View File

@@ -226,7 +226,7 @@ struct sk_buff* eggsfs_metadata_request(
u64 start_t = get_jiffies_64();
u64 elapsed_ms = 0;
#define LOG_STR "unexpected error: req_id=%llu shard_id=%d kind_str=%s kind=%d addr=%pI4:%d attempts=%d elapsed=%llums"
#define LOG_STR "req_id=%llu shard_id=%d kind_str=%s kind=%d addr=%pI4:%d attempts=%d elapsed=%llums"
#define LOG_ARGS req_id, shard_id, kind_str, kind, &addr->sin_addr, ntohs(addr->sin_port), *attempts, elapsed_ms
#define WARN_LATE if (elapsed_ms > 1000) { eggsfs_warn("late request: " LOG_STR, LOG_ARGS); }

View File

@@ -74,4 +74,7 @@ fi
# Attach
tmux attach-session -t uovo:1
# ./eggs/eggstests -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs
# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs
# sudo sysctl fs.eggsfs.debug=1
# ./eggs/eggstests -verbose -kmod -filter 'mounted' -cfg fsTest.numDirs=1 -cfg fsTest.numFiles=10 -short -binaries-dir $(pwd)/eggs

File diff suppressed because it is too large Load Diff

View File

@@ -18,9 +18,9 @@ extern unsigned long eggsfs_span_cache_max_size_drop;
extern unsigned long eggsfs_span_cache_min_avail_mem_drop;
struct eggsfs_span {
struct eggsfs_inode* enode;
struct rb_node node; // to look them up
struct list_head lru; // to decide what to evict
u64 ino; // for logging and various other non-essential things.
struct rb_node node;
struct list_head lru;
u64 start;
u64 end;
u8 storage_class; // used to determine which type of span this is enclosed in
@@ -47,15 +47,25 @@ struct eggsfs_block_span {
struct eggsfs_span span;
struct xarray pages; // indexed by page number
struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS];
struct eggsfs_latch stripe_latches[15];
struct eggsfs_latch stripe_latches[EGGSFS_MAX_STRIPES];
// This field, like the one below, is protected by the LRU lock.
// If it is NULL, it means that the span has outlived the inode.
// Given the `file->in_flight` field this should really never
// happen, but it's a good sanity check.
//
// Note that ihold'ing this would defeat the purpose (the cached
// spans would prevent inode eviction).
struct eggsfs_inode* enode;
// When refcount = 0, this might still be alive, but in an LRU.
// When refcount < 0, we're currently reclaiming this span.
// Going from 0 to 1, and from 0 to -1 involves taking the respective
// LRU lock. This is not atomic because _all changes to this
// value go through the LRU lock_, since they must be paired with
// manipulating the LRU list.
int refcount;
u32 cell_size;
u32 stripes_crc[15];
// * 0: there are no readers, the span is in the LRU.
// * n > 0: there are n readers, the span is _not_ in the LRU.
// * n < 0: the span is being reclaimed, it is _not_ in the LRU.
s64 readers;
// If anybody's actually read some of this span
bool actually_read;
u32 stripes_crc[EGGSFS_MAX_STRIPES];
bool touched; // wether somebody actually read this span (will determine where it ends up in the LRU)
u8 stripes;
u8 parity;
};
@@ -73,11 +83,25 @@ struct eggsfs_block_span {
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset);
// Makes the span available for reclamation.
void eggsfs_span_put(struct eggsfs_span* span, bool was_read);
void eggsfs_put_span(struct eggsfs_span* span, bool was_read);
// The page_ix is the page number inside the span.
// The page_ix is the page number inside the span. Can't be called with inline
// span.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix);
// Drop all the spans in a specific file. Note that this does not mean that
// the span will be immediately deallocated -- there might be still things
// holding onto them (prefetching requests, mostly).
//
// This function _must_ be called before inode is evicted! Otherwise we'll have
// dangling references to the enode in the spans.
void eggsfs_drop_file_spans(struct eggsfs_inode* enode);
// Drops all cached spans not being currently used. Returns number of
// freed pages.
u64 eggsfs_drop_all_spans(void);
// Callbacks for metadata
void eggsfs_file_spans_cb_span(
void* data, u64 offset, u32 size, u32 crc,
u8 storage_class, u8 parity, u8 stripes, u32 cell_size,
@@ -92,12 +116,6 @@ void eggsfs_file_spans_cb_block(
);
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body);
// To be used when we know that the spans are not being used anymore (i.e. on inode eviction)
void eggsfs_drop_file_spans(struct eggsfs_inode* enode);
// Drops all cached spans. Returns number of freed pages.
u64 eggsfs_drop_all_spans(void);
int __init eggsfs_span_init(void);
void __cold eggsfs_span_exit(void);

View File

@@ -8,7 +8,7 @@
#include "log.h"
#include "inode.h"
#include "metadata.h"
#include "namei.h"
#include "dentry.h"
#include "net.h"
#include "shuckle.h"
#include "bincode.h"

View File

@@ -3,7 +3,7 @@
#include <linux/kobject.h>
#include <linux/fs.h>
#include "namei.h"
#include "dentry.h"
#include "span.h"
static struct kset* eggsfs_kset;

View File

@@ -167,7 +167,7 @@ EGGSFS_TRACE_EVENT_inode_ret(vfs_opendir_exit);
EGGSFS_TRACE_EVENT_inode(vfs_closedir_enter);
EGGSFS_TRACE_EVENT_inode_ret(vfs_closedir_exit);
// namei.c
// dentry.c
EGGSFS_TRACE_EVENT_dir_dentry(vfs_lookup_enter);
EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_lookup_exit);