diff --git a/kmod/bincode.c b/kmod/bincode.c new file mode 100644 index 00000000..d309c5a6 --- /dev/null +++ b/kmod/bincode.c @@ -0,0 +1 @@ +#include "bincodegen.c" \ No newline at end of file diff --git a/kmod/bincode.h b/kmod/bincode.h index ff21b531..51deef94 100644 --- a/kmod/bincode.h +++ b/kmod/bincode.h @@ -150,4 +150,6 @@ static const u8 EGGSFS_FLASH_STORAGE = 3; #define EGGSFS_BLOCK_SERVICE_DONT_READ (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_READ | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED) #define EGGSFS_BLOCK_SERVICE_DONT_WRITE (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_WRITE | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED) +#define EGGSFS_MAX_STRIPES 15 + #endif diff --git a/kmod/bincodegen.c b/kmod/bincodegen.c new file mode 100644 index 00000000..986ff20c --- /dev/null +++ b/kmod/bincodegen.c @@ -0,0 +1,109 @@ +const char* eggsfs_err_str(int err) { + switch (err) { + case 10: return "INTERNAL_ERROR"; + case 11: return "FATAL_ERROR"; + case 12: return "TIMEOUT"; + case 13: return "MALFORMED_REQUEST"; + case 14: return "MALFORMED_RESPONSE"; + case 15: return "NOT_AUTHORISED"; + case 16: return "UNRECOGNIZED_REQUEST"; + case 17: return "FILE_NOT_FOUND"; + case 18: return "DIRECTORY_NOT_FOUND"; + case 19: return "NAME_NOT_FOUND"; + case 20: return "EDGE_NOT_FOUND"; + case 21: return "EDGE_IS_LOCKED"; + case 22: return "TYPE_IS_DIRECTORY"; + case 23: return "TYPE_IS_NOT_DIRECTORY"; + case 24: return "BAD_COOKIE"; + case 25: return "INCONSISTENT_STORAGE_CLASS_PARITY"; + case 26: return "LAST_SPAN_STATE_NOT_CLEAN"; + case 27: return "COULD_NOT_PICK_BLOCK_SERVICES"; + case 28: return "BAD_SPAN_BODY"; + case 29: return "SPAN_NOT_FOUND"; + case 30: return "BLOCK_SERVICE_NOT_FOUND"; + case 31: return "CANNOT_CERTIFY_BLOCKLESS_SPAN"; + case 32: return "BAD_NUMBER_OF_BLOCKS_PROOFS"; + case 33: return "BAD_BLOCK_PROOF"; + case 34: return "CANNOT_OVERRIDE_NAME"; + case 35: return "NAME_IS_LOCKED"; + case 36: return "MTIME_IS_TOO_RECENT"; + case 37: return "MISMATCHING_TARGET"; + case 38: return "MISMATCHING_OWNER"; + case 39: return "MISMATCHING_CREATION_TIME"; + case 40: return "DIRECTORY_NOT_EMPTY"; + case 41: return "FILE_IS_TRANSIENT"; + case 42: return "OLD_DIRECTORY_NOT_FOUND"; + case 43: return "NEW_DIRECTORY_NOT_FOUND"; + case 44: return "LOOP_IN_DIRECTORY_RENAME"; + case 45: return "DIRECTORY_HAS_OWNER"; + case 46: return "FILE_IS_NOT_TRANSIENT"; + case 47: return "FILE_NOT_EMPTY"; + case 48: return "CANNOT_REMOVE_ROOT_DIRECTORY"; + case 49: return "FILE_EMPTY"; + case 50: return "CANNOT_REMOVE_DIRTY_SPAN"; + case 51: return "BAD_SHARD"; + case 52: return "BAD_NAME"; + case 53: return "MORE_RECENT_SNAPSHOT_EDGE"; + case 54: return "MORE_RECENT_CURRENT_EDGE"; + case 55: return "BAD_DIRECTORY_INFO"; + case 56: return "DEADLINE_NOT_PASSED"; + case 57: return "SAME_SOURCE_AND_DESTINATION"; + case 58: return "SAME_DIRECTORIES"; + case 59: return "SAME_SHARD"; + case 60: return "BAD_PROTOCOL_VERSION"; + case 61: return "BAD_CERTIFICATE"; + case 62: return "BLOCK_TOO_RECENT_FOR_DELETION"; + case 63: return "BLOCK_FETCH_OUT_OF_BOUNDS"; + case 64: return "BAD_BLOCK_CRC"; + case 65: return "BLOCK_TOO_BIG"; + case 66: return "BLOCK_NOT_FOUND"; + default: return "UNKNOWN"; + } +} + +const char* eggsfs_shard_kind_str(int kind) { + switch (kind) { + case 1: return "LOOKUP"; + case 2: return "STAT_FILE"; + case 4: return "STAT_DIRECTORY"; + case 5: return "READ_DIR"; + case 6: return "CONSTRUCT_FILE"; + case 7: return "ADD_SPAN_INITIATE"; + case 8: return "ADD_SPAN_CERTIFY"; + case 9: 
return "LINK_FILE"; + case 10: return "SOFT_UNLINK_FILE"; + case 11: return "FILE_SPANS"; + case 12: return "SAME_DIRECTORY_RENAME"; + case 16: return "ADD_INLINE_SPAN"; + case 14: return "SNAPSHOT_LOOKUP"; + default: return "UNKNOWN"; + } +} + +const char* eggsfs_cdc_kind_str(int kind) { + switch (kind) { + case 1: return "MAKE_DIRECTORY"; + case 2: return "RENAME_FILE"; + case 3: return "SOFT_UNLINK_DIRECTORY"; + case 4: return "RENAME_DIRECTORY"; + default: return "UNKNOWN"; + } +} + +const char* eggsfs_shuckle_kind_str(int kind) { + switch (kind) { + case 3: return "SHARDS"; + case 7: return "CDC"; + case 8: return "INFO"; + default: return "UNKNOWN"; + } +} + +const char* eggsfs_blocks_kind_str(int kind) { + switch (kind) { + case 2: return "FETCH_BLOCK"; + case 3: return "WRITE_BLOCK"; + default: return "UNKNOWN"; + } +} + diff --git a/kmod/namei.c b/kmod/dentry.c similarity index 99% rename from kmod/namei.c rename to kmod/dentry.c index 60a79060..791e966d 100644 --- a/kmod/namei.c +++ b/kmod/dentry.c @@ -1,4 +1,4 @@ -#include "namei.h" +#include "dentry.h" #include "log.h" #include "dir.h" diff --git a/kmod/namei.h b/kmod/dentry.h similarity index 92% rename from kmod/namei.h rename to kmod/dentry.h index 80b19256..b66df0cc 100644 --- a/kmod/namei.h +++ b/kmod/dentry.h @@ -1,5 +1,5 @@ -#ifndef _EGGSFS_NAMEI_H -#define _EGGSFS_NAMEI_H +#ifndef _EGGSFS_DENTRY_H +#define _EGGSFS_DENTRY_H #include @@ -7,6 +7,8 @@ EGGSFS_DECLARE_COUNTER(eggsfs_stat_dir_revalidations); +extern struct dentry_operations eggsfs_dentry_ops; + struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags); int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode); int eggsfs_rmdir(struct inode* dir, struct dentry* dentry); @@ -17,6 +19,4 @@ int eggsfs_rename( unsigned int flags ); -extern struct dentry_operations eggsfs_dentry_ops; - #endif diff --git a/kmod/dir.c b/kmod/dir.c index 88ac8ea2..101caf48 100644 --- a/kmod/dir.c +++ b/kmod/dir.c @@ -15,6 +15,8 @@ int eggsfs_dir_refresh_time; // in jiffies #define eggsfs_dir_get_page_n(_page) ({ *((((u32*)&(_page)->private))+1); }) #define eggsfs_dir_set_page_n(_page, _n) ({ *((((u32*)&(_page)->private))+1) = _n; }) +static struct kmem_cache* eggsfs_readdir_ctx_cachep; + static inline int eggsfs_dir_entry_size(int name_len) { return 8 + 1 + name_len; // inode + len + name } @@ -94,7 +96,7 @@ static int eggsfs_dir_open(struct inode* inode, struct file* filp) { if (err) { return err; } } - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmem_cache_alloc(eggsfs_readdir_ctx_cachep, GFP_KERNEL); if (!ctx) { return -ENOMEM; } again: // progress: whoever wins the lock either get pages or fails @@ -153,7 +155,7 @@ again: // progress: whoever wins the lock either get pages or fails return 0; out_ctx: - kfree(ctx); + kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx); trace_eggsfs_vfs_opendir_exit(inode, err); eggsfs_debug("err=%d", err); return err; @@ -430,7 +432,7 @@ static int eggsfs_dir_close(struct inode* inode, struct file* filp) { trace_eggsfs_vfs_closedir_enter(inode); eggsfs_dir_put_cache(ctx->pages); - kfree(ctx); + kmem_cache_free(eggsfs_readdir_ctx_cachep, ctx); trace_eggsfs_vfs_closedir_exit(inode, 0); return 0; } @@ -441,3 +443,19 @@ struct file_operations eggsfs_dir_operations = { .iterate_shared = eggsfs_dir_read, .release = eggsfs_dir_close, }; + +int __init eggsfs_dir_init(void) { + eggsfs_readdir_ctx_cachep = kmem_cache_create( + "eggsfs_readdir_ctx_cache", + sizeof(struct eggsfs_readdir_ctx), + 0, + 
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL + ); + if (!eggsfs_readdir_ctx_cachep) { return -ENOMEM; } + return 0; +} + +void __cold eggsfs_dir_exit(void) { + kmem_cache_destroy(eggsfs_readdir_ctx_cachep); +} diff --git a/kmod/dir.h b/kmod/dir.h index e848215a..045b50c0 100644 --- a/kmod/dir.h +++ b/kmod/dir.h @@ -37,5 +37,8 @@ static inline int eggsfs_dir_needs_reval(struct eggsfs_inode* dir, struct dentry return ret; } +int __init eggsfs_dir_init(void); +void __cold eggsfs_dir_exit(void); + #endif diff --git a/kmod/err.c b/kmod/err.c index 1ce67f87..a2091f42 100644 --- a/kmod/err.c +++ b/kmod/err.c @@ -1,5 +1,6 @@ #include "err.h" #include "log.h" +#include "bincode.h" bool eggsfs_unexpected_error(int err) { switch (err) { @@ -14,7 +15,7 @@ bool eggsfs_unexpected_error(int err) { case EGGSFS_ERR_FILE_NOT_FOUND: return false; case EGGSFS_ERR_DIRECTORY_NOT_FOUND: return false; case EGGSFS_ERR_NAME_NOT_FOUND: return false; - case EGGSFS_ERR_EDGE_NOT_FOUND: return true; + case EGGSFS_ERR_EDGE_NOT_FOUND: return false; case EGGSFS_ERR_EDGE_IS_LOCKED: return true; case EGGSFS_ERR_TYPE_IS_DIRECTORY: return true; case EGGSFS_ERR_TYPE_IS_NOT_DIRECTORY: return true; @@ -33,7 +34,7 @@ bool eggsfs_unexpected_error(int err) { case EGGSFS_ERR_MTIME_IS_TOO_RECENT: return true; case EGGSFS_ERR_MISMATCHING_TARGET: return true; case EGGSFS_ERR_MISMATCHING_OWNER: return true; - case EGGSFS_ERR_MISMATCHING_CREATION_TIME: return true; + case EGGSFS_ERR_MISMATCHING_CREATION_TIME: return false; case EGGSFS_ERR_DIRECTORY_NOT_EMPTY: return true; case EGGSFS_ERR_FILE_IS_TRANSIENT: return true; case EGGSFS_ERR_OLD_DIRECTORY_NOT_FOUND: return true; @@ -62,6 +63,7 @@ bool eggsfs_unexpected_error(int err) { case EGGSFS_ERR_BLOCK_TOO_BIG: return true; case EGGSFS_ERR_BLOCK_NOT_FOUND: return true; case -ERESTARTSYS: return false; + case -ETIMEDOUT: return false; default: return true; } } @@ -70,7 +72,7 @@ bool eggsfs_unexpected_error(int err) { int eggsfs_error_to_linux(int err) { bool unexpected = eggsfs_unexpected_error(err); if (unexpected) { - eggsfs_warn("unexpected eggsfs error %d", err); + eggsfs_warn("unexpected eggsfs error %s (%d)", eggsfs_err_str(err), err); } WARN_ON(unexpected); if (err == 0) { return 0; } diff --git a/kmod/file.c b/kmod/file.c index a320b309..d00010ea 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -15,6 +15,9 @@ #include "trace.h" #include "span.h" #include "wq.h" +#include "bincode.h" + +static struct kmem_cache* eggsfs_transient_span_cachep; struct eggsfs_transient_span { // The transient span holds a reference to this. 
As long as the @@ -65,20 +68,26 @@ static int eggsfs_file_open(struct inode* inode, struct file* filp) { return 0; } -// starts out with refcount = 1 -static struct eggsfs_transient_span* eggsfs_new_transient_span(struct eggsfs_inode* enode, u64 offset) { - struct eggsfs_transient_span* span = kzalloc(sizeof(struct eggsfs_transient_span), GFP_KERNEL); - if (span == NULL) { return span; } - span->enode = enode; +static void eggsfs_init_transient_span(void* p) { + struct eggsfs_transient_span* span = (struct eggsfs_transient_span*)p; INIT_LIST_HEAD(&span->pages); int i; for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) { INIT_LIST_HEAD(&span->blocks[i]); } - ihold(&enode->inode); spin_lock_init(&span->blocks_proofs_lock); +} + +// starts out with refcount = 1 +static struct eggsfs_transient_span* eggsfs_new_transient_span(struct eggsfs_inode* enode, u64 offset) { + struct eggsfs_transient_span* span = kmem_cache_alloc(eggsfs_transient_span_cachep, GFP_KERNEL); + if (span == NULL) { return span; } + span->enode = enode; + eggsfs_in_flight_begin(enode); atomic_set(&span->refcount, 1); span->offset = offset; + span->written = 0; + memset(span->blocks_proofs, 0, sizeof(span->blocks_proofs)); return span; } @@ -88,6 +97,7 @@ static void eggsfs_hold_transient_span(struct eggsfs_transient_span* span) { static void eggsfs_put_transient_span(struct eggsfs_transient_span* span) { if (atomic_dec_return(&span->refcount) == 0) { + BUG_ON(spin_is_locked(&span->blocks_proofs_lock)); // free pages, adjust OOM score int num_pages = 0; #define FREE_PAGES(__pages) \ @@ -104,9 +114,9 @@ static void eggsfs_put_transient_span(struct eggsfs_transient_span* span) { } #undef FREE_PAGES add_mm_counter(span->enode->file.mm, MM_FILEPAGES, -num_pages); - iput(&span->enode->inode); + eggsfs_in_flight_end(span->enode); // Free the span itself - kfree(span); + kmem_cache_free(eggsfs_transient_span_cachep, span); } } @@ -348,8 +358,6 @@ static int eggsfs_start_flushing(struct eggsfs_inode* enode, bool non_blocking) // Here we might block even if we're non blocking, since we don't have // async metadata requests yet, which is a bit unfortunate. 
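/*
 * A minimal sketch of the kmem_cache constructor/alloc split used for the
 * transient-span cache above: the constructor runs only when a slab object is
 * first created (recycled objects skip it), so invariant state such as list
 * heads and spinlocks belongs there, while per-use fields must be reset on
 * every allocation, as eggsfs_new_transient_span() does with `written` and
 * `blocks_proofs`. The demo_* names below are illustrative, not the module's.
 */
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_obj {
    struct list_head pages; /* invariant: initialised once by the constructor */
    spinlock_t lock;        /* invariant: initialised once by the constructor */
    u64 written;            /* per-use: must be reset on every allocation */
};

/* created elsewhere with kmem_cache_create("demo", sizeof(struct demo_obj), 0, 0, demo_ctor) */
static struct kmem_cache* demo_cachep;

static void demo_ctor(void* p) {
    struct demo_obj* o = p;
    INIT_LIST_HEAD(&o->pages);
    spin_lock_init(&o->lock);
}

static struct demo_obj* demo_alloc(void) {
    struct demo_obj* o = kmem_cache_alloc(demo_cachep, GFP_KERNEL);
    if (!o) { return NULL; }
    o->written = 0; /* the constructor never touches per-use state */
    return o;
}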
- u32* cell_crcs = NULL; - if (span->storage_class == EGGSFS_EMPTY_STORAGE) { up(&enode->file.flushing_span_sema); // nothing to do } else if (span->storage_class == EGGSFS_INLINE_STORAGE) { @@ -370,8 +378,8 @@ static int eggsfs_start_flushing(struct eggsfs_inode* enode, bool non_blocking) int D = eggsfs_data_blocks(span->parity); int B = eggsfs_blocks(span->parity); int S = span->stripes; - cell_crcs = kzalloc(sizeof(uint32_t)*B*span->stripes, GFP_KERNEL); - if (cell_crcs == NULL) { err = -ENOMEM; goto out_err_keep_writing; } + uint32_t cell_crcs[EGGSFS_MAX_BLOCKS*EGGSFS_MAX_STRIPES]; + memset(cell_crcs, 0, sizeof(cell_crcs)); eggsfs_debug("size=%d, D=%d, B=%d, S=%d", span->written, D, B, S); int i, j; @@ -486,7 +494,6 @@ out: // we don't need this anymore (the requests might though) eggsfs_put_transient_span(span); enode->file.writing_span = NULL; - if (cell_crcs) { kfree(cell_crcs); } return err; out_err_fpu: @@ -496,7 +503,6 @@ out_err_fpu: // When we got an error before even starting to flush out_err_keep_writing: up(&enode->file.flushing_span_sema); // nothing is flushing - if (cell_crcs) { kfree(cell_crcs); } return err; } @@ -607,22 +613,20 @@ static ssize_t eggsfs_file_write_iter(struct kiocb* iocb, struct iov_iter* from) int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry) { inode_lock(&enode->inode); + int err = 0; + // Not writing, there's nothing to do, there's nothing to do, files are immutable if (enode->file.status != EGGSFS_FILE_STATUS_WRITING) { eggsfs_debug("status=%d, won't flush", enode->file.status); - inode_unlock(&enode->inode); - return 0; + goto out_early; } // We are in another process, skip if (enode->file.owner != current->group_leader) { eggsfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader); - inode_unlock(&enode->inode); - return 0; + goto out_early; } - int err = 0; - // If the owner doesn't have an mm anymore, it means that the file // is being torn down after the process has been terminated. In that case // we shouldn't even link the file. @@ -668,7 +672,9 @@ out: } enode->file.writing_span = NULL; } +out_early: inode_unlock(&enode->inode); + eggsfs_wait_in_flight(enode); return err; } @@ -690,7 +696,7 @@ static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) { eggsfs_debug("start of read loop, *ppos=%llu", *ppos); struct eggsfs_span* span = NULL; while (*ppos < inode->i_size && iov_iter_count(to)) { - if (span) { eggsfs_span_put(span, true); } + if (span) { eggsfs_put_span(span, true); } span = eggsfs_get_span(enode, *ppos); if (IS_ERR(span)) { written = PTR_ERR(span); @@ -759,7 +765,7 @@ static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) { eggsfs_debug("before loop end, remaining %lu", iov_iter_count(to)); } out: - if (span) { eggsfs_span_put(span, true); } + if (span) { eggsfs_put_span(span, true); } eggsfs_debug("out of the loop, written=%ld", written); if (unlikely(written < 0)) { eggsfs_debug("reading failed, err=%ld", written); @@ -767,6 +773,60 @@ out: return written; } +void eggsfs_link_destructor(void* buf) { + kfree(buf); +} + +char* eggsfs_write_link(struct eggsfs_inode* enode) { + eggsfs_debug("ino=%016lx", enode->inode.i_ino); + + BUG_ON(eggsfs_inode_type(enode->inode.i_ino) != EGGSFS_INODE_SYMLINK); + + // size might not be filled in + int err = eggsfs_do_getattr(enode); + if (err) { return ERR_PTR(err); } + + BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity... 
+ size_t size = enode->inode.i_size; + + eggsfs_debug("size=%lu", size); + + struct eggsfs_span* span = eggsfs_get_span(enode, 0); + if (span == NULL) { + eggsfs_debug("got no span, empty file?"); + return ""; + } + if (IS_ERR(span)) { return ERR_CAST(span); } + + BUG_ON(span->end-span->start != size); // again, for simplicity + + char* buf = kmalloc(size+1, GFP_KERNEL); + if (buf == NULL) { err = -ENOMEM; goto out_err; } + + if (span->storage_class == EGGSFS_INLINE_STORAGE) { + memcpy(buf, EGGSFS_INLINE_SPAN(span)->body, size); + } else { + struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); + struct page* page = eggsfs_get_span_page(block_span, 0); + BUG_ON(page == NULL); + if (IS_ERR(page)) { err = PTR_ERR(page); goto out_err; } + char* page_buf = kmap(page); + memcpy(buf, page_buf, size); + kunmap(page); + } + buf[size] = '\0'; + eggsfs_put_span(span, true); + + eggsfs_debug("link %*pE", (int)size, buf); + + return buf; + +out_err: + eggsfs_debug("get_link err=%d", err); + eggsfs_put_span(span, false); + return ERR_PTR(err); +} + const struct file_operations eggsfs_file_operations = { .open = eggsfs_file_open, .read_iter = eggsfs_file_read_iter, @@ -774,3 +834,20 @@ const struct file_operations eggsfs_file_operations = { .flush = eggsfs_file_flush_internal, .llseek = generic_file_llseek, }; + +int __init eggsfs_file_init(void) { + eggsfs_transient_span_cachep = kmem_cache_create( + "eggsfs_transient_span_cache", + sizeof(struct eggsfs_transient_span), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + &eggsfs_init_transient_span + ); + if (!eggsfs_transient_span_cachep) { return -ENOMEM; } + return 0; +} + +void __cold eggsfs_file_exit(void) { + // TODO: handle case where there still are requests in flight. + kmem_cache_destroy(eggsfs_transient_span_cachep); +} diff --git a/kmod/file.h b/kmod/file.h index 87489187..2acb09da 100644 --- a/kmod/file.h +++ b/kmod/file.h @@ -1,15 +1,13 @@ -#ifndef _EGGSFS_FILESIMPLE_H -#define _EGGSFS_FILESIMPLE_H - -#include -#include +#ifndef _EGGSFS_FILE_H +#define _EGGSFS_FILE_H #include "inode.h" -// Must be called with `spans_lock` taken. 
-struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans); +ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from); +int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry); -void eggsfs_flush_transient_spans(struct work_struct* work); +void eggsfs_link_destructor(void*); +char* eggsfs_write_link(struct eggsfs_inode* enode); int eggsfs_shard_add_span_initiate_block_cb( void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate @@ -17,7 +15,7 @@ int eggsfs_shard_add_span_initiate_block_cb( extern const struct file_operations eggsfs_file_operations; -ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from); -int eggsfs_file_flush(struct eggsfs_inode* enode, struct dentry* dentry); +int __init eggsfs_file_init(void); +void __cold eggsfs_file_exit(void); #endif diff --git a/kmod/inode.c b/kmod/inode.c index a040fc30..6ed465a1 100644 --- a/kmod/inode.c +++ b/kmod/inode.c @@ -5,7 +5,7 @@ #include "log.h" #include "dir.h" #include "metadata.h" -#include "namei.h" +#include "dentry.h" #include "trace.h" #include "err.h" #include "file.h" @@ -39,15 +39,6 @@ void eggsfs_inode_evict(struct inode* inode) { if (S_ISDIR(inode->i_mode)) { eggsfs_dir_drop_cache(enode); } else if (S_ISREG(inode->i_mode)) { - // Make sure no prefetching is taking place (otherwise the - // dropping of spans below would fail). - int prefetches = atomic_read(&enode->file.prefetches); - if (prefetches) { - wait_event(enode->file.prefetches_wq, atomic_read(&enode->file.prefetches) == 0); - } - // TODO verify that there are no flushing spans -- it should always be - // the case given that we get references to the inode - // Free span cache eggsfs_drop_file_spans(enode); } truncate_inode_pages(&inode->i_data, 0); @@ -266,66 +257,19 @@ static int eggsfs_symlink(struct inode* dir, struct dentry* dentry, const char* return 0; } -static void eggsfs_get_link_destructor(void* buf) { - kfree(buf); -} - static const char* eggsfs_get_link(struct dentry* dentry, struct inode* inode, struct delayed_call* destructor) { - eggsfs_debug("ino=%016lx", inode->i_ino); - - struct eggsfs_inode* enode = EGGSFS_I(inode); - - BUG_ON(eggsfs_inode_type(enode->inode.i_ino) != EGGSFS_INODE_SYMLINK); - // Can't be bothered to think about RCU if (dentry == NULL) { return ERR_PTR(-ECHILD); } - // size might not be filled in - int err = eggsfs_do_getattr(enode); - if (err) { return ERR_PTR(err); } + struct eggsfs_inode* enode = EGGSFS_I(inode); + char* buf = eggsfs_write_link(enode); - BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity... 
- size_t size = enode->inode.i_size; + if (IS_ERR(buf)) { return buf; } - eggsfs_debug("size=%lu", size); - - struct eggsfs_span* span = eggsfs_get_span(enode, 0); - if (span == NULL) { - eggsfs_debug("got no span, empty file?"); - return ""; - } - if (IS_ERR(span)) { return ERR_CAST(span); } - - BUG_ON(span->end-span->start != size); // again, for simplicity - - char* buf = kmalloc(size+1, GFP_KERNEL); - if (buf == NULL) { err = -ENOMEM; goto out_err; } - - destructor->fn = eggsfs_get_link_destructor; + destructor->fn = eggsfs_link_destructor; destructor->arg = buf; - if (span->storage_class == EGGSFS_INLINE_STORAGE) { - memcpy(buf, EGGSFS_INLINE_SPAN(span)->body, size); - } else { - struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - struct page* page = eggsfs_get_span_page(block_span, 0); - BUG_ON(page == NULL); - if (IS_ERR(page)) { err = PTR_ERR(page); goto out_err; } - char* page_buf = kmap(page); - memcpy(buf, page_buf, size); - kunmap(page); - } - buf[size] = '\0'; - eggsfs_span_put(span, true); - - eggsfs_debug("link %*pE", (int)size, buf); - return buf; - -out_err: - eggsfs_debug("get_link err=%d", err); - eggsfs_span_put(span, false); - return ERR_PTR(err); } static const struct inode_operations eggsfs_dir_inode_ops = { @@ -389,12 +333,14 @@ struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* pare inode->i_fop = &eggsfs_file_operations; enode->file.status = EGGSFS_FILE_STATUS_NONE; - // Init normal file stuff -- that's always there. + + // Init normal file stuff -- that's always there. The transient + // file stuff is only filled in if needed. enode->file.spans = RB_ROOT; init_rwsem(&enode->file.spans_lock); atomic64_set(&enode->file.prefetch_section, 0); - atomic_set(&enode->file.prefetches, 0); - init_waitqueue_head(&enode->file.prefetches_wq); + atomic_set(&enode->file.in_flight, 0); + init_waitqueue_head(&enode->file.in_flight_wq); } // FIXME @@ -418,3 +364,12 @@ struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* pare return inode; } +void eggsfs_wait_in_flight(struct eggsfs_inode* enode) { + if (atomic_read(&enode->file.in_flight) == 0) { return; } + + // Wait for 10 secs, then give up + long res = wait_event_timeout(enode->file.in_flight_wq, atomic_read(&enode->file.in_flight) == 0, 10 * HZ); + if (res > 0) { return; } + + eggsfs_warn("waited for 10 seconds for in flight requests for inode %016lx, either some requests are stuck or this is a bug.", enode->inode.i_ino); +} diff --git a/kmod/inode.h b/kmod/inode.h index b07e7321..003447aa 100644 --- a/kmod/inode.h +++ b/kmod/inode.h @@ -69,23 +69,31 @@ struct eggsfs_inode_file { struct rb_root spans; struct rw_semaphore spans_lock; - atomic_t prefetches; // how many prefetches are ongoing - wait_queue_head_t prefetches_wq; // waited on by evictor // low 32 bits: section start. high 32 bits: section end. Rouded up to // PAGE_SIZE, since we know each stripe is at PAGE_SIZE boundary anyway. // This means that prefetch with be somewhat broken for files > 16TiB. // This is what we use in our prefetch heuristic. atomic64_t prefetch_section; + atomic_t prefetches_in_flight; + wait_queue_head_t prefetches_in_flight_wq; - // Transient file stuff. Only initialized on file creation, otherwise it's garbage. + // On top of sometimes holding references to inodes, we also have this + // mechanism to try to make sure that every in-flight operation is done + // before the file is closed. This should ensure that we don't leak inodes + // after unmounting. 
+ atomic_t in_flight; + wait_queue_head_t in_flight_wq; + + // Transient file stuff. Only initialized on file creation (rather than opening), + // otherwise it's garbage. // Could be factored out to separate data structure since it's completely useless - // when writing. + // when reading. u64 cookie; // If we've encountered an error such that we want to stop writing to this file // forever. atomic_t transient_err; - // Span we're currently writing to. If NULL we're yet to write anything. + // Span we're currently writing to. Might be NULL. struct eggsfs_transient_span* writing_span; // Whether we're currently flushing a span (block write + add span certify) struct semaphore flushing_span_sema; @@ -137,7 +145,6 @@ struct eggsfs_inode { }; #define EGGSFS_I(ptr) container_of(ptr, struct eggsfs_inode, inode) -#define EGGSFS_INODE_NEED_GETATTR 1 #define EGGSFS_INODE_DIRECTORY 1 #define EGGSFS_INODE_FILE 2 @@ -153,16 +160,30 @@ static inline u32 eggsfs_inode_shard(u64 ino) { struct inode* eggsfs_get_inode(struct super_block* sb, struct eggsfs_inode* parent, u64 ino); - // super ops struct inode* eggsfs_inode_alloc(struct super_block* sb); void eggsfs_inode_evict(struct inode* inode); void eggsfs_inode_free(struct inode* inode); -int __init eggsfs_inode_init(void); -void __cold eggsfs_inode_exit(void); - // inode ops int eggsfs_do_getattr(struct eggsfs_inode* enode); +static inline void eggsfs_in_flight_begin(struct eggsfs_inode* enode) { + ihold(&enode->inode); + atomic_inc(&enode->file.in_flight); + smp_mb__after_atomic(); +} + +static inline void eggsfs_in_flight_end(struct eggsfs_inode* enode) { + smp_mb__before_atomic(); + int remaining = atomic_dec_return(&enode->file.in_flight); + if (remaining == 0) { wake_up_all(&enode->file.in_flight_wq); } + iput(&enode->inode); +} + +void eggsfs_wait_in_flight(struct eggsfs_inode* enode); + +int __init eggsfs_inode_init(void); +void __cold eggsfs_inode_exit(void); + #endif diff --git a/kmod/kmod.c b/kmod/kmod.c index 888a775d..8e135f2d 100644 --- a/kmod/kmod.c +++ b/kmod/kmod.c @@ -15,6 +15,8 @@ #include "rs.h" #include "log.h" #include "span.h" +#include "dir.h" +#include "file.h" MODULE_LICENSE("GPL"); @@ -51,9 +53,18 @@ static int __init eggsfs_init(void) { err = eggsfs_rs_init(); if (err) { goto out_rs; } + err = eggsfs_dir_init(); + if (err) { goto out_dir; } + + err = eggsfs_file_init(); + if (err) { goto out_file; } return 0; +out_file: + eggsfs_dir_exit(); +out_dir: + eggsfs_rs_exit(); out_rs: eggsfs_fs_exit(); out_fs: @@ -72,17 +83,20 @@ out_sysfs: } static void __exit eggsfs_exit(void) { + eggsfs_file_exit(); + eggsfs_dir_exit(); eggsfs_fs_exit(); eggsfs_inode_exit(); eggsfs_block_exit(); eggsfs_sysctl_exit(); eggsfs_sysfs_exit(); - destroy_workqueue(eggsfs_wq); // tracepoint_synchronize_unregister() must be called before the end of // the module exit function to make sure there is no caller left using // the probe. This, and the fact that preemption is disabled around the // probe call, make sure that probe removal and module unload are safe. tracepoint_synchronize_unregister(); + + destroy_workqueue(eggsfs_wq); } module_init(eggsfs_init); diff --git a/kmod/metadata.c b/kmod/metadata.c index 5e527483..b9c32eee 100644 --- a/kmod/metadata.c +++ b/kmod/metadata.c @@ -592,23 +592,32 @@ int eggsfs_shard_add_span_initiate( // Not worth writing a bincodegen stuff for this variable sized request, of which we have // only two instance. 
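/*
 * A minimal usage sketch of the in-flight accounting added in inode.h above,
 * with a hypothetical async helper (demo_queue_request, declared here only for
 * illustration): the submitter pairs eggsfs_in_flight_begin()/end() around the
 * operation so eggsfs_wait_in_flight() can drain outstanding work before the
 * inode is torn down; on a submission failure the end call balances the begin
 * immediately.
 */
static int demo_queue_request(struct eggsfs_inode* enode); /* hypothetical async submission */

static int demo_submit(struct eggsfs_inode* enode) {
    eggsfs_in_flight_begin(enode); /* ihold + in_flight++ */
    int err = demo_queue_request(enode);
    if (err) {
        eggsfs_in_flight_end(enode); /* in_flight--, wake waiters, iput */
        return err;
    }
    /* on success the completion callback calls eggsfs_in_flight_end(enode) */
    return 0;
}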
int B = eggsfs_blocks(parity); - size_t msg_size = - 8 + // FileId - 8 + // Cookie - 8 + // ByteOffset - 4 + // Size - 4 + // CRC - 1 + // StorageClass - 2 + // Blacklist (just length) - 1 + // Parity - 1 + // Stripes - 4 + // CellSize - 2 + (4*stripes*B); // Crcs - char* req = kmalloc(EGGSFS_SHARD_HEADER_SIZE + msg_size, GFP_KERNEL); + if (B > EGGSFS_MAX_BLOCKS) { + return -EINVAL; + } + +#define MSG_SIZE(__B, __stripes) ( \ + 8 + /* FileId */ \ + 8 + /* Cookie */ \ + 8 + /* ByteOffset */ \ + 4 + /* Size */ \ + 4 + /* CRC */ \ + 1 + /* StorageClass */ \ + 2 + /* Blacklist (just length) */ \ + 1 + /* Parity */ \ + 1 + /* Stripes */ \ + 4 + /* CellSize */ \ + 2 + (4*__stripes*__B) /* CRCs */ \ + ) + + size_t msg_size = MSG_SIZE(stripes, B); + char req[EGGSFS_SHARD_HEADER_SIZE + MSG_SIZE(EGGSFS_MAX_STRIPES, EGGSFS_MAX_BLOCKS)]; if (req == NULL) { return -ENOMEM; } +#undef MSG_SIZE + { PREPARE_SHARD_REQ_CTX_INNER(msg_size); eggsfs_add_span_initiate_req_put_start(&ctx, start); @@ -629,7 +638,6 @@ int eggsfs_shard_add_span_initiate( } eggsfs_add_span_initiate_req_put_end(&ctx, req_crcs, end); skb = eggsfs_send_shard_req(info, eggsfs_inode_shard(file), req_id, &ctx, &attempts); - kfree(req); if (IS_ERR(skb)) { return PTR_ERR(skb); } } @@ -685,19 +693,24 @@ int eggsfs_shard_add_span_certify( u64 req_id = alloc_request_id(); u8 kind = EGGSFS_SHARD_ADD_SPAN_CERTIFY; - // Not worth writing a bincodegen stuff for this variable sized request, of which we have - // only two instance. int B = eggsfs_blocks(parity); - size_t msg_size = - 8 + // FileId - 8 + // Cookie - 8 + // ByteOffset - 2 + (8*2*B); // Proofs - char* req = kmalloc(EGGSFS_SHARD_HEADER_SIZE + msg_size, GFP_KERNEL); - if (req == NULL) { - return -ENOMEM; + if (unlikely(B > EGGSFS_MAX_BLOCKS)) { + return -EINVAL; } + // Not worth writing a bincodegen stuff for this variable sized request, of which we have + // only two instances. 
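/*
 * Worked bound for the on-stack add-span-initiate buffer above, assuming
 * EGGSFS_MAX_BLOCKS is 16 (the static_assert later in this patch only
 * guarantees it is <= 16) and EGGSFS_MAX_STRIPES is 15:
 *
 *   fixed fields: 8+8+8+4+4+1+2+1+1+4+2 =   43 bytes
 *   CRC array:    4 * 15 * 16           =  960 bytes
 *   total                               = 1003 bytes + EGGSFS_SHARD_HEADER_SIZE
 *
 * i.e. roughly 1 KiB, comfortably within the kernel's (typically 16 KiB) stack
 * budget, which is what makes replacing the kmalloc with a stack array reasonable.
 */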
+ +#define MSG_SIZE(__B) ( \ + 8 + /* FileId */ \ + 8 + /* Cookie */ \ + 8 + /* ByteOffset */ \ + 2 + (8*2*__B) /* Proofs */ \ + ) + char req[EGGSFS_SHARD_HEADER_SIZE + MSG_SIZE(EGGSFS_MAX_BLOCKS)]; + size_t msg_size = MSG_SIZE(B); +#undef MSG_SIZE + { PREPARE_SHARD_REQ_CTX_INNER(msg_size); eggsfs_add_span_certify_req_put_start(&ctx, start); @@ -714,7 +727,6 @@ int eggsfs_shard_add_span_certify( } eggsfs_add_span_certify_req_put_end(&ctx, req_proofs, end); skb = eggsfs_send_shard_req(info, eggsfs_inode_shard(file), req_id, &ctx, &attempts); - kfree(req); if (IS_ERR(skb)) { return PTR_ERR(skb); } } diff --git a/kmod/net.c b/kmod/net.c index a965f5a9..90b36884 100644 --- a/kmod/net.c +++ b/kmod/net.c @@ -226,7 +226,7 @@ struct sk_buff* eggsfs_metadata_request( u64 start_t = get_jiffies_64(); u64 elapsed_ms = 0; -#define LOG_STR "unexpected error: req_id=%llu shard_id=%d kind_str=%s kind=%d addr=%pI4:%d attempts=%d elapsed=%llums" +#define LOG_STR "req_id=%llu shard_id=%d kind_str=%s kind=%d addr=%pI4:%d attempts=%d elapsed=%llums" #define LOG_ARGS req_id, shard_id, kind_str, kind, &addr->sin_addr, ntohs(addr->sin_port), *attempts, elapsed_ms #define WARN_LATE if (elapsed_ms > 1000) { eggsfs_warn("late request: " LOG_STR, LOG_ARGS); } diff --git a/kmod/restartsession.sh b/kmod/restartsession.sh index e35309c0..1b3b787c 100755 --- a/kmod/restartsession.sh +++ b/kmod/restartsession.sh @@ -74,4 +74,7 @@ fi # Attach tmux attach-session -t uovo:1 -# ./eggs/eggstests -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs +# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs + +# sudo sysctl fs.eggsfs.debug=1 +# ./eggs/eggstests -verbose -kmod -filter 'mounted' -cfg fsTest.numDirs=1 -cfg fsTest.numFiles=10 -short -binaries-dir $(pwd)/eggs \ No newline at end of file diff --git a/kmod/span.c b/kmod/span.c index b51c9208..376b0ed2 100644 --- a/kmod/span.c +++ b/kmod/span.c @@ -1,4 +1,7 @@ +#include + #include "bincode.h" +#include "inode.h" #include "log.h" #include "span.h" #include "metadata.h" @@ -28,25 +31,27 @@ unsigned long eggsfs_span_cache_min_avail_mem_drop = (2ull << 30) + (500ull << 2 unsigned long eggsfs_span_cache_max_size_sync = (100ull << 30); // 100GiB unsigned long eggsfs_span_cache_min_avail_mem_sync = (1ull << 30); // 1GiB +static struct kmem_cache* eggsfs_block_span_cachep; +static struct kmem_cache* eggsfs_inline_span_cachep; +static struct kmem_cache* eggsfs_fetch_stripe_cachep; struct eggsfs_span_lru { spinlock_t lock; struct list_head lru; } ____cacheline_aligned; // These are global locks, but the sharding should alleviate contention. 
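/*
 * A minimal sketch of the LRU sharding referred to above, with illustrative
 * demo_* names (the module's own version follows below): the (inode, span
 * start) pair is hashed into one of 1 << EGGSFS_SPAN_BITS independent LRUs,
 * so concurrent readers of unrelated spans rarely contend on the same spinlock.
 */
#include <linux/types.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/list.h>

#define DEMO_LRU_BITS 7
#define DEMO_LRUS (1 << DEMO_LRU_BITS)

struct demo_lru {
    spinlock_t lock;
    struct list_head lru;
} ____cacheline_aligned;

static struct demo_lru demo_lrus[DEMO_LRUS];

static struct demo_lru* demo_pick_lru(u64 ino, u64 start) {
    u64 h = hash_64(ino, DEMO_LRU_BITS) ^ hash_64(start, DEMO_LRU_BITS);
    return &demo_lrus[h % DEMO_LRUS];
}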
-#define EGGSFS_SPAN_BITS 7 -#define EGGSFS_SPAN_LRUS (1<enode->inode.i_ino, EGGSFS_SPAN_BITS); + h = hash_64(span->ino, EGGSFS_SPAN_BITS); h ^= hash_64(span->start, EGGSFS_SPAN_BITS); - return &eggsfs_span_lrus[h%EGGSFS_SPAN_LRUS]; + return &span_lrus[h%EGGSFS_SPAN_LRUS]; } -static struct eggsfs_span* eggsfs_lookup_span(struct rb_root* spans, u64 offset) { +static struct eggsfs_span* lookup_span(struct rb_root* spans, u64 offset) { struct rb_node* node = spans->rb_node; while (node) { struct eggsfs_span* span = container_of(node, struct eggsfs_span, node); @@ -61,7 +66,7 @@ static struct eggsfs_span* eggsfs_lookup_span(struct rb_root* spans, u64 offset) return NULL; } -static bool eggsfs_insert_span(struct rb_root* spans, struct eggsfs_span* span) { +static bool insert_span(struct rb_root* spans, struct eggsfs_span* span) { struct rb_node** new = &(spans->rb_node); struct rb_node* parent = NULL; while (*new) { @@ -82,15 +87,39 @@ static bool eggsfs_insert_span(struct rb_root* spans, struct eggsfs_span* span) return true; } -static void eggsfs_free_span(struct eggsfs_span* span) { - if (span->storage_class == EGGSFS_INLINE_STORAGE) { - kfree(EGGSFS_INLINE_SPAN(span)); - } else { - kfree(EGGSFS_BLOCK_SPAN(span)); +static void init_block_span(void* p) { + struct eggsfs_block_span* span = (struct eggsfs_block_span*)p; + + xa_init(&span->pages); + int i; + for (i = 0; i < EGGSFS_MAX_STRIPES; i++) { + eggsfs_latch_init(&span->stripe_latches[i]); } } -struct eggsfs_get_span_ctx { +static void free_span(struct eggsfs_span* span, u64* dropped_pages) { + if (span->storage_class == EGGSFS_INLINE_STORAGE) { + kmem_cache_free(eggsfs_inline_span_cachep, EGGSFS_INLINE_SPAN(span)); + } else { + struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); + eggsfs_counter_dec(eggsfs_stat_cached_spans); + struct page* page; + unsigned long page_ix; + xa_for_each(&block_span->pages, page_ix, page) { + if (dropped_pages) { (*dropped_pages)++; } + put_page(page); + xa_erase(&block_span->pages, page_ix); + atomic64_dec(&eggsfs_stat_cached_span_pages); + } + int i; + for (i = 0; i < EGGSFS_MAX_STRIPES; i++) { + BUG_ON(atomic64_read(&block_span->stripe_latches[i].counter) & 1); // nobody still hanging onto this + } + kmem_cache_free(eggsfs_block_span_cachep, block_span); + } +} + +struct get_span_ctx { struct eggsfs_inode* enode; struct list_head spans; int err; @@ -99,10 +128,10 @@ struct eggsfs_get_span_ctx { void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs) { eggsfs_debug("offset=%llu size=%u crc=%08x storage_class=%d parity=%d stripes=%d cell_size=%u", offset, size, crc, storage_class, parity, stripes, cell_size); - struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data; + struct get_span_ctx* ctx = (struct get_span_ctx*)data; if (ctx->err) { return; } - struct eggsfs_block_span* span = kmalloc(sizeof(struct eggsfs_block_span), GFP_KERNEL); + struct eggsfs_block_span* span = kmem_cache_alloc(eggsfs_block_span_cachep, GFP_KERNEL); if (!span) { ctx->err = -ENOMEM; return; } if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) { @@ -111,22 +140,19 @@ void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 sto return; } - span->span.enode = ctx->enode; + span->span.ino = ctx->enode->inode.i_ino; span->span.start = offset; span->span.end = offset + size; span->span.storage_class = storage_class; + span->enode = 
ctx->enode; // no ihold: `span->enode` is cleared before eviction + - xa_init(&span->pages); - int i; - for (i = 0; i < stripes; i++) { - eggsfs_latch_init(&span->stripe_latches[i]); - } span->cell_size = cell_size; memcpy(span->stripes_crc, stripes_crcs, sizeof(uint32_t)*stripes); span->stripes = stripes; span->parity = parity; // important, this will have readers skip over this until it ends up in the LRU - span->readers = -1; + span->refcount = -1; eggsfs_debug("adding normal span"); list_add_tail(&span->span.lru, &ctx->spans); @@ -137,7 +163,7 @@ void eggsfs_file_spans_cb_block( u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2, u8 flags, u64 block_id, u32 crc ) { - struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data; + struct get_span_ctx* ctx = (struct get_span_ctx*)data; if (ctx->err) { return; } struct eggsfs_span* span = list_last_entry(&ctx->spans, struct eggsfs_span, lru); @@ -157,36 +183,39 @@ void eggsfs_file_spans_cb_block( void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) { eggsfs_debug("offset=%llu size=%u len=%u body=%*pE", offset, size, len, len, body); - struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data; + struct get_span_ctx* ctx = (struct get_span_ctx*)data; if (ctx->err) { return; } - struct eggsfs_inline_span* span = kmalloc(sizeof(struct eggsfs_inline_span), GFP_KERNEL); + struct eggsfs_inline_span* span = kmem_cache_alloc(eggsfs_inline_span_cachep, GFP_KERNEL); if (!span) { ctx->err = -ENOMEM; return; } - span->span.enode = ctx->enode; + span->span.ino = ctx->enode->inode.i_ino; + span->span.start = offset; span->span.end = offset + size; span->span.storage_class = EGGSFS_INLINE_STORAGE; span->len = len; memcpy(span->body, body, len); + eggsfs_debug("adding inline span"); + list_add_tail(&span->span.lru, &ctx->spans); } // returns whether we could acquire it -static bool eggsfs_span_acquire(struct eggsfs_span* span) { +static bool span_acquire(struct eggsfs_span* span) { if (span->storage_class == EGGSFS_INLINE_STORAGE) { return true; } struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - struct eggsfs_span_lru* lru = eggsfs_get_span_lru(span); + struct eggsfs_span_lru* lru = get_span_lru(span); spin_lock_bh(&lru->lock); - if (unlikely(block_span->readers < 0)) { // the reclaimer got here before us. + if (unlikely(block_span->refcount < 0)) { // the reclaimer got here before us. spin_unlock_bh(&lru->lock); return false; } - block_span->readers++; - if (block_span->readers == 1) { + block_span->refcount++; + if (block_span->refcount == 1) { // we're the first ones here, take it out of the LRU list_del(&span->lru); } @@ -195,23 +224,44 @@ static bool eggsfs_span_acquire(struct eggsfs_span* span) { return true; } -void eggsfs_span_put(struct eggsfs_span* span, bool was_read) { +// Non-blockingly looks up and acquires a span. Returns -EBUSY if +// the span is there, but it could not be acquired (contention +// with LRU reclaimer). Returns NULL if the span is not there. 
+static struct eggsfs_span* lookup_and_acquire_span(struct eggsfs_inode* enode, u64 offset) { + struct eggsfs_inode_file* file = &enode->file; + int err; + err = down_read_killable(&enode->file.spans_lock); + if (err) { return ERR_PTR(err); } + { + struct eggsfs_span* span = lookup_span(&file->spans, offset); + if (likely(span)) { + if (unlikely(!span_acquire(span))) { + up_read(&enode->file.spans_lock); + return ERR_PTR(-EBUSY); + } + up_read(&enode->file.spans_lock); + return span; + } + up_read(&enode->file.spans_lock); + } + return NULL; +} + +void eggsfs_put_span(struct eggsfs_span* span, bool was_read) { if (span->storage_class == EGGSFS_INLINE_STORAGE) { return; } struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - struct eggsfs_span_lru* lru = eggsfs_get_span_lru(span); + struct eggsfs_span_lru* lru = get_span_lru(span); spin_lock_bh(&lru->lock); - block_span->actually_read = block_span->actually_read || was_read; - BUG_ON(block_span->readers < 1); - block_span->readers--; - if (block_span->readers == 0) { // we need to put it back into the LRU - if (block_span->actually_read) { + BUG_ON(block_span->refcount < 1); + block_span->refcount--; + if (block_span->refcount == 0) { // we need to put it back into the LRU, or free it + if (block_span->touched) { list_add_tail(&span->lru, &lru->lru); } else { list_add(&span->lru, &lru->lru); } - block_span->actually_read = false; } spin_unlock_bh(&lru->lock); } @@ -246,19 +296,15 @@ retry: } // Try to read the semaphore if it's already there. - err = down_read_killable(&enode->file.spans_lock); - if (err) { GET_SPAN_EXIT(ERR_PTR(err)); } { - struct eggsfs_span* span = eggsfs_lookup_span(&file->spans, offset); - if (likely(span)) { - if (unlikely(!eggsfs_span_acquire(span))) { - up_read(&enode->file.spans_lock); - goto retry; - } - up_read(&enode->file.spans_lock); - GET_SPAN_EXIT(span); + struct eggsfs_span* span = lookup_and_acquire_span(enode, offset); + if (unlikely(IS_ERR(span))) { + int err = PTR_ERR(span); + if (err == -EBUSY) { goto retry; } // we couldn't acquire the span + return span; // some other error + } else if (likely(span != NULL)) { + return span; } - up_read(&enode->file.spans_lock); } // We need to fetch the spans. @@ -267,9 +313,9 @@ retry: // Check if somebody go to it first. 
{ - struct eggsfs_span* span = eggsfs_lookup_span(&file->spans, offset); + struct eggsfs_span* span = lookup_span(&file->spans, offset); if (unlikely(span)) { - if (unlikely(!eggsfs_span_acquire(span))) { + if (unlikely(!span_acquire(span))) { up_write(&file->spans_lock); goto retry; } @@ -285,7 +331,7 @@ retry: for (spans_offset = 0;;) { // fetch next batch of spans u64 next_offset; - struct eggsfs_get_span_ctx ctx = { .err = 0, .enode = enode }; + struct get_span_ctx ctx = { .err = 0, .enode = enode }; INIT_LIST_HEAD(&ctx.spans); err = eggsfs_error_to_linux(eggsfs_shard_file_spans( (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, spans_offset, &next_offset,&ctx @@ -296,7 +342,7 @@ retry: for (;;) { struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru); list_del(&span->lru); - eggsfs_free_span(span); + free_span(span, NULL); } up_write(&file->spans_lock); GET_SPAN_EXIT(ERR_PTR(err)); @@ -306,17 +352,17 @@ retry: struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru); if (span == NULL) { break; } list_del(&span->lru); - if (!eggsfs_insert_span(&file->spans, span)) { + if (!insert_span(&file->spans, span)) { // Span is already cached - eggsfs_free_span(span); + free_span(span, NULL); } else { if (span->storage_class != EGGSFS_INLINE_STORAGE) { // Not already cached, must add it to the LRU eggsfs_counter_inc(eggsfs_stat_cached_spans); struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - struct eggsfs_span_lru* lru = eggsfs_get_span_lru(span); + struct eggsfs_span_lru* lru = get_span_lru(span); spin_lock_bh(&lru->lock); - block_span->readers = 0; + block_span->refcount = 0; list_add(&span->lru, &lru->lru); spin_unlock_bh(&lru->lock); } @@ -338,15 +384,15 @@ retry: // If it returns -1, there are no spans to drop. Otherwise, returns the // next index to pass in to reclaim the next span. -static int eggsfs_drop_one_span(int lru_ix, u64* dropped_pages) { +static int drop_one_span(int lru_ix, u64* dropped_pages) { BUG_ON(lru_ix < 0 || lru_ix >= EGGSFS_SPAN_LRUS); int i; for (i = lru_ix; i < lru_ix + EGGSFS_SPAN_LRUS; i++) { - struct eggsfs_span_lru* lru = &eggsfs_span_lrus[i%EGGSFS_SPAN_LRUS]; + struct eggsfs_span_lru* lru = &span_lrus[i%EGGSFS_SPAN_LRUS]; struct eggsfs_span* candidate = NULL; - struct eggsfs_block_span* candidate_blocks = NULL; + struct eggsfs_inode* enode = NULL; // Pick a candidate from the LRU, mark it as reclaiming, take it out of // the LRU. 
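/*
 * A minimal sketch of the span refcount protocol used by this patch (the field
 * renamed from `readers` to `refcount`), with illustrative demo_* names:
 * 0 means idle and sitting in the LRU, > 0 means acquired by that many readers
 * and unlinked from the LRU, -1 means a reclaimer (or the evictor) has claimed
 * the span and any further acquisition must fail. All transitions happen under
 * the per-LRU spinlock.
 */
#include <linux/types.h>

struct demo_span { int refcount; };

static bool demo_try_acquire(struct demo_span* s) { /* caller holds the LRU lock */
    if (s->refcount < 0) { return false; } /* reclaimer won the race */
    s->refcount++;
    if (s->refcount == 1) { /* first reader: unlink the span from the LRU here */ }
    return true;
}

static void demo_release(struct demo_span* s) { /* caller holds the LRU lock */
    s->refcount--;
    if (s->refcount == 0) { /* last reader: relink the span into the LRU here */ }
}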
@@ -356,32 +402,30 @@ static int eggsfs_drop_one_span(int lru_ix, u64* dropped_pages) { spin_unlock_bh(&lru->lock); } else { BUG_ON(candidate->storage_class == EGGSFS_INLINE_STORAGE); // no inline spans in LRU - candidate_blocks = EGGSFS_BLOCK_SPAN(candidate); + struct eggsfs_block_span* candidate_blocks = EGGSFS_BLOCK_SPAN(candidate); // we just got this from LRU, it must be readers == 0 - BUG_ON(candidate_blocks->readers != 0); - candidate_blocks->readers = -1; + BUG_ON(candidate_blocks->refcount != 0); + candidate_blocks->refcount = -1; list_del(&candidate->lru); + if (likely(candidate_blocks->enode)) { + ihold(&candidate_blocks->enode->inode); + enode = candidate_blocks->enode; + } spin_unlock_bh(&lru->lock); } if (candidate) { // Take it out of the spans for the file - down_write(&candidate->enode->file.spans_lock); - rb_erase(&candidate->node, &candidate->enode->file.spans); - up_write(&candidate->enode->file.spans_lock); + if (likely(enode)) { + down_write(&enode->file.spans_lock); + rb_erase(&candidate->node, &enode->file.spans); + up_write(&enode->file.spans_lock); + iput(&enode->inode); + } // At this point we're free to do what we please with the span // (it's fully private to us). - eggsfs_counter_dec(eggsfs_stat_cached_spans); - struct page* page; - unsigned long page_ix; - xa_for_each(&candidate_blocks->pages, page_ix, page) { - put_page(page); - xa_erase(&candidate_blocks->pages, page_ix); - atomic64_dec(&eggsfs_stat_cached_span_pages); - (*dropped_pages)++; - } - eggsfs_free_span(candidate); + free_span(candidate, dropped_pages); i++; break; @@ -389,45 +433,43 @@ static int eggsfs_drop_one_span(int lru_ix, u64* dropped_pages) { } // we've fully gone around without finding anything - if (i == lru_ix + EGGSFS_SPAN_LRUS) { - return -1; - } + if (i == lru_ix + EGGSFS_SPAN_LRUS) { return -1; } // We've found something return i%EGGSFS_SPAN_LRUS; } -static inline void eggsfs_drop_spans_enter(const char* type) { +static inline void drop_spans_enter(const char* type) { trace_eggsfs_drop_spans_enter(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans)); } -static inline void eggsfs_drop_spans_exit(const char* type, u64 dropped_pages) { +static inline void drop_spans_exit(const char* type, u64 dropped_pages) { trace_eggsfs_drop_spans_exit(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans), dropped_pages); } // returns the number of dropped pages u64 eggsfs_drop_all_spans(void) { - eggsfs_drop_spans_enter("all"); + drop_spans_enter("all"); u64 dropped_pages = 0; s64 spans_begin = eggsfs_counter_get(&eggsfs_stat_cached_spans); int lru_ix = 0; for (;;) { - lru_ix = eggsfs_drop_one_span(lru_ix, &dropped_pages); + lru_ix = drop_one_span(lru_ix, &dropped_pages); if (lru_ix < 0) { break; } } s64 spans_end = eggsfs_counter_get(&eggsfs_stat_cached_spans); eggsfs_info("reclaimed %llu pages, %lld spans (approx)", dropped_pages, spans_begin-spans_end); - eggsfs_drop_spans_exit("all", dropped_pages); + drop_spans_exit("all", dropped_pages); return dropped_pages; } -static DEFINE_MUTEX(eggsfs_drop_spans_mu); +static DEFINE_MUTEX(drop_spans_mu); -static u64 eggsfs_drop_spans(const char* type) { - mutex_lock(&eggsfs_drop_spans_mu); +static u64 drop_spans(const char* type) { + mutex_lock(&drop_spans_mu); - eggsfs_drop_spans_enter(type); + drop_spans_enter(type); u64 dropped_pages = 0; int lru_ix = 0; @@ -439,25 +481,25 @@ static u64 
eggsfs_drop_spans(const char* type) { ) { break; } - lru_ix = eggsfs_drop_one_span(lru_ix, &dropped_pages); + lru_ix = drop_one_span(lru_ix, &dropped_pages); if (lru_ix < 0) { break; } } eggsfs_debug("dropped %llu pages", dropped_pages); - eggsfs_drop_spans_exit(type, dropped_pages); + drop_spans_exit(type, dropped_pages); - mutex_unlock(&eggsfs_drop_spans_mu); + mutex_unlock(&drop_spans_mu); return dropped_pages; } -static void eggsfs_reclaim_spans_async(struct work_struct* work) { - if (!mutex_is_locked(&eggsfs_drop_spans_mu)) { // somebody's already taking care of it - eggsfs_drop_spans("async"); +static void reclaim_spans_async(struct work_struct* work) { + if (!mutex_is_locked(&drop_spans_mu)) { // somebody's already taking care of it + drop_spans("async"); } } -static DECLARE_WORK(eggsfs_reclaim_spans_work, eggsfs_reclaim_spans_async); +static DECLARE_WORK(reclaim_spans_work, reclaim_spans_async); static unsigned long eggsfs_span_shrinker_count(struct shrinker* shrinker, struct shrink_control* sc) { u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages); @@ -469,15 +511,15 @@ static int eggsfs_span_shrinker_lru_ix = 0; static u64 eggsfs_span_shrinker_pages_round = 25600; // We drop at most 100MiB in one shrinker round static unsigned long eggsfs_span_shrinker_scan(struct shrinker* shrinker, struct shrink_control* sc) { - eggsfs_drop_spans_enter("shrinker"); + drop_spans_enter("shrinker"); u64 dropped_pages; int lru_ix = eggsfs_span_shrinker_lru_ix; for (dropped_pages = 0; dropped_pages < eggsfs_span_shrinker_pages_round;) { - lru_ix = eggsfs_drop_one_span(lru_ix, &dropped_pages); + lru_ix = drop_one_span(lru_ix, &dropped_pages); if (lru_ix < 0) { break; } } eggsfs_span_shrinker_lru_ix = lru_ix >= 0 ? lru_ix : 0; - eggsfs_drop_spans_exit("shrinker", dropped_pages); + drop_spans_exit("shrinker", dropped_pages); return dropped_pages; } @@ -498,48 +540,47 @@ again: struct eggsfs_span* span = rb_entry(node, struct eggsfs_span, node); struct eggsfs_block_span* block_span = NULL; + bool can_free_span = true; if (span->storage_class != EGGSFS_INLINE_STORAGE) { block_span = EGGSFS_BLOCK_SPAN(span); - struct eggsfs_span_lru* lru = eggsfs_get_span_lru(span); - // This function is only called on eviction, so either this is being - // reclaimed, or it's in the LRU. + struct eggsfs_span_lru* lru = get_span_lru(span); spin_lock_bh(&lru->lock); - BUG_ON(block_span->readers > 0); - if (unlikely(block_span->readers < 0)) { - // This is being reclaimed, we have to wait until the reclaimer - // cleans it up for us. + if (unlikely(block_span->refcount < 0)) { + // This is being reclaimed, we wait until the reclaimer cleans + // it up for us. spin_unlock_bh(&lru->lock); up_write(&enode->file.spans_lock); goto again; + } else if (unlikely(block_span->refcount > 0)) { + eggsfs_warn("span at %llu for inode %016lx still has %d readers, will linger on until next reclaim", span->start, enode->inode.i_ino, block_span->refcount); + can_free_span = false; + } else { + // Make ourselves the reclaimer. + // This is technically an optimization: we could just wait until they are reclaimed. But + // it will also be the mega common case and will free memory more eagerly. + // TODO tests that things still work if we don't have this optimization. 
+ list_del(&span->lru); + block_span->refcount = -1; } - block_span->readers = -1; - list_del(&span->lru); + block_span->enode = NULL; spin_unlock_bh(&lru->lock); } rb_erase(node, &enode->file.spans); - if (block_span) { - eggsfs_counter_dec(eggsfs_stat_cached_spans); - struct page* page; - unsigned long page_ix; - xa_for_each(&block_span->pages, page_ix, page) { - put_page(page); - xa_erase(&block_span->pages, page_ix); - atomic64_dec(&eggsfs_stat_cached_span_pages); - } - } - eggsfs_free_span(span); + if (can_free_span) { free_span(span, NULL); } } up_write(&enode->file.spans_lock); } -struct eggsfs_fetch_stripe_state { - u64 ino; +struct fetch_stripe_state { + // ihold'd + struct eggsfs_inode *enode; struct semaphore sema; // used to wait on remaining = 0. could be done faster with a wait_queue, but don't want to worry about smb subtleties. s64 stripe_seqno; // used to release the stripe when we're done with it - struct eggsfs_block_span* span; // what we're working on (must not be put until the end of the fetch) + // We hold this (via span acquire) until we're done. + struct eggsfs_block_span* span; // these will be filled in by the fetching (only D of them well be // filled by fetching the blocks, the others might be filled in by the RS // recovery) @@ -550,31 +591,80 @@ struct eggsfs_fetch_stripe_state { static_assert(EGGSFS_MAX_BLOCKS <= 16); u16 blocks; // which blocks we're fetching, bitmap u8 stripe; // which stripe we're into - u8 prefetching; // wether we're prefetching (if we are we'll have to dispose of the span ourselves) + u8 prefetching; // used for logging }; -static void eggsfs_put_fetch_stripe(struct eggsfs_fetch_stripe_state* st) { - struct eggsfs_inode* enode = st->span->span.enode; +static void init_fetch_stripe(void* p) { + struct fetch_stripe_state* st = (struct fetch_stripe_state*)p; - int refs = atomic_dec_return(&st->refcount); - if (refs == 0) { // we're the last ones here - trace_eggsfs_fetch_stripe(enode->inode.i_ino, st->span->span.start, st->stripe, st->prefetching, EGGSFS_FETCH_STRIPE_FREE, atomic_read(&st->err)); - struct eggsfs_block_span* span = st->span; - int B = eggsfs_blocks(span->parity); - int i; - for (i = 0; i < B; i++) { - put_pages_list(&st->blocks_pages[i]); - } - if (st->prefetching) { - smp_mb__before_atomic(); - atomic_dec_return(&enode->file.prefetches); - wake_up_all(&enode->file.prefetches_wq); - } - kfree(st); + sema_init(&st->sema, 0); + int i; + for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) { + INIT_LIST_HEAD(&st->blocks_pages[i]); } } -static void eggsfs_fetch_stripe_store_pages(struct eggsfs_fetch_stripe_state* st) { +// This must be called while already holding the span, which means +// that span_acquire will always succeed. 
+static struct fetch_stripe_state* new_fetch_stripe( + struct eggsfs_inode* enode, + struct eggsfs_block_span* span, + u8 stripe, + u64 stripe_seqno, + u16 blocks, + bool prefetching +) { + struct fetch_stripe_state* st = (struct fetch_stripe_state*)kmem_cache_alloc(eggsfs_fetch_stripe_cachep, GFP_KERNEL); + if (!st) { return st; } + + BUG_ON(!span_acquire(&span->span)); + + int D = eggsfs_data_blocks(span->parity); + + st->enode = enode; + st->stripe_seqno = stripe_seqno; + st->span = span; + atomic_set(&st->remaining, D); + atomic_set(&st->refcount, 1); // the caller + atomic_set(&st->err, 0); + st->stripe = stripe; + st->prefetching = prefetching; + st->blocks = blocks; + + eggsfs_in_flight_begin(enode); + + return st; +} + +static void put_fetch_stripe(struct fetch_stripe_state* st) { + if (atomic_dec_return(&st->refcount) > 0) { return; } + + // we're the last ones here + trace_eggsfs_fetch_stripe(st->enode->inode.i_ino, st->span->span.start, st->stripe, st->prefetching, EGGSFS_FETCH_STRIPE_FREE, atomic_read(&st->err)); + + // Note that we release this here, rather than when `remaining == 0`, since + // there's no guarantee that we reach `remaining == 0` (we might fail to start + // enough requests) + eggsfs_latch_release(&st->span->stripe_latches[st->stripe], st->stripe_seqno); + + while (down_trylock(&st->sema) == 0) {} // reset sema to zero for next usage + + // Free leftover blocks (also ensures the lists are all nice and empty for the + // next user) + int i; + for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) { + put_pages_list(&st->blocks_pages[i]); + } + + // We don't need the span anymore + eggsfs_put_span(&st->span->span, false); + + eggsfs_in_flight_end(st->enode); + + kmem_cache_free(eggsfs_fetch_stripe_cachep, st); +} + +static void store_pages(struct fetch_stripe_state* st) { struct eggsfs_block_span* span = st->span; int D = eggsfs_data_blocks(span->parity); @@ -586,7 +676,7 @@ static void eggsfs_fetch_stripe_store_pages(struct eggsfs_fetch_stripe_state* st { // get the number of pages struct list_head* tmp; // eggsfs_info("picking %d", __builtin_ctz(st->blocks)); - list_for_each(tmp, &st->blocks_pages[__builtin_ctz(st->blocks)]) { // just pick the first one + list_for_each(tmp, &st->blocks_pages[__builtin_ctz(st->blocks)]) { // just pick the first one to count pages pages++; } } @@ -645,19 +735,19 @@ static void eggsfs_fetch_stripe_store_pages(struct eggsfs_fetch_stripe_state* st // eggsfs_info("curr_page=%u end_page=%u", curr_page, end_page); BUG_ON(curr_page != end_page); if (crc != span->stripes_crc[st->stripe]) { - eggsfs_warn("bad crc for file %016lx, stripe %u, expected %08x, got %08x", span->span.enode->inode.i_ino, st->stripe, span->stripes_crc[st->stripe], crc); + eggsfs_warn("bad crc for file %016lx, stripe %u, expected %08x, got %08x", st->enode->inode.i_ino, st->stripe, span->stripes_crc[st->stripe], crc); atomic_set(&st->err, -EIO); } } -static void eggsfs_fetch_stripe_block_done(void* data, u64 block_id, struct list_head* pages, int err) { +static void block_done(void* data, u64 block_id, struct list_head* pages, int err) { eggsfs_debug("block complete %016llx", block_id); - struct eggsfs_fetch_stripe_state* st = (struct eggsfs_fetch_stripe_state*)data; + struct fetch_stripe_state* st = (struct fetch_stripe_state*)data; struct eggsfs_block_span* span = st->span; int B = eggsfs_blocks(span->parity); trace_eggsfs_fetch_stripe( - st->ino, + st->enode->inode.i_ino, st->span->span.start, st->stripe, st->prefetching, @@ -693,17 +783,11 @@ out: 
eggsfs_debug("remaining=%d", remaining); if (remaining == 0) { // we're the last one to finish, we need to store the pages in the span - eggsfs_fetch_stripe_store_pages(st); - // release the stripe latch, we're done writing - eggsfs_latch_release(&span->stripe_latches[st->stripe], st->stripe_seqno); - // release the span too, if we were prefetching - if (st->prefetching) { - eggsfs_span_put(&st->span->span, false); - } + store_pages(st); // and wake up waiters, if any up(&st->sema); trace_eggsfs_fetch_stripe( - st->ino, + st->enode->inode.i_ino, st->span->span.start, st->stripe, st->prefetching, @@ -711,16 +795,18 @@ out: atomic_read(&st->err) ); } - eggsfs_put_fetch_stripe(st); // one refcount per request + put_fetch_stripe(st); // one refcount per request eggsfs_debug("done"); } -static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe( +static struct fetch_stripe_state* start_fetch_stripe( struct eggsfs_block_span* span, u8 stripe, s64 stripe_seqno, bool prefetching ) { + BUG_ON(!span->enode); // must still be alive here + struct eggsfs_inode* enode = span->enode; + int err; - struct eggsfs_inode* enode = span->span.enode; - struct eggsfs_fetch_stripe_state* st = NULL; + struct fetch_stripe_state* st = NULL; int i; trace_eggsfs_fetch_stripe(enode->inode.i_ino, span->span.start, stripe, prefetching, EGGSFS_FETCH_STRIPE_START, 0); @@ -753,32 +839,13 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe( #else // Randomly drop some blocks, useful for testing while (__builtin_popcount(blocks) < D) { - uint32_t r; + uint8_t r; get_random_bytes(&r, sizeof(r)); blocks |= 1u << (r%B); } #endif - st = kmalloc(sizeof(struct eggsfs_fetch_stripe_state), GFP_KERNEL); - if (!st) { err = -ENOMEM; goto out_err; } - - st->ino = enode->inode.i_ino; - sema_init(&st->sema, 0); - st->stripe_seqno = stripe_seqno; - st->span = span; - for (i = 0; i < B; i++) { - INIT_LIST_HEAD(&st->blocks_pages[i]); - } - atomic_set(&st->remaining, D); - atomic_set(&st->refcount, 1); // the caller - atomic_set(&st->err, 0); - st->stripe = stripe; - st->prefetching = prefetching; - smp_mb__before_atomic(); - if (st->prefetching) { - atomic_inc_return(&enode->file.prefetches); - } - st->blocks = blocks; + st = new_fetch_stripe(enode, span, stripe, stripe_seqno, blocks, prefetching); // start fetching blocks eggsfs_debug("fetching blocks"); @@ -789,7 +856,7 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe( struct eggsfs_block* block = &span->blocks[i]; atomic_inc(&st->refcount); err = eggsfs_fetch_block( - &eggsfs_fetch_stripe_block_done, + &block_done, (void*)st, &block->bs, block->id, block_offset, span->cell_size ); @@ -804,33 +871,23 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe( out_err: trace_eggsfs_fetch_stripe(enode->inode.i_ino, span->span.start, stripe, prefetching, EGGSFS_FETCH_STRIPE_END, err); - if (st) { eggsfs_put_fetch_stripe(st); } + if (st) { put_fetch_stripe(st); } return ERR_PTR(err); } -static void eggsfs_do_prefetch(struct eggsfs_block_span* block_span, u64 off) { - struct eggsfs_inode* enode = block_span->span.enode; +static void do_prefetch(struct eggsfs_block_span* block_span, u64 off) { + BUG_ON(!block_span->enode); // must still be alive at this point + struct eggsfs_inode* enode = block_span->enode; if (off < block_span->span.start || off >= block_span->span.end) { // Another span, we need to load it. We give up if we can't at any occurrence. 
- int err = down_read_killable(&enode->file.spans_lock); - if (err) { return; } - struct eggsfs_span* span = eggsfs_lookup_span(&enode->file.spans, off); - if (likely(span)) { - if (unlikely(!eggsfs_span_acquire(span))) { - up_read(&enode->file.spans_lock); - return; - } - } else { - up_read(&enode->file.spans_lock); - return; - } - up_read(&enode->file.spans_lock); + struct eggsfs_span* span = lookup_and_acquire_span(enode, off); + if (unlikely(span == NULL || IS_ERR(span))) { return; } if (span->storage_class == EGGSFS_INLINE_STORAGE) { return; // nothing to do, it's an inline span } block_span = EGGSFS_BLOCK_SPAN(span); - } else if (unlikely(!eggsfs_span_acquire(&block_span->span))) { + } else if (unlikely(!span_acquire(&block_span->span))) { return; // it's being reclaimed, don't bother } @@ -862,7 +919,7 @@ static void eggsfs_do_prefetch(struct eggsfs_block_span* block_span, u64 off) { } // Finally start the work. - struct eggsfs_fetch_stripe_state* st = eggsfs_start_fetch_stripe(block_span, stripe, seqno, true); + struct fetch_stripe_state* st = start_fetch_stripe(block_span, stripe, seqno, true); if (IS_ERR(st)) { eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno); eggsfs_debug("prefetch failed err=%ld", PTR_ERR(st)); @@ -870,15 +927,15 @@ static void eggsfs_do_prefetch(struct eggsfs_block_span* block_span, u64 off) { } // We want the state to go once all the block fetches are done. - eggsfs_put_fetch_stripe(st); - return; - + put_fetch_stripe(st); out_span: - eggsfs_span_put(&block_span->span, false); + // The fetch state has ownership itself + eggsfs_put_span(&block_span->span, false); + return; } // Returns whether we're low on memory -static bool eggsfs_reclaim_after_fetch_stripe(void) { +static bool reclaim_after_fetch_stripe(void) { // Reclaim pages if we went over the limit u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages); u64 free_pages = si_mem_available(); @@ -886,28 +943,33 @@ static bool eggsfs_reclaim_after_fetch_stripe(void) { if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_sync || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_sync) { low_on_memory = true; // sync dropping, apply backpressure - if (!mutex_lock_killable(&eggsfs_drop_spans_mu)) { - eggsfs_drop_spans("sync"); - mutex_unlock(&eggsfs_drop_spans_mu); + if (!mutex_lock_killable(&drop_spans_mu)) { + drop_spans("sync"); + mutex_unlock(&drop_spans_mu); } } else if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_async || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_async) { low_on_memory = true; // don't bother submitting if another span dropper is running already - if (!mutex_is_locked(&eggsfs_drop_spans_mu)) { + if (!mutex_is_locked(&drop_spans_mu)) { // TODO Is it a good idea to do it on system_long_wq rather than eggsfs_wq? The freeing // job might face heavy contention, so maybe yes? // On the other end, this might make it harder to know when it's safe to unload // the module by making sure that there's no work left on the queue. 
- queue_work(system_long_wq, &eggsfs_reclaim_spans_work); + queue_work(system_long_wq, &reclaim_spans_work); } } return low_on_memory; } -static void eggsfs_maybe_prefetch(struct eggsfs_block_span* span, bool low_on_memory, u64 stripe_start, u64 stripe_end) { - if (!eggsfs_prefetch) { return; } +static void maybe_prefetch( + struct eggsfs_block_span* span, bool low_on_memory, u64 stripe_start, u64 stripe_end +) { + BUG_ON(!span->enode); // must still be alive here + struct eggsfs_inode* enode = span->enode; - struct eggsfs_inode_file* file = &span->span.enode->file; + if (!eggsfs_prefetch || low_on_memory) { return; } + + struct eggsfs_inode_file* file = &enode->file; u64 prefetch_section = atomic64_read(&file->prefetch_section); u32 prefetch_section_begin = prefetch_section & (uint32_t)(-1); u32 prefetch_section_end = (prefetch_section>>32) & (uint32_t)(-1); @@ -929,65 +991,69 @@ static void eggsfs_maybe_prefetch(struct eggsfs_block_span* span, bool low_on_me } u64 new_prefetch_section = prefetch_section_begin | ((u64)prefetch_section_end << 32); if (likely(atomic64_cmpxchg(&file->prefetch_section, prefetch_section, new_prefetch_section) == prefetch_section)) { - eggsfs_do_prefetch(span, off); + do_prefetch(span, off); } } } -struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix) { - trace_eggsfs_get_span_page_enter(span->span.enode->inode.i_ino, span->span.start, page_ix*PAGE_SIZE); +struct page* eggsfs_get_span_page(struct eggsfs_block_span* block_span, u32 page_ix) { + struct eggsfs_span* span = &block_span->span; + + trace_eggsfs_get_span_page_enter(span->ino, span->start, page_ix*PAGE_SIZE); // We need to load the stripe - int D = eggsfs_data_blocks(span->parity); + int D = eggsfs_data_blocks(block_span->parity); u32 span_offset = page_ix * PAGE_SIZE; - u32 stripe = span_offset / (span->cell_size*D); + u32 stripe = span_offset / (block_span->cell_size*D); bool low_on_memory = false; int err = 0; #define GET_PAGE_EXIT(p) do { \ int __err = IS_ERR(p) ? PTR_ERR(p) : 0; \ if (likely(__err == 0)) { \ - eggsfs_maybe_prefetch(span, low_on_memory, span->span.start + stripe*(span->cell_size*D), span->span.start + (stripe+1)*(span->cell_size*D)); \ + maybe_prefetch(block_span, low_on_memory, span->start + stripe*(block_span->cell_size*D), span->start + (stripe+1)*(block_span->cell_size*D)); \ } \ - trace_eggsfs_get_span_page_exit(span->span.enode->inode.i_ino, span->span.start, page_ix*PAGE_SIZE, __err); \ + trace_eggsfs_get_span_page_exit(span->ino, span->start, page_ix*PAGE_SIZE, __err); \ return p; \ } while(0) // TODO better error? 
- if (stripe > span->stripes) { - eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, span->stripes); + if (stripe > block_span->stripes) { + eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, block_span->stripes); GET_PAGE_EXIT(ERR_PTR(-EIO)); } // this should be guaranteed by the caller but we rely on it below, so let's check - if (span->cell_size%PAGE_SIZE != 0) { - eggsfs_warn("cell_size=%u, PAGE_SIZE=%lu, span->cell_size%%PAGE_SIZE=%lu", span->cell_size, PAGE_SIZE, span->cell_size%PAGE_SIZE); + if (block_span->cell_size%PAGE_SIZE != 0) { + eggsfs_warn("cell_size=%u, PAGE_SIZE=%lu, block_span->cell_size%%PAGE_SIZE=%lu", block_span->cell_size, PAGE_SIZE, block_span->cell_size%PAGE_SIZE); GET_PAGE_EXIT(ERR_PTR(-EIO)); } struct page* page; again: - page = xa_load(&span->pages, page_ix); + page = xa_load(&block_span->pages, page_ix); if (page != NULL) { GET_PAGE_EXIT(page); } + // There is no matching release here because the latch is released + // when the fetchers finish (even if this call is interrupted) s64 seqno; - if (!eggsfs_latch_try_acquire(&span->stripe_latches[stripe], seqno)) { - int err = eggsfs_latch_wait_killable(&span->stripe_latches[stripe], seqno); + if (!eggsfs_latch_try_acquire(&block_span->stripe_latches[stripe], seqno)) { + int err = eggsfs_latch_wait_killable(&block_span->stripe_latches[stripe], seqno); if (err) { GET_PAGE_EXIT(ERR_PTR(err)); } goto again; } - page = xa_load(&span->pages, page_ix); + page = xa_load(&block_span->pages, page_ix); if (page != NULL) { // somebody got to it first - eggsfs_latch_release(&span->stripe_latches[stripe], seqno); + eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno); GET_PAGE_EXIT(page); } eggsfs_debug("about to start fetch stripe"); - struct eggsfs_fetch_stripe_state* st = NULL; - st = eggsfs_start_fetch_stripe(span, stripe, seqno, false); + struct fetch_stripe_state* st = NULL; + st = start_fetch_stripe(block_span, stripe, seqno, false); if (IS_ERR(st)) { err = PTR_ERR(st); st = NULL; @@ -1005,16 +1071,15 @@ again: if (err) { goto out_err; } // We're finally done - page = xa_load(&span->pages, page_ix); + page = xa_load(&block_span->pages, page_ix); // eggsfs_info("loaded page %p for span %p at %u", page, span, page_ix); BUG_ON(page == NULL); out: - // Note that the latch was already released by the fetchers. - if (st != NULL) { eggsfs_put_fetch_stripe(st); } + if (st != NULL) { put_fetch_stripe(st); } // Reclaim if needed - low_on_memory = eggsfs_reclaim_after_fetch_stripe(); + low_on_memory = reclaim_after_fetch_stripe(); GET_PAGE_EXIT(err == 0 ? 
page : ERR_PTR(err));
@@ -1029,16 +1094,63 @@ out_err:
 int eggsfs_span_init(void) {
    int i;
    for (i = 0; i < EGGSFS_SPAN_LRUS; i++) {
-        spin_lock_init(&eggsfs_span_lrus[i].lock);
-        INIT_LIST_HEAD(&eggsfs_span_lrus[i].lru);
+        spin_lock_init(&span_lrus[i].lock);
+        INIT_LIST_HEAD(&span_lrus[i].lru);
    }
 
    int err = register_shrinker(&eggsfs_span_shrinker);
    if (err) { return err; }
 
+    eggsfs_block_span_cachep = kmem_cache_create(
+        "eggsfs_block_span_cache",
+        sizeof(struct eggsfs_block_span),
+        0,
+        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+        &init_block_span
+    );
+    if (!eggsfs_block_span_cachep) {
+        err = -ENOMEM;
+        goto out_shrinker;
+    }
+
+    eggsfs_inline_span_cachep = kmem_cache_create(
+        "eggsfs_inline_span_cache",
+        sizeof(struct eggsfs_inline_span),
+        0,
+        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+        NULL
+    );
+    if (!eggsfs_inline_span_cachep) {
+        err = -ENOMEM;
+        goto out_block;
+    }
+
+    eggsfs_fetch_stripe_cachep = kmem_cache_create(
+        "eggsfs_fetch_stripe_cache",
+        sizeof(struct fetch_stripe_state),
+        0,
+        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+        &init_fetch_stripe
+    );
+    if (!eggsfs_fetch_stripe_cachep) {
+        err = -ENOMEM;
+        goto out_inline;
+    }
+
    return 0;
+
+out_inline:
+    kmem_cache_destroy(eggsfs_inline_span_cachep);
+out_block:
+    kmem_cache_destroy(eggsfs_block_span_cachep);
+out_shrinker:
+    unregister_shrinker(&eggsfs_span_shrinker);
+    return err;
 }
 
 void eggsfs_span_exit(void) {
+    kmem_cache_destroy(eggsfs_fetch_stripe_cachep);
+    kmem_cache_destroy(eggsfs_inline_span_cachep);
+    kmem_cache_destroy(eggsfs_block_span_cachep);
    unregister_shrinker(&eggsfs_span_shrinker);
 }
diff --git a/kmod/span.h b/kmod/span.h
index a5f6fb41..557828f3 100644
--- a/kmod/span.h
+++ b/kmod/span.h
@@ -18,9 +18,9 @@ extern unsigned long eggsfs_span_cache_max_size_drop;
 extern unsigned long eggsfs_span_cache_min_avail_mem_drop;
 
 struct eggsfs_span {
-    struct eggsfs_inode* enode;
-    struct rb_node node; // to look them up
-    struct list_head lru; // to decide what to evict
+    u64 ino; // for logging and various other non-essential things.
+    struct rb_node node;
+    struct list_head lru;
    u64 start;
    u64 end;
    u8 storage_class; // used to determine which type of span this is enclosed in
@@ -47,15 +47,25 @@ struct eggsfs_block_span {
    struct eggsfs_span span;
    struct xarray pages; // indexed by page number
    struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS];
-    struct eggsfs_latch stripe_latches[15];
+    struct eggsfs_latch stripe_latches[EGGSFS_MAX_STRIPES];
+    // This field, like the one below, is protected by the LRU lock.
+    // If it is NULL, it means that the span has outlived the inode.
+    // Given the `file->in_flight` field this should really never
+    // happen, but it's a good sanity check.
+    //
+    // Note that ihold'ing this would defeat the purpose (the cached
+    // spans would prevent inode eviction).
+    struct eggsfs_inode* enode;
+    // When refcount = 0, this might still be alive, but in an LRU.
+    // When refcount < 0, we're currently reclaiming this span.
+    // Going from 0 to 1, and from 0 to -1 involves taking the respective
+    // LRU lock. This is not atomic because _all changes to this
+    // value go through the LRU lock_, since they must be paired with
+    // manipulating the LRU list.
+    int refcount;
    u32 cell_size;
-    u32 stripes_crc[15];
-    // * 0: there are no readers, the span is in the LRU.
-    // * n > 0: there are n readers, the span is _not_ in the LRU.
-    // * n < 0: the span is being reclaimed, it is _not_ in the LRU.
-    s64 readers;
-    // If anybody's actually read some of this span
-    bool actually_read;
+    u32 stripes_crc[EGGSFS_MAX_STRIPES];
+    bool touched; // whether somebody actually read this span (will determine where it ends up in the LRU)
    u8 stripes;
    u8 parity;
 };
@@ -73,11 +83,25 @@ struct eggsfs_block_span {
 struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset);
 
 // Makes the span available for reclamation.
-void eggsfs_span_put(struct eggsfs_span* span, bool was_read);
+void eggsfs_put_span(struct eggsfs_span* span, bool was_read);
 
-// The page_ix is the page number inside the span.
+// The page_ix is the page number inside the span. Can't be called with an
+// inline span.
 struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix);
 
+// Drop all the spans in a specific file. Note that this does not mean that
+// the spans will be immediately deallocated -- there might still be things
+// holding onto them (prefetching requests, mostly).
+//
+// This function _must_ be called before the inode is evicted! Otherwise we'll
+// have dangling references to the enode in the spans.
+void eggsfs_drop_file_spans(struct eggsfs_inode* enode);
+
+// Drops all cached spans that are not currently in use. Returns the number of
+// freed pages.
+u64 eggsfs_drop_all_spans(void);
+
+// Callbacks for metadata
 void eggsfs_file_spans_cb_span(
    void* data,
    u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size,
@@ -92,12 +116,6 @@ void eggsfs_file_spans_cb_block(
 );
 void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body);
 
-// To be used when we know that the spans are not being used anymore (i.e. on inode eviction)
-void eggsfs_drop_file_spans(struct eggsfs_inode* enode);
-
-// Drops all cached spans. Returns number of freed pages.
-u64 eggsfs_drop_all_spans(void);
-
 int __init eggsfs_span_init(void);
 void __cold eggsfs_span_exit(void);
diff --git a/kmod/super.c b/kmod/super.c
index e2c6df65..d4e72441 100644
--- a/kmod/super.c
+++ b/kmod/super.c
@@ -8,7 +8,7 @@
 #include "log.h"
 #include "inode.h"
 #include "metadata.h"
-#include "namei.h"
+#include "dentry.h"
 #include "net.h"
 #include "shuckle.h"
 #include "bincode.h"
diff --git a/kmod/sysfs.c b/kmod/sysfs.c
index 47280257..7e33632c 100644
--- a/kmod/sysfs.c
+++ b/kmod/sysfs.c
@@ -3,7 +3,7 @@
 #include
 #include
 
-#include "namei.h"
+#include "dentry.h"
 #include "span.h"
 
 static struct kset* eggsfs_kset;
diff --git a/kmod/trace.h b/kmod/trace.h
index efb8a874..c0788c87 100644
--- a/kmod/trace.h
+++ b/kmod/trace.h
@@ -167,7 +167,7 @@ EGGSFS_TRACE_EVENT_inode_ret(vfs_opendir_exit);
 EGGSFS_TRACE_EVENT_inode(vfs_closedir_enter);
 EGGSFS_TRACE_EVENT_inode_ret(vfs_closedir_exit);
 
-// namei.c
+// dentry.c
 EGGSFS_TRACE_EVENT_dir_dentry(vfs_lookup_enter);
 EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_lookup_exit);
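A few illustrative sketches of the mechanisms this patch reworks follow.

The stripe-fetch refactor keeps two counters on the fetch state: refcount (one reference for the caller plus one per block request actually started, dropped by block_done via put_fetch_stripe) and remaining (data blocks still outstanding; whichever completion brings it to zero stores the pages and up()s the semaphore the synchronous reader sleeps on). Below is a minimal userspace sketch of that accounting, using pthreads and POSIX semaphores instead of the kernel primitives; all names are illustrative, not taken from the module.

// Sketch of the completion accounting used by the stripe fetcher:
// one reference per outstanding request plus one for the caller;
// the last completion signals the waiter.
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct fetch_state {
    atomic_int refcount;   // caller + one per in-flight request
    atomic_int remaining;  // requests still outstanding
    sem_t done;            // posted by the last completion
};

static void put_state(struct fetch_state *st) {
    if (atomic_fetch_sub(&st->refcount, 1) == 1) {
        // last reference dropped: safe to free
        sem_destroy(&st->done);
        free(st);
    }
}

static void *request_done(void *arg) {
    struct fetch_state *st = arg;
    usleep(1000); // pretend to fetch a block
    if (atomic_fetch_sub(&st->remaining, 1) == 1) {
        sem_post(&st->done); // we were the last one: wake the waiter
    }
    put_state(st); // drop this request's reference
    return NULL;
}

int main(void) {
    enum { REQUESTS = 4 };
    struct fetch_state *st = malloc(sizeof(*st));
    atomic_init(&st->refcount, 1); // the caller
    atomic_init(&st->remaining, REQUESTS);
    sem_init(&st->done, 0, 0);

    pthread_t thr[REQUESTS];
    for (int i = 0; i < REQUESTS; i++) {
        atomic_fetch_add(&st->refcount, 1); // one reference per request
        pthread_create(&thr[i], NULL, request_done, st);
    }
    sem_wait(&st->done); // like sleeping on st->sema
    printf("all requests finished\n");
    put_state(st); // drop the caller's reference
    for (int i = 0; i < REQUESTS; i++) pthread_join(thr[i], NULL);
    return 0;
}

Splitting "all requests finished" (remaining) from "the state can be freed" (refcount) is what lets the prefetch path fire and forget: it drops its reference immediately and the last completion frees the state.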
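eggsfs_get_span_page() follows a double-checked pattern around the stripe latch: check the page cache, try to become the fetcher, and, if someone else already holds the latch, wait for them and re-check the cache. The sketch below models that flow with a mutex, a condition variable and a busy flag; note that in the patch the latch is released by the fetch completion path rather than by the caller, which this single-threaded model simplifies. Everything here is illustrative.

// Userspace model of the latch pattern in eggsfs_get_span_page():
//   1. check the cache
//   2. try to become the fetcher (acquire the latch)
//   3. if someone else is fetching, wait for them and re-check the cache
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct latch {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool busy;
};

static bool latch_try_acquire(struct latch *l) {
    pthread_mutex_lock(&l->lock);
    bool won = !l->busy;
    if (won) l->busy = true;
    pthread_mutex_unlock(&l->lock);
    return won;
}

static void latch_wait(struct latch *l) {
    pthread_mutex_lock(&l->lock);
    while (l->busy) pthread_cond_wait(&l->cond, &l->lock);
    pthread_mutex_unlock(&l->lock);
}

static void latch_release(struct latch *l) {
    pthread_mutex_lock(&l->lock);
    l->busy = false;
    pthread_cond_broadcast(&l->cond);
    pthread_mutex_unlock(&l->lock);
}

static int cached_page = -1; // -1 means "not fetched yet"

static int get_page(struct latch *l) {
again:
    if (cached_page >= 0) return cached_page; // fast path: already cached
    if (!latch_try_acquire(l)) {              // lost: wait for the fetcher
        latch_wait(l);
        goto again;                           // re-check the cache
    }
    if (cached_page >= 0) {                   // somebody got to it first
        latch_release(l);
        return cached_page;
    }
    cached_page = 42;                         // "fetch" the stripe
    latch_release(l);                         // done writing the cache
    return cached_page;
}

int main(void) {
    struct latch l = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .cond = PTHREAD_COND_INITIALIZER,
        .busy = false,
    };
    printf("page=%d\n", get_page(&l));
    printf("page=%d\n", get_page(&l)); // second call hits the cache
    return 0;
}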
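maybe_prefetch() keeps the prefetch window as two 32-bit offsets packed into a single u64 (begin in the low half, end in the high half) and only advances it through atomic64_cmpxchg, so concurrent readers cannot both claim the same prefetch. A small sketch of the packing and the compare-and-exchange step, in userspace C11 atomics with illustrative names:

// Sketch of the packed prefetch window: two u32 offsets in one atomic u64,
// advanced with compare_exchange so only one reader wins the right to
// kick off a given prefetch.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t pack_window(uint32_t begin, uint32_t end) {
    return (uint64_t)begin | ((uint64_t)end << 32);
}
static inline uint32_t window_begin(uint64_t w) { return (uint32_t)w; }
static inline uint32_t window_end(uint64_t w)   { return (uint32_t)(w >> 32); }

// Try to advance the window from the value we read to a new one; returns
// true if we won the race (and should start the prefetch ourselves).
static bool advance_window(_Atomic uint64_t *window, uint64_t seen,
                           uint32_t new_begin, uint32_t new_end) {
    uint64_t expected = seen;
    return atomic_compare_exchange_strong(window, &expected,
                                          pack_window(new_begin, new_end));
}

int main(void) {
    _Atomic uint64_t window = pack_window(0, 8);
    uint64_t seen = atomic_load(&window);
    printf("begin=%u end=%u\n", window_begin(seen), window_end(seen));
    if (advance_window(&window, seen, 8, 16)) {
        printf("won the CAS, would start prefetching [8,16)\n");
    }
    // A second attempt with the stale value loses, exactly like the
    // atomic64_cmpxchg check in maybe_prefetch().
    if (!advance_window(&window, seen, 16, 24)) {
        printf("lost the CAS, someone else already moved the window\n");
    }
    return 0;
}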
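Finally, the new refcount field in struct eggsfs_block_span replaces the old s64 readers, and its comment states that every 0 -> 1 and 0 -> -1 transition happens under the LRU lock so the counter change is paired with the list manipulation. A simplified userspace model of those transitions follows; a mutex plays the role of the per-LRU spinlock, and span_acquire/span_put/span_try_reclaim here are stand-ins, not the module's actual implementations.

// Userspace model of the refcount/LRU protocol described in span.h:
//   refcount == 0 -> unused, sitting on the LRU
//   refcount  > 0 -> in use, off the LRU
//   refcount  < 0 -> being reclaimed, off the LRU
// All 0 -> 1 and 0 -> -1 transitions take the LRU lock, so the counter
// change and the list manipulation happen together.
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct lru { pthread_mutex_t lock; /* list head elided */ };
struct span { struct lru *lru; int refcount; bool on_lru; };

// Take a reference; fails if the span is already being reclaimed.
static bool span_acquire(struct span *s) {
    bool ok = false;
    pthread_mutex_lock(&s->lru->lock);
    if (s->refcount >= 0) {
        if (s->refcount == 0 && s->on_lru) {
            s->on_lru = false; // list_del() in the real code
        }
        s->refcount++;
        ok = true;
    }
    pthread_mutex_unlock(&s->lru->lock);
    return ok;
}

// Drop a reference; the last one puts the span back on the LRU.
static void span_put(struct span *s) {
    pthread_mutex_lock(&s->lru->lock);
    if (--s->refcount == 0) {
        s->on_lru = true; // list_add_tail() in the real code
    }
    pthread_mutex_unlock(&s->lru->lock);
}

// Reclaimer: only unused spans (refcount == 0) can be claimed.
static bool span_try_reclaim(struct span *s) {
    bool ok = false;
    pthread_mutex_lock(&s->lru->lock);
    if (s->refcount == 0) {
        s->refcount = -1; // marks "being reclaimed"
        s->on_lru = false;
        ok = true;
    }
    pthread_mutex_unlock(&s->lru->lock);
    return ok;
}

int main(void) {
    struct lru l = { .lock = PTHREAD_MUTEX_INITIALIZER };
    struct span s = { .lru = &l, .refcount = 0, .on_lru = true };
    printf("acquire: %d\n", span_acquire(&s));              // 1
    printf("reclaim while in use: %d\n", span_try_reclaim(&s)); // 0
    span_put(&s);
    printf("reclaim when idle: %d\n", span_try_reclaim(&s));    // 1
    printf("acquire after reclaim: %d\n", span_acquire(&s));    // 0
    return 0;
}

Because every transition away from zero goes through the same lock that guards the list, a reclaimer that finds refcount == 0 knows the span is still on the LRU and cannot be acquired out from under it.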