kmod: remove stripe reads

This commit is contained in:
Miroslav Crnic
2025-03-24 14:02:14 +00:00
committed by GitHub Enterprise
parent 08e22661e1
commit 5e17740091
10 changed files with 81 additions and 1454 deletions

View File

@@ -864,7 +864,6 @@ func main() {
repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).")
binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.")
kmod := flag.Bool("kmod", false, "Whether to mount with the kernel module, rather than FUSE. Note that the tests will not attempt to run the kernel module and load it, they'll just mount with 'mount -t eggsfs'.")
dropCachedSpansEvery := flag.Duration("drop-cached-spans-every", 0, "If set, will repeatedly drop the cached spans using 'sysctl fs.eggsfs.drop_cached_spans=1'")
dropFetchBlockSocketsEvery := flag.Duration("drop-fetch-block-sockets-every", 0, "")
dirRefreshTime := flag.Duration("dir-refresh-time", 0, "If set, it will set the kmod /proc/sys/fs/eggsfs/dir_refresh_time_ms")
mtu := flag.Uint64("mtu", 0, "If set, we'll use the given MTU for big requests.")
@@ -1009,30 +1008,10 @@ func main() {
sysctl("fs.eggsfs.file_getattr_refresh_time_ms", "100")
}
// Start cached spans dropper
if *dropCachedSpansEvery != time.Duration(0) {
if !*kmod {
panic(fmt.Errorf("you provided -drop-cached-spans-every but you did't provide -kmod"))
}
fmt.Printf("will drop cached spans every %v\n", *dropCachedSpansEvery)
go func() {
for {
cmd := exec.Command("sudo", "/usr/sbin/sysctl", "fs.eggsfs.drop_cached_stripes=1")
cmd.Stdout = io.Discard
cmd.Stderr = io.Discard
if err := cmd.Run(); err != nil {
terminateChan <- err
return
}
time.Sleep(*dropCachedSpansEvery)
}
}()
}
// Start fetch block socket dropper
if *dropFetchBlockSocketsEvery != time.Duration(0) {
if !*kmod {
panic(fmt.Errorf("you provided -drop-cached-spans-every but you did't provide -kmod"))
panic(fmt.Errorf("you provided -drop-fetch-block-sockets-every but you did't provide -kmod"))
}
fmt.Printf("will drop fetch block sockets every %v\n", *dropFetchBlockSocketsEvery)
go func() {

View File

@@ -74,16 +74,15 @@ dmesg_pid=$!
# Trace metadata requests
ssh -p 2223 -i image-key fmazzol@localhost "sudo sh -c 'echo 1 > /sys/kernel/tracing/events/eggsfs/eggsfs_metadata_request/enable'"
ssh -p 2223 -i image-key fmazzol@localhost "sudo sh -c 'echo 1 > /sys/kernel/tracing/events/eggsfs/eggsfs_fetch_stripe/enable'"
ssh -p 2223 -i image-key fmazzol@localhost "sudo cat /sys/kernel/tracing/trace_pipe" > trace &
trace_pid=$!
# Do not test migrations/scrubbing since we test this outside qemu anyway
# (it's completely independent from the kmod code)
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'large file|cp|utime|seek|ftruncate' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTest.readWithMmap -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'rsync' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'large file|cp|utime|seek|ftruncate' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTest.readWithMmap -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'rsync' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out
echo 'Unmounting'
timeout -s KILL 300 ssh -p 2223 -i image-key fmazzol@localhost "grep eggsfs /proc/mounts | awk '{print \$2}' | xargs -r sudo umount"

View File

@@ -25,8 +25,6 @@ unsigned eggsfs_file_io_timeout_sec = 86400;
unsigned eggsfs_max_write_span_attempts = 5;
unsigned eggsfs_page_level_reads = 1;
static struct kmem_cache* eggsfs_transient_span_cachep;
struct eggsfs_transient_span {
@@ -984,8 +982,6 @@ static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
struct file* file = iocb->ki_filp;
struct inode* inode = file->f_inode;
struct eggsfs_inode* enode = EGGSFS_I(inode);
loff_t *ppos = &iocb->ki_pos;
ssize_t written = 0;
if (unlikely(iocb->ki_flags & IOCB_DIRECT)) { return -ENOSYS; }
@@ -993,157 +989,7 @@ static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
int err = eggsfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
if (err) { return err; }
if(eggsfs_page_level_reads == 1) {
struct eggsfs_span* span;
span = eggsfs_get_span(enode, *ppos);
if (span != NULL && span->storage_class != EGGSFS_INLINE_STORAGE) {
return generic_file_read_iter(iocb, to);
}
}
// Three-level loop:
// 1. Fetch spans
// 2. Fetch stripes
// 3. Fetch pages
eggsfs_debug("start of read loop, *ppos=%llu", *ppos);
int span_read_attempts = 0;
// Go through matching spans falling in the requested range
while (*ppos < inode->i_size && iov_iter_count(to)) {
struct eggsfs_span* span;
retry:
span = eggsfs_get_span(enode, *ppos);
if (IS_ERR(span)) {
written = PTR_ERR(span);
goto out_span;
}
if (span == NULL) { goto out_span; }
if (span->start%PAGE_SIZE != 0) {
eggsfs_warn("span start is not a multiple of page size %llu", span->start);
written = -EIO;
goto out_span;
}
eggsfs_debug("span start=%llu end=%llu", span->start, span->end);
u64 span_offset = *ppos - span->start;
u64 span_size = span->end - span->start;
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span);
size_t to_copy = span_size - span_offset;
eggsfs_debug("(inline) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
size_t copied =
copy_to_iter(inline_span->body + span_offset, inline_span->len - span_offset, to) +
iov_iter_zero(span_size - inline_span->len, to); // trailing zeros
if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_span; }
written += copied;
*ppos += copied;
} else {
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
if (block_span->cell_size%PAGE_SIZE != 0) {
eggsfs_warn("cell size not multiple of page size %u", block_span->cell_size);
written = -EIO;
goto out_span;
}
u32 stripe_size = eggsfs_stripe_size(block_span);
u32 num_stripe_pages = stripe_size/PAGE_SIZE;
struct eggsfs_stripe* stripe = NULL;
u64 span_data_end = min(
span->end,
span->start + (u64)stripe_size*block_span->num_stripes
);
eggsfs_debug("start: %llu, end: %llu, span_data_end: %llu", span->start, span->end, span_data_end);
bool stripe_read = false;
// Go through matching stripes in the span
while (*ppos < span_data_end && iov_iter_count(to)) {
if (stripe != NULL) { eggsfs_put_stripe(stripe, stripe_read); }
stripe_read = false;
stripe = eggsfs_get_stripe(block_span, span_offset);
BUG_ON(stripe == NULL); // We know we're within bounds
eggsfs_debug("got stripe %d at span_offset %llu", stripe->stripe_ix, span_offset);
if (IS_ERR(stripe)) {
written = PTR_ERR(stripe);
goto out_stripe;
}
u32 page_ix = (span_offset - stripe->stripe_ix*stripe_size)/PAGE_SIZE;
eggsfs_debug("page_ix %u of %u", page_ix, num_stripe_pages);
BUG_ON(page_ix >= num_stripe_pages);
// Go through matching blocks in the stripe
for (; page_ix < num_stripe_pages && (*ppos < span_data_end) && iov_iter_count(to); page_ix++) {
struct page* page = eggsfs_get_stripe_page(stripe, page_ix);
if (IS_ERR(page)) {
if (span_read_attempts == 0) {
// The idea behind this is that span structure changes extremely rarely: currently only
// when we "defrag" files into a new span structure (note that span boundary never changes).
// That's only needed when we change the block storage (e.g. HDD to FLASH) or when we tune
// the parity/block sizes/etc.
// A "proper" solution to this would probably be to store a revision number or mtime for the
// span itself. If this coarse solution ever becomes problematic we can change to that, which
// requires a fairly annoying schema change.
eggsfs_warn("reading stripe %lld in file %016lx failed with error %ld, retrying since it's the first attempt, and the span structure might have changed in the meantime", *ppos, enode->inode.i_ino, PTR_ERR(stripe));
eggsfs_put_stripe(stripe, stripe_read);
eggsfs_unlink_span(enode, span);
goto retry;
}
written = PTR_ERR(page);
goto out_stripe;
}
stripe_read = true;
size_t to_copy = min((u64)PAGE_SIZE - (*ppos % PAGE_SIZE), span_data_end - *ppos);
if (to_copy == 0) {
eggsfs_warn("got 0 bytes to copy at pos %llu, span_data_end %llu", *ppos, span_data_end);
BUG();
}
eggsfs_debug("(block) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
size_t copied;
// copy_page_to_iter below ends up calling copy_page_to_iter_pipe for spliced reads
// which takes a reference to our page using `get_page()` and also associates
// `page_cache_pipe_buf_ops()` to manage the page (https://elixir.bootlin.com/linux/v5.4.249/source/lib/iov_iter.c#L400).
// Later `splice_direct_to_actor()` trips over on https://elixir.bootlin.com/linux/v5.4.249/source/fs/splice.c#L977,
// goes through all the pipe buffers (which contain references to our pages) and drops
// references to the pages by calling https://elixir.bootlin.com/linux/v5.4.249/source/fs/splice.c#L92.
// As a result nfs spliced reads return zero pages.
if (unlikely(iov_iter_is_pipe(to))) {
char* from_ptr = kmap_atomic(page);
copied = copy_to_iter(from_ptr + *ppos % PAGE_SIZE, to_copy, to);
kunmap_atomic(from_ptr);
} else {
copied = copy_page_to_iter(page, *ppos % PAGE_SIZE, to_copy, to);
}
eggsfs_debug("copied=%lu, remaining %lu", copied, iov_iter_count(to));
if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_stripe; }
written += copied;
*ppos += copied;
span_offset += copied;
}
}
out_stripe:
if (stripe != NULL) { eggsfs_put_stripe(stripe, stripe_read); }
if (unlikely(written < 0)) { goto out_span; }
// trailing zeros
if (iov_iter_count(to)) {
size_t to_copy = span->end - span_data_end;
size_t copied = iov_iter_zero(to_copy, to);
eggsfs_debug("(block zeroes) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_span; }
written += copied;
*ppos += copied;
}
}
span_read_attempts = 0;
eggsfs_debug("before loop end, remaining %lu", iov_iter_count(to));
}
out_span:
eggsfs_debug("out of the loop, written=%ld", written);
if (unlikely(written < 0)) {
eggsfs_debug("reading failed, err=%ld", written);
}
return written;
return generic_file_read_iter(iocb, to);
}
void eggsfs_link_destructor(void* buf) {
@@ -1181,40 +1027,23 @@ char* eggsfs_read_link(struct eggsfs_inode* enode) {
} else {
struct page* page;
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
if (eggsfs_page_level_reads == 1) {
LIST_HEAD(pages);
page = alloc_page(GFP_KERNEL);
if(IS_ERR(page)) {
err = PTR_ERR(page);
goto out_err;
}
page->index = 0;
list_add_tail(&page->lru, &pages);
LIST_HEAD(extra_pages);
err = eggsfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages);
if (err) goto out_err;
list_del(&page->lru);
put_pages_list(&pages);
put_pages_list(&extra_pages);
char* page_buf = kmap(page);
memcpy(buf, page_buf, size);
kunmap(page);
} else {
struct eggsfs_stripe* stripe = eggsfs_get_stripe(block_span, 0);
BUG_ON(stripe == NULL); // We know we're within bounds
struct page* page = eggsfs_get_stripe_page(stripe, 0);
BUG_ON(page == NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
eggsfs_put_stripe(stripe, false);
goto out_err;
}
char* page_buf = kmap(page);
memcpy(buf, page_buf, size);
kunmap(page);
eggsfs_put_stripe(stripe, true);
LIST_HEAD(pages);
page = alloc_page(GFP_KERNEL);
if(IS_ERR(page)) {
err = PTR_ERR(page);
goto out_err;
}
page->index = 0;
list_add_tail(&page->lru, &pages);
LIST_HEAD(extra_pages);
err = eggsfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages);
if (err) goto out_err;
list_del(&page->lru);
put_pages_list(&pages);
put_pages_list(&extra_pages);
char* page_buf = kmap(page);
memcpy(buf, page_buf, size);
kunmap(page);
}
buf[size] = '\0';
@@ -1281,7 +1110,7 @@ out_err:
}
// Files are not visible until they're fully persisted, so fsync is unnecessary.
int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) {
static int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) {
return 0;
}
@@ -1297,7 +1126,7 @@ const struct file_operations eggsfs_file_operations = {
.fsync = file_fsync,
};
void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
static void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
struct page* page;
int err;
for(;;) {
@@ -1325,11 +1154,6 @@ void process_file_pages(struct address_space *mapping, struct list_head *pages,
static int file_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
int err = 0;
if (eggsfs_page_level_reads != 1) {
err = -ENOTSUPP;
goto out_err;
}
struct inode* inode = file_inode(filp);
struct eggsfs_inode* enode = EGGSFS_I(inode);
loff_t off = page_offset(lru_to_page(pages));
@@ -1354,9 +1178,23 @@ static int file_readpages(struct file *filp, struct address_space *mapping, stru
goto out_err;
}
LIST_HEAD(extra_pages);
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
err = -ENOTSUPP;
goto out_err;
struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span);
struct page *page = lru_to_page(pages);
if (page == NULL) {
err = -EIO;
goto out_err;
}
list_del(&page->lru);
// rest is past end of file, just drop it
put_pages_list(pages);
// return page back
list_add_tail(&page->lru, pages);
BUG_ON(page_offset(page) != span->start);
char* dst = kmap_atomic(page);
memcpy(dst, inline_span->body, inline_span->len);
kunmap_atomic(dst);
} else {
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
if (block_span->cell_size%PAGE_SIZE != 0) {
@@ -1364,17 +1202,15 @@ static int file_readpages(struct file *filp, struct address_space *mapping, stru
err = -EIO;
goto out_err;
}
LIST_HEAD(extra_pages);
err = eggsfs_span_get_pages(block_span, mapping, pages, nr_pages, &extra_pages);
if (err) {
eggsfs_warn("readahead of %d pages at off=%lld in file %016lx failed with error %d", nr_pages, off, enode->inode.i_ino, err);
put_pages_list(&extra_pages);
goto out_err;
}
process_file_pages(mapping, pages, nr_pages);
process_file_pages(mapping, &extra_pages, 0);
}
process_file_pages(mapping, pages, nr_pages);
process_file_pages(mapping, &extra_pages, 0);
return 0;
out_err:
@@ -1389,7 +1225,6 @@ static int file_readpage(struct file* filp, struct page* page) {
int err = 0;
struct eggsfs_span* span = NULL;
struct eggsfs_stripe* stripe = NULL;
int span_read_attempts = 0;
struct timespec64 start_ts = ns_to_timespec64(ktime_get_real_ns());
@@ -1431,55 +1266,34 @@ retry:
struct page* stripe_page = NULL;
if (eggsfs_page_level_reads == 1) {
LIST_HEAD(pages);
stripe_page = alloc_page(GFP_KERNEL);
if(!stripe_page) {
err = -ENOMEM;
goto out;
}
stripe_page->index = page->index;
list_add_tail(&stripe_page->lru, &pages);
LIST_HEAD(extra_pages);
err = eggsfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages);
process_file_pages(filp->f_mapping, &extra_pages, 0);
if (err) {
put_pages_list(&pages);
if (span_read_attempts == 0) {
// see comment in read_file_iter for rationale here
eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err);
eggsfs_unlink_span(enode, span);
span_read_attempts++;
goto retry;
}
struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns());
if ((end_ts.tv_sec - start_ts.tv_sec) < eggsfs_file_io_timeout_sec) {
goto retry;
}
eggsfs_warn("reading page at off=%lld in file %016lx failed with error %d", off, enode->inode.i_ino, err);
goto out;
}
list_del(&stripe_page->lru);
} else {
stripe = eggsfs_get_stripe(block_span, span_offset);
BUG_ON(stripe == NULL);
u64 stripe_offset = (span_offset - stripe->stripe_ix*eggsfs_stripe_size(block_span));
u32 page_ix = stripe_offset/PAGE_SIZE;
stripe_page = eggsfs_get_stripe_page(stripe, page_ix);
if (IS_ERR(stripe_page)) {
err = PTR_ERR(stripe_page);
if (span_read_attempts == 0) {
// see comment in read_file_iter for rationale here
eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err);
eggsfs_put_stripe(stripe, true);
eggsfs_unlink_span(enode, span);
span_read_attempts++;
goto retry;
}
eggsfs_warn("reading page %lld in file %016lx failed with error %d, retries exceeded, failing request.", off, enode->inode.i_ino, err);
goto out;
}
LIST_HEAD(pages);
stripe_page = alloc_page(GFP_KERNEL);
if(!stripe_page) {
err = -ENOMEM;
goto out;
}
stripe_page->index = page->index;
list_add_tail(&stripe_page->lru, &pages);
LIST_HEAD(extra_pages);
err = eggsfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages);
process_file_pages(filp->f_mapping, &extra_pages, 0);
if (err) {
put_pages_list(&pages);
if (span_read_attempts == 0) {
// see comment in read_file_iter for rationale here
eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err);
eggsfs_unlink_span(enode, span);
span_read_attempts++;
goto retry;
}
struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns());
if ((end_ts.tv_sec - start_ts.tv_sec) < eggsfs_file_io_timeout_sec) {
goto retry;
}
eggsfs_warn("reading page at off=%lld in file %016lx failed with error %d", off, enode->inode.i_ino, err);
goto out;
}
list_del(&stripe_page->lru);
char* to_ptr = kmap_atomic(page);
char* from_ptr = kmap_atomic(stripe_page);
@@ -1496,9 +1310,6 @@ out:
} else {
SetPageUptodate(page);
}
if (stripe) {
eggsfs_put_stripe(stripe, true);
}
unlock_page(page);
return err;

View File

@@ -7,8 +7,6 @@ extern unsigned eggsfs_atime_update_interval_sec;
extern unsigned eggsfs_max_write_span_attempts;
extern unsigned eggsfs_file_io_timeout_sec;
extern unsigned eggsfs_page_level_reads;
extern int eggsfs_file_getattr_refresh_time_jiffies; // this is only relevant for mtime/atime updates
ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from);

View File

@@ -78,8 +78,7 @@ fi
# Attach
tmux attach-session -t uovo:1
# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs
# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -short -binaries-dir $(pwd)/eggs
# sudo sh -c 'echo eggsfs_fetch_stripe >> /sys/kernel/debug/tracing/set_event'
# sudo sysctl fs.eggsfs.debug=1
# ./eggs/eggstests -kmod -filter 'mounted' -cfg fsTest.checkThreads=4 -cfg fsTest.numDirs=1 -cfg fsTest.numFiles=100 -short -binaries-dir $(pwd)/eggs

File diff suppressed because it is too large Load Diff

View File

@@ -7,22 +7,6 @@
#include "counter.h"
#include "block.h"
#include "rs.h"
EGGSFS_DECLARE_COUNTER(eggsfs_stat_cached_stripes);
EGGSFS_DECLARE_COUNTER(eggsfs_stat_dropped_stripes_refetched);
EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_cache_hit);
EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_cache_miss);
EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_page_cache_hit);
EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_page_cache_miss);
extern atomic64_t eggsfs_stat_cached_stripe_pages;
extern unsigned long eggsfs_stripe_cache_max_size_async;
extern unsigned long eggsfs_stripe_cache_min_avail_mem_async;
extern unsigned long eggsfs_stripe_cache_max_size_sync;
extern unsigned long eggsfs_stripe_cache_min_avail_mem_sync;
extern unsigned long eggsfs_stripe_cache_max_size_drop;
extern unsigned long eggsfs_stripe_cache_min_avail_mem_drop;
struct eggsfs_span {
// All the spans operations are independent from `struct eggsfs_inode`.
// So we need the inode here.
@@ -32,11 +16,6 @@ struct eggsfs_span {
// To be in the inode tree. Note that we might _not_ be in the
// tree once the inode releases us.
struct rb_node node;
// The inode and the stripes hold references to us. So the inode
// needs to unlink us and all the stripes need to be gone after
// this can be freed. Inline spans obviously only are referenced
// by inodes.
atomic_t refcount;
// Used to determine which type of span this is enclosed in.
u8 storage_class;
};
@@ -58,46 +37,15 @@ struct eggsfs_block {
u32 crc;
};
// One stripe of a block span, together with its pages and the bookkeeping
// (latch, LRU link, refcount) used to fill it in and later reclaim it.
struct eggsfs_stripe {
// Holds reference to it (we can always dereference it from
// the stripe).
struct eggsfs_block_span* span;
// Full list of contents if they have been fetched.
struct xarray pages;
// To synchronize filling in the pages for the first time.
struct eggsfs_latch latch;
// Link used to keep this stripe on an LRU list (see the refcount notes below).
struct list_head lru;
// When refcount = 0, this might still be alive, but in an LRU.
// When refcount < 0, we're currently reclaiming this stripe.
// Going from 0 to 1, and from 0 to -1 involves taking the respective
// LRU lock. This field is not atomic_t because _all changes to this
// value go through the LRU lock_, since they must be paired with
// manipulating the LRU list.
// NOTE(review): the paragraph above says the field is "not atomic_t", yet it
// is declared as atomic_t right below -- one of the two is stale; confirm which.
atomic_t refcount;
u8 stripe_ix; // Which stripe it is
bool touched; // Whether the stripe was touched before putting it back
};
// A span whose data is stored in blocks. It embeds the generic
// `struct eggsfs_span` as its `span` member, which is what the
// EGGSFS_BLOCK_SPAN macro's container_of() relies on.
struct eggsfs_block_span {
struct eggsfs_span span;
struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS];
// If NULL, the stripe is currently not initialized.
// The value has to be accessed with the lock of the containing stripe_lru.
// It is set without LRU lock in a single place when creating a new stripe.
// There should be no new code paths that manipulate it without locking
// the LRU first.
struct eggsfs_stripe* stripes[EGGSFS_MAX_STRIPES];
// NOTE(review): presumably records, per stripe, whether it had been cached
// before -- confirm against the code that sets it.
bool stripe_was_cached[EGGSFS_MAX_STRIPES];
// Multiplied by the data block count to obtain the stripe size
// (see eggsfs_stripe_size).
u32 cell_size;
// NOTE(review): presumably one CRC per stripe, indexed like `stripes` -- confirm.
u32 stripes_crc[EGGSFS_MAX_STRIPES];
u8 num_stripes;
// Passed to eggsfs_data_blocks() to obtain the number of data blocks.
u8 parity;
};
// Size of a single stripe of `span`: the span's cell size multiplied by the
// number of data blocks that eggsfs_data_blocks() derives from its parity.
static inline u32 eggsfs_stripe_size(struct eggsfs_block_span* span) {
    const u32 cell_size = span->cell_size;
    return cell_size * eggsfs_data_blocks(span->parity);
}
#define EGGSFS_BLOCK_SPAN(_span) ({ \
BUG_ON((_span)->storage_class == EGGSFS_EMPTY_STORAGE || (_span)->storage_class == EGGSFS_INLINE_STORAGE); \
container_of(_span, struct eggsfs_block_span, span); \
@@ -113,25 +61,11 @@ struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset);
// Remove the span from the inode tree, decrementing the refcount.
void eggsfs_unlink_span(struct eggsfs_inode* enode, struct eggsfs_span* span);
// Drop all the spans in a specific file. The span might linger around
// if the stripes are being read (since they have references to it too).
//
// Drop all the spans in a specific file.
// This function must be called before inode eviction, otherwise we'll
// leak spans and stripes.
// leak spans.
void eggsfs_unlink_spans(struct eggsfs_inode* enode);
// Gets a stripe within a span. offset is the offset inside the span.
// Once this returns, all the pages will be filled in already. The stripe
// needs to be put back with `eggsfs_put_stripe` when you're done with it.
struct eggsfs_stripe* eggsfs_get_stripe(struct eggsfs_block_span* span, u32 offset);
// Makes the stripe available for reclamation.
void eggsfs_put_stripe(struct eggsfs_stripe* stripe, bool was_read);
// The page_ix is the page number inside the stripe. The first call demanding
// a page will pull in all the pages for the stripe.
struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix);
// Fetches pages from block services to fill the supplied list of pages.
// @pages - original list of pages to be filled. The list is expected to
// contain pages with page->index values increasing by 1.
@@ -140,7 +74,7 @@ struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix);
// and removed from the list.
// @extra_pages - empty list to add fetched pages that fall outside of the
// requested range.
// Pages are taken out of the list, added to required blocks for fetching,
// Pages are taken out of the list, added to required blocks for fetching,
// filled with data fetched from block services and then assembled back
// maintaining the original order to match file offsets. Each supplied page
// has page->index prefilled as file_offset/PAGE_SIZE. Fetches are done in
@@ -158,16 +92,13 @@ struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix);
// even returning error aggressively is fine.
int eggsfs_span_get_pages(struct eggsfs_block_span* block_span, struct address_space* mapping, struct list_head *pages, unsigned nr_pages, struct list_head *extra_pages);
// Drops all cached stripes not being currently used. Returns number of
// freed pages.
u64 eggsfs_drop_all_stripes(void);
// Callbacks for metadata
void eggsfs_file_spans_cb_span(
void* data, u64 offset, u32 size, u32 crc,
u8 storage_class, u8 parity, u8 stripes, u32 cell_size,
const uint32_t* stripes_crcs
);
void eggsfs_file_spans_cb_block(
void* data, int block_ix,
// block service stuff

View File

@@ -30,11 +30,6 @@ int eggsfs_debug_output = 0;
} \
return 0;
// Backing storage for the `drop_cached_stripes` sysctl table entry.
static int drop_cached_stripes;
// proc handler registered for the `drop_cached_stripes` sysctl. The whole body
// comes from the eggsfs_do_sysctl macro, instantiated here with
// eggsfs_drop_all_stripes as the action to run.
static int eggsfs_drop_stripes_sysctl(struct ctl_table* table, int write, void __sysctl_buffer* buffer, size_t* len, loff_t* ppos) {
eggsfs_do_sysctl(eggsfs_drop_all_stripes);
}
static int drop_fetch_block_sockets;
static int eggsfs_drop_fetch_block_sockets_sysctl(struct ctl_table* table, int write, void __sysctl_buffer* buffer, size_t* len, loff_t* ppos) {
eggsfs_do_sysctl(eggsfs_drop_fetch_block_sockets);
@@ -107,14 +102,6 @@ static struct ctl_table eggsfs_cb_sysctls[] = {
.extra2 = &eggsfs_rs_cpu_level_max,
},
{
.procname = "drop_cached_stripes",
.data = &drop_cached_stripes,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = eggsfs_drop_stripes_sysctl,
},
{
.procname = "drop_fetch_block_sockets",
.data = &drop_fetch_block_sockets,
@@ -147,16 +134,8 @@ static struct ctl_table eggsfs_cb_sysctls[] = {
EGGSFS_CTL_UINT(max_write_span_attempts),
EGGSFS_CTL_UINT(atime_update_interval_sec),
EGGSFS_CTL_UINT(file_io_timeout_sec),
EGGSFS_CTL_UINT(page_level_reads),
EGGSFS_CTL_UINT(disable_ftruncate),
EGGSFS_CTL_ULONG(stripe_cache_max_size_async),
EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_async),
EGGSFS_CTL_ULONG(stripe_cache_max_size_sync),
EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_sync),
EGGSFS_CTL_ULONG(stripe_cache_max_size_drop),
EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_drop),
{
.procname = "mtu",
.data = &eggsfs_mtu,

View File

@@ -67,23 +67,9 @@ static umode_t eggsfs_stat_visible(struct kobject *kobj, struct attribute *attr,
#define EGGSFS_SYSFS_ATOMIC64_PTR(_name) (&eggsfs_sysfs_atomic64_##_name.kobj_attr.attr)
EGGSFS_SYSFS_COUNTER(dir_revalidations);
EGGSFS_SYSFS_COUNTER(stripe_cache_hit);
EGGSFS_SYSFS_COUNTER(stripe_cache_miss);
EGGSFS_SYSFS_COUNTER(stripe_page_cache_hit);
EGGSFS_SYSFS_COUNTER(stripe_page_cache_miss);
EGGSFS_SYSFS_COUNTER(cached_stripes);
EGGSFS_SYSFS_COUNTER(dropped_stripes_refetched);
EGGSFS_SYSFS_ATOMIC64(cached_stripe_pages);
static struct attribute* eggsfs_stat_attrs[] = {
EGGSFS_SYSFS_COUNTER_PTR(dir_revalidations),
EGGSFS_SYSFS_COUNTER_PTR(stripe_cache_hit),
EGGSFS_SYSFS_COUNTER_PTR(stripe_cache_miss),
EGGSFS_SYSFS_COUNTER_PTR(stripe_page_cache_hit),
EGGSFS_SYSFS_COUNTER_PTR(stripe_page_cache_miss),
EGGSFS_SYSFS_COUNTER_PTR(cached_stripes),
EGGSFS_SYSFS_COUNTER_PTR(dropped_stripes_refetched),
EGGSFS_SYSFS_ATOMIC64_PTR(cached_stripe_pages),
NULL
};

View File

@@ -436,45 +436,6 @@ TRACE_EVENT(eggsfs_span_add,
TP_printk("file_id=%016llx offset=%llu", __entry->file_id, __entry->offset)
);
// Values for the `event` argument of the eggsfs_drop_stripes tracepoint;
// they are printed as "start" and "end" respectively.
#define EGGSFS_DROP_STRIPES_START 0
#define EGGSFS_DROP_STRIPES_END 1
// Tracepoint for stripe dropping. Records the caller-supplied type string,
// the available memory, the cached page and stripe counts, the start/end
// event marker, how many stripes were examined, how many pages were dropped,
// and an error code.
TRACE_EVENT(eggsfs_drop_stripes,
TP_PROTO(const char* type, long mem_available, u64 cached_pages, u64 cached_stripes, u8 event, u64 examined_stripes, u64 dropped_pages, int err),
TP_ARGS( type, mem_available, cached_pages, cached_stripes, event, examined_stripes, dropped_pages, err),
TP_STRUCT__entry(
// Only the pointer is stored, the string itself is not copied.
// NOTE(review): presumably callers always pass string literals, so the
// pointer is still valid when the entry is printed -- confirm.
__field(const char*, type)
__field(long, mem_available)
__field(u64, cached_pages)
__field(u64, cached_stripes)
__field(u64, dropped_pages)
__field(u64, examined_stripes)
__field(int, err)
__field(u8, event)
),
TP_fast_assign(
__entry->type = type;
__entry->mem_available = mem_available;
__entry->cached_pages = cached_pages;
__entry->cached_stripes = cached_stripes;
__entry->dropped_pages = dropped_pages;
__entry->examined_stripes = examined_stripes;
__entry->err = err;
__entry->event = event;
),
TP_printk(
"type=%s mem_available=%ld cached_pages=%llu cached_stripes=%llu event=%s examined_stripes=%llu dropped_pages=%llu err=%d",
__entry->type, __entry->mem_available, __entry->cached_pages, __entry->cached_stripes,
// Translate the numeric event into its "start"/"end" label.
__print_symbolic(
__entry->event,
{ EGGSFS_DROP_STRIPES_START, "start" },
{ EGGSFS_DROP_STRIPES_END, "end" }
),
__entry->examined_stripes, __entry->dropped_pages, __entry->err
)
);
TRACE_EVENT(eggsfs_get_span_enter,
TP_PROTO(u64 file_id, u64 offset),
TP_ARGS( file_id, offset),
@@ -506,87 +467,6 @@ TRACE_EVENT(eggsfs_get_span_exit,
TP_printk("file_id=%016llx offset=%llu, err=%d", __entry->file_id, __entry->offset, __entry->err)
);
// Tracepoint paired with eggsfs_get_stripe_page_exit. Records the file id
// (printed as 16 hex digits), the stripe offset and the page offset.
// NOTE(review): presumably emitted on entry to eggsfs_get_stripe_page -- the
// call site is not visible here; confirm.
TRACE_EVENT(eggsfs_get_stripe_page_enter,
TP_PROTO(u64 file_id, u64 stripe_offset, u32 page_offset),
TP_ARGS( file_id, stripe_offset, page_offset),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, stripe_offset)
__field(u32, page_offset)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->stripe_offset = stripe_offset;
__entry->page_offset = page_offset;
),
TP_printk("file_id=%016llx stripe_offset=%llu page_offset=%u", __entry->file_id, __entry->stripe_offset, __entry->page_offset)
);
// Tracepoint paired with eggsfs_get_stripe_page_enter. Records the same
// file id, stripe offset and page offset, plus an error code.
// NOTE(review): presumably emitted when eggsfs_get_stripe_page returns, with
// err = 0 on success -- the call site is not visible here; confirm.
TRACE_EVENT(eggsfs_get_stripe_page_exit,
TP_PROTO(u64 file_id, u64 stripe_offset, u32 page_offset, int err),
TP_ARGS( file_id, stripe_offset, page_offset, err),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, stripe_offset)
__field(u32, page_offset)
__field(int, err)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->stripe_offset = stripe_offset;
__entry->page_offset = page_offset;
__entry->err = err;
),
TP_printk("file_id=%016llx stripe_offset=%llu page_offset=%u err=%d", __entry->file_id, __entry->stripe_offset, __entry->page_offset, __entry->err)
);
// Values for the `event` argument of the eggsfs_fetch_stripe tracepoint;
// printed as "start", "block_start", "block_done", "end" and "free".
#define EGGSFS_FETCH_STRIPE_START 0
#define EGGSFS_FETCH_STRIPE_BLOCK_START 1
#define EGGSFS_FETCH_STRIPE_BLOCK_DONE 2
#define EGGSFS_FETCH_STRIPE_END 3
#define EGGSFS_FETCH_STRIPE_FREE 4
// Tracepoint for the stages of fetching a stripe. Records the file id, the
// span offset, the stripe index, the parity byte, whether this is a prefetch,
// the stage (one of the EGGSFS_FETCH_STRIPE_* values above), the block the
// event refers to, and an error code.
TRACE_EVENT(eggsfs_fetch_stripe,
TP_PROTO(u64 file_id, u64 span_offset, u8 stripe, u8 parity, bool prefetching, u8 event, s8 block, int err),
TP_ARGS( file_id, span_offset, stripe, parity, prefetching, event, block, err),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, span_offset)
__field(int, err)
__field(s8, block) // negative = non-block event
__field(u8, stripe)
__field(bool, prefetching)
__field(u8, event)
__field(u8, parity)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->span_offset = span_offset;
__entry->stripe = stripe;
__entry->prefetching = prefetching;
__entry->event = event;
__entry->err = err;
__entry->block = block;
__entry->parity = parity;
),
TP_printk(
"file_id=%016llx span_offset=%llu stripe=%u D=%u P=%u prefetching=%d event=%s block=%d err=%d",
// The parity byte is split for printing: low nibble as D, high nibble as P.
// NOTE(review): presumably the data and parity block counts -- confirm.
__entry->file_id, __entry->span_offset, __entry->stripe, __entry->parity&0xF, __entry->parity>>4, (int)__entry->prefetching,
// Translate the numeric event into its stage label.
__print_symbolic(
__entry->event,
{ EGGSFS_FETCH_STRIPE_START, "start" },
{ EGGSFS_FETCH_STRIPE_BLOCK_START, "block_start" },
{ EGGSFS_FETCH_STRIPE_BLOCK_DONE, "block_done" },
{ EGGSFS_FETCH_STRIPE_END, "end" },
{ EGGSFS_FETCH_STRIPE_FREE, "free" }
),
__entry->block, __entry->err
)
);
#define EGGSFS_UPSERT_BLOCKSERVICE_MATCH 0
#define EGGSFS_UPSERT_BLOCKSERVICE_NOMATCH 1
#define EGGSFS_UPSERT_BLOCKSERVICE_NEW 2