diff --git a/go/eggstests/eggstests.go b/go/eggstests/eggstests.go index ce6bd89e..1e9d04cf 100644 --- a/go/eggstests/eggstests.go +++ b/go/eggstests/eggstests.go @@ -864,7 +864,6 @@ func main() { repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).") binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.") kmod := flag.Bool("kmod", false, "Whether to mount with the kernel module, rather than FUSE. Note that the tests will not attempt to run the kernel module and load it, they'll just mount with 'mount -t eggsfs'.") - dropCachedSpansEvery := flag.Duration("drop-cached-spans-every", 0, "If set, will repeatedly drop the cached spans using 'sysctl fs.eggsfs.drop_cached_spans=1'") dropFetchBlockSocketsEvery := flag.Duration("drop-fetch-block-sockets-every", 0, "") dirRefreshTime := flag.Duration("dir-refresh-time", 0, "If set, it will set the kmod /proc/sys/fs/eggsfs/dir_refresh_time_ms") mtu := flag.Uint64("mtu", 0, "If set, we'll use the given MTU for big requests.") @@ -1009,30 +1008,10 @@ func main() { sysctl("fs.eggsfs.file_getattr_refresh_time_ms", "100") } - // Start cached spans dropper - if *dropCachedSpansEvery != time.Duration(0) { - if !*kmod { - panic(fmt.Errorf("you provided -drop-cached-spans-every but you did't provide -kmod")) - } - fmt.Printf("will drop cached spans every %v\n", *dropCachedSpansEvery) - go func() { - for { - cmd := exec.Command("sudo", "/usr/sbin/sysctl", "fs.eggsfs.drop_cached_stripes=1") - cmd.Stdout = io.Discard - cmd.Stderr = io.Discard - if err := cmd.Run(); err != nil { - terminateChan <- err - return - } - time.Sleep(*dropCachedSpansEvery) - } - }() - } - // Start fetch block socket dropper if *dropFetchBlockSocketsEvery != time.Duration(0) { if !*kmod { - panic(fmt.Errorf("you provided -drop-cached-spans-every but you did't provide -kmod")) + panic(fmt.Errorf("you provided -drop-fetch-block-sockets-every but you did't provide -kmod")) } fmt.Printf("will drop fetch block sockets every %v\n", *dropFetchBlockSocketsEvery) go func() { diff --git a/kmod/ci.sh b/kmod/ci.sh index 199df0e9..dc900e60 100755 --- a/kmod/ci.sh +++ b/kmod/ci.sh @@ -74,16 +74,15 @@ dmesg_pid=$! # Trace metadata requests ssh -p 2223 -i image-key fmazzol@localhost "sudo sh -c 'echo 1 > /sys/kernel/tracing/events/eggsfs/eggsfs_metadata_request/enable'" -ssh -p 2223 -i image-key fmazzol@localhost "sudo sh -c 'echo 1 > /sys/kernel/tracing/events/eggsfs/eggsfs_fetch_stripe/enable'" ssh -p 2223 -i image-key fmazzol@localhost "sudo cat /sys/kernel/tracing/trace_pipe" > trace & trace_pid=$! # Do not test migrations/scrubbing since we test this outside qemu anyway # (it's completely independent from the kmod code) -ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'large file|cp|utime|seek|ftruncate' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out -ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out -ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTest.readWithMmap -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out -ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'rsync' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -drop-cached-spans-every 100ms -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out +ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'large file|cp|utime|seek|ftruncate' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out +ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out +ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'mounted' -block-service-killer -cfg fsTest.readWithMmap -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out +ssh -p 2223 -i image-key fmazzol@localhost "eggs/eggstests -verbose -kmsg -shuckle-beacon-port 55556 -kmod -filter 'rsync' -block-service-killer -cfg fsTests.dontMigrate -cfg fsTests.dontDefrag -cfg fsTest.corruptFileProb=0 -outgoing-packet-drop 0.02 $short $leader_only $preserve_ddir -binaries-dir eggs" | tee -a test-out echo 'Unmounting' timeout -s KILL 300 ssh -p 2223 -i image-key fmazzol@localhost "grep eggsfs /proc/mounts | awk '{print \$2}' | xargs -r sudo umount" diff --git a/kmod/file.c b/kmod/file.c index ca19164e..457efbe9 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -25,8 +25,6 @@ unsigned eggsfs_file_io_timeout_sec = 86400; unsigned eggsfs_max_write_span_attempts = 5; -unsigned eggsfs_page_level_reads = 1; - static struct kmem_cache* eggsfs_transient_span_cachep; struct eggsfs_transient_span { @@ -984,8 +982,6 @@ static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) { struct file* file = iocb->ki_filp; struct inode* inode = file->f_inode; struct eggsfs_inode* enode = EGGSFS_I(inode); - loff_t *ppos = &iocb->ki_pos; - ssize_t written = 0; if (unlikely(iocb->ki_flags & IOCB_DIRECT)) { return -ENOSYS; } @@ -993,157 +989,7 @@ static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) { int err = eggsfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT); if (err) { return err; } - if(eggsfs_page_level_reads == 1) { - struct eggsfs_span* span; - span = eggsfs_get_span(enode, *ppos); - if (span != NULL && span->storage_class != EGGSFS_INLINE_STORAGE) { - return generic_file_read_iter(iocb, to); - } - } - // Three-level loop: - // 1. Fetch spans - // 2. Fetch stripes - // 3. Fetch pages - eggsfs_debug("start of read loop, *ppos=%llu", *ppos); - int span_read_attempts = 0; - - // Go through matching spans falling in the requested range - while (*ppos < inode->i_size && iov_iter_count(to)) { - struct eggsfs_span* span; - retry: - span = eggsfs_get_span(enode, *ppos); - if (IS_ERR(span)) { - written = PTR_ERR(span); - goto out_span; - } - if (span == NULL) { goto out_span; } - if (span->start%PAGE_SIZE != 0) { - eggsfs_warn("span start is not a multiple of page size %llu", span->start); - written = -EIO; - goto out_span; - } - - eggsfs_debug("span start=%llu end=%llu", span->start, span->end); - - u64 span_offset = *ppos - span->start; - u64 span_size = span->end - span->start; - - if (span->storage_class == EGGSFS_INLINE_STORAGE) { - struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span); - size_t to_copy = span_size - span_offset; - eggsfs_debug("(inline) copying %lu, have %lu remaining", to_copy, iov_iter_count(to)); - size_t copied = - copy_to_iter(inline_span->body + span_offset, inline_span->len - span_offset, to) + - iov_iter_zero(span_size - inline_span->len, to); // trailing zeros - if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_span; } - written += copied; - *ppos += copied; - } else { - struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - if (block_span->cell_size%PAGE_SIZE != 0) { - eggsfs_warn("cell size not multiple of page size %u", block_span->cell_size); - written = -EIO; - goto out_span; - } - u32 stripe_size = eggsfs_stripe_size(block_span); - u32 num_stripe_pages = stripe_size/PAGE_SIZE; - struct eggsfs_stripe* stripe = NULL; - u64 span_data_end = min( - span->end, - span->start + (u64)stripe_size*block_span->num_stripes - ); - eggsfs_debug("start: %llu, end: %llu, span_data_end: %llu", span->start, span->end, span_data_end); - bool stripe_read = false; - - // Go through matching stripes in the span - while (*ppos < span_data_end && iov_iter_count(to)) { - if (stripe != NULL) { eggsfs_put_stripe(stripe, stripe_read); } - stripe_read = false; - stripe = eggsfs_get_stripe(block_span, span_offset); - BUG_ON(stripe == NULL); // We know we're within bounds - eggsfs_debug("got stripe %d at span_offset %llu", stripe->stripe_ix, span_offset); - if (IS_ERR(stripe)) { - written = PTR_ERR(stripe); - goto out_stripe; - } - - u32 page_ix = (span_offset - stripe->stripe_ix*stripe_size)/PAGE_SIZE; - eggsfs_debug("page_ix %u of %u", page_ix, num_stripe_pages); - BUG_ON(page_ix >= num_stripe_pages); - - // Go through matching blocks in the stripe - for (; page_ix < num_stripe_pages && (*ppos < span_data_end) && iov_iter_count(to); page_ix++) { - struct page* page = eggsfs_get_stripe_page(stripe, page_ix); - if (IS_ERR(page)) { - if (span_read_attempts == 0) { - // The idea behind this is that span structure changes extremely rarely: currently only - // when we "defrag" files into a new span structure (note that span boundary never changes). - // That's only needed when we change the block storage (e.g. HDD to FLASH) or when we tune - // the parity/block sizes/etc. - // A "proper" solution to this would probably to store a revision number or mtime for the - // span itself. If this coarse solution ever becomes problematic we can change to that, which - // requires a fairly annoying schema change. - eggsfs_warn("reading stripe %lld in file %016lx failed with error %ld, retrying since it's the first attempt, and the span structure might have changed in the meantime", *ppos, enode->inode.i_ino, PTR_ERR(stripe)); - eggsfs_put_stripe(stripe, stripe_read); - eggsfs_unlink_span(enode, span); - goto retry; - } - written = PTR_ERR(page); - goto out_stripe; - } - stripe_read = true; - - size_t to_copy = min((u64)PAGE_SIZE - (*ppos % PAGE_SIZE), span_data_end - *ppos); - if (to_copy == 0) { - eggsfs_warn("got 0 bytes to copy at pos %llu, span_data_end %llu", *ppos, span_data_end); - BUG(); - } - eggsfs_debug("(block) copying %lu, have %lu remaining", to_copy, iov_iter_count(to)); - size_t copied; - // copy_page_to_iter below ends up calling copy_page_to_iter_pipe for spliced reads - // which takes a reference to the our page using `get_page()` and also associates - // `page_cache_pipe_buf_ops()` to manage the page (https://elixir.bootlin.com/linux/v5.4.249/source/lib/iov_iter.c#L400). - // Later `splice_direct_to_actor()` trips over on https://elixir.bootlin.com/linux/v5.4.249/source/fs/splice.c#L977, - // goes through all the pipe buffers (which contain references to our pages) and drops - // references to the pages by calling https://elixir.bootlin.com/linux/v5.4.249/source/fs/splice.c#L92. - // As a result nfs spliced reads return zero pages. - if (unlikely(iov_iter_is_pipe(to))) { - char* from_ptr = kmap_atomic(page); - copied = copy_to_iter(from_ptr + *ppos % PAGE_SIZE, to_copy, to); - kunmap_atomic(from_ptr); - } else { - copied = copy_page_to_iter(page, *ppos % PAGE_SIZE, to_copy, to); - } - eggsfs_debug("copied=%lu, remaining %lu", copied, iov_iter_count(to)); - if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_stripe; } - written += copied; - *ppos += copied; - span_offset += copied; - } - } -out_stripe: - if (stripe != NULL) { eggsfs_put_stripe(stripe, stripe_read); } - if (unlikely(written < 0)) { goto out_span; } - - // trailing zeros - if (iov_iter_count(to)) { - size_t to_copy = span->end - span_data_end; - size_t copied = iov_iter_zero(to_copy, to); - eggsfs_debug("(block zeroes) copying %lu, have %lu remaining", to_copy, iov_iter_count(to)); - if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out_span; } - written += copied; - *ppos += copied; - } - } - span_read_attempts = 0; - eggsfs_debug("before loop end, remaining %lu", iov_iter_count(to)); - } -out_span: - eggsfs_debug("out of the loop, written=%ld", written); - if (unlikely(written < 0)) { - eggsfs_debug("reading failed, err=%ld", written); - } - return written; + return generic_file_read_iter(iocb, to); } void eggsfs_link_destructor(void* buf) { @@ -1181,40 +1027,23 @@ char* eggsfs_read_link(struct eggsfs_inode* enode) { } else { struct page* page; struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); - if (eggsfs_page_level_reads == 1) { - LIST_HEAD(pages); - page = alloc_page(GFP_KERNEL); - if(IS_ERR(page)) { - err = PTR_ERR(page); - goto out_err; - } - page->index = 0; - list_add_tail(&page->lru, &pages); - LIST_HEAD(extra_pages); - err = eggsfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages); - if (err) goto out_err; - list_del(&page->lru); - put_pages_list(&pages); - put_pages_list(&extra_pages); - char* page_buf = kmap(page); - memcpy(buf, page_buf, size); - kunmap(page); - } else { - struct eggsfs_stripe* stripe = eggsfs_get_stripe(block_span, 0); - BUG_ON(stripe == NULL); // We know we're within bounds - - struct page* page = eggsfs_get_stripe_page(stripe, 0); - BUG_ON(page == NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - eggsfs_put_stripe(stripe, false); - goto out_err; - } - char* page_buf = kmap(page); - memcpy(buf, page_buf, size); - kunmap(page); - eggsfs_put_stripe(stripe, true); + LIST_HEAD(pages); + page = alloc_page(GFP_KERNEL); + if(IS_ERR(page)) { + err = PTR_ERR(page); + goto out_err; } + page->index = 0; + list_add_tail(&page->lru, &pages); + LIST_HEAD(extra_pages); + err = eggsfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages); + if (err) goto out_err; + list_del(&page->lru); + put_pages_list(&pages); + put_pages_list(&extra_pages); + char* page_buf = kmap(page); + memcpy(buf, page_buf, size); + kunmap(page); } buf[size] = '\0'; @@ -1281,7 +1110,7 @@ out_err: } // Files are not visible until they're fully persisted, so fsync is unnecessary. -int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) { +static int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) { return 0; } @@ -1297,7 +1126,7 @@ const struct file_operations eggsfs_file_operations = { .fsync = file_fsync, }; -void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { +static void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct page* page; int err; for(;;) { @@ -1325,11 +1154,6 @@ void process_file_pages(struct address_space *mapping, struct list_head *pages, static int file_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { int err = 0; - if (eggsfs_page_level_reads != 1) { - err = -ENOTSUPP; - goto out_err; - } - struct inode* inode = file_inode(filp); struct eggsfs_inode* enode = EGGSFS_I(inode); loff_t off = page_offset(lru_to_page(pages)); @@ -1354,9 +1178,23 @@ static int file_readpages(struct file *filp, struct address_space *mapping, stru goto out_err; } + LIST_HEAD(extra_pages); if (span->storage_class == EGGSFS_INLINE_STORAGE) { - err = -ENOTSUPP; - goto out_err; + struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span); + struct page *page = lru_to_page(pages); + if (page == NULL) { + err = -EIO; + goto out_err; + } + list_del(&page->lru); + // rest is past end of file, just drop it + put_pages_list(pages); + // return page back + list_add_tail(&page->lru, pages); + BUG_ON(page_offset(page) != span->start); + char* dst = kmap_atomic(page); + memcpy(dst, inline_span->body, inline_span->len); + kunmap_atomic(dst); } else { struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span); if (block_span->cell_size%PAGE_SIZE != 0) { @@ -1364,17 +1202,15 @@ static int file_readpages(struct file *filp, struct address_space *mapping, stru err = -EIO; goto out_err; } - - LIST_HEAD(extra_pages); err = eggsfs_span_get_pages(block_span, mapping, pages, nr_pages, &extra_pages); if (err) { eggsfs_warn("readahead of %d pages at off=%lld in file %016lx failed with error %d", nr_pages, off, enode->inode.i_ino, err); put_pages_list(&extra_pages); goto out_err; } - process_file_pages(mapping, pages, nr_pages); - process_file_pages(mapping, &extra_pages, 0); } + process_file_pages(mapping, pages, nr_pages); + process_file_pages(mapping, &extra_pages, 0); return 0; out_err: @@ -1389,7 +1225,6 @@ static int file_readpage(struct file* filp, struct page* page) { int err = 0; struct eggsfs_span* span = NULL; - struct eggsfs_stripe* stripe = NULL; int span_read_attempts = 0; struct timespec64 start_ts = ns_to_timespec64(ktime_get_real_ns()); @@ -1431,55 +1266,34 @@ retry: struct page* stripe_page = NULL; - if (eggsfs_page_level_reads == 1) { - LIST_HEAD(pages); - stripe_page = alloc_page(GFP_KERNEL); - if(!stripe_page) { - err = -ENOMEM; - goto out; - } - stripe_page->index = page->index; - list_add_tail(&stripe_page->lru, &pages); - LIST_HEAD(extra_pages); - err = eggsfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages); - process_file_pages(filp->f_mapping, &extra_pages, 0); - if (err) { - put_pages_list(&pages); - if (span_read_attempts == 0) { - // see comment in read_file_iter for rationale here - eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err); - eggsfs_unlink_span(enode, span); - span_read_attempts++; - goto retry; - } - struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns()); - if ((end_ts.tv_sec - start_ts.tv_sec) < eggsfs_file_io_timeout_sec) { - goto retry; - } - eggsfs_warn("reading page at off=%lld in file %016lx failed with error %d", off, enode->inode.i_ino, err); - goto out; - } - list_del(&stripe_page->lru); - } else { - stripe = eggsfs_get_stripe(block_span, span_offset); - BUG_ON(stripe == NULL); - u64 stripe_offset = (span_offset - stripe->stripe_ix*eggsfs_stripe_size(block_span)); - u32 page_ix = stripe_offset/PAGE_SIZE; - stripe_page = eggsfs_get_stripe_page(stripe, page_ix); - if (IS_ERR(stripe_page)) { - err = PTR_ERR(stripe_page); - if (span_read_attempts == 0) { - // see comment in read_file_iter for rationale here - eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err); - eggsfs_put_stripe(stripe, true); - eggsfs_unlink_span(enode, span); - span_read_attempts++; - goto retry; - } - eggsfs_warn("reading page %lld in file %016lx failed with error %d, retries exceeded, failing request.", off, enode->inode.i_ino, err); - goto out; - } + LIST_HEAD(pages); + stripe_page = alloc_page(GFP_KERNEL); + if(!stripe_page) { + err = -ENOMEM; + goto out; } + stripe_page->index = page->index; + list_add_tail(&stripe_page->lru, &pages); + LIST_HEAD(extra_pages); + err = eggsfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages); + process_file_pages(filp->f_mapping, &extra_pages, 0); + if (err) { + put_pages_list(&pages); + if (span_read_attempts == 0) { + // see comment in read_file_iter for rationale here + eggsfs_warn("reading page %lld in file %016lx failed with error %d, retrying since it's the first attempt, and the span structure might have changed in the meantime", off, enode->inode.i_ino, err); + eggsfs_unlink_span(enode, span); + span_read_attempts++; + goto retry; + } + struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns()); + if ((end_ts.tv_sec - start_ts.tv_sec) < eggsfs_file_io_timeout_sec) { + goto retry; + } + eggsfs_warn("reading page at off=%lld in file %016lx failed with error %d", off, enode->inode.i_ino, err); + goto out; + } + list_del(&stripe_page->lru); char* to_ptr = kmap_atomic(page); char* from_ptr = kmap_atomic(stripe_page); @@ -1496,9 +1310,6 @@ out: } else { SetPageUptodate(page); } - if (stripe) { - eggsfs_put_stripe(stripe, true); - } unlock_page(page); return err; diff --git a/kmod/file.h b/kmod/file.h index 4f4d840d..4bc854f1 100644 --- a/kmod/file.h +++ b/kmod/file.h @@ -7,8 +7,6 @@ extern unsigned eggsfs_atime_update_interval_sec; extern unsigned eggsfs_max_write_span_attempts; extern unsigned eggsfs_file_io_timeout_sec; -extern unsigned eggsfs_page_level_reads; - extern int eggsfs_file_getattr_refresh_time_jiffies; // this is only relevant for mtime/atime updates ssize_t eggsfs_file_write(struct eggsfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from); diff --git a/kmod/restartsession.sh b/kmod/restartsession.sh index db2b2084..f2334d8d 100755 --- a/kmod/restartsession.sh +++ b/kmod/restartsession.sh @@ -78,8 +78,7 @@ fi # Attach tmux attach-session -t uovo:1 -# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -drop-cached-spans-every 100ms -short -binaries-dir $(pwd)/eggs +# ./eggs/eggstests -verbose -kmod -filter 'mounted|rsync|large' -short -binaries-dir $(pwd)/eggs -# sudo sh -c 'echo eggsfs_fetch_stripe >> /sys/kernel/debug/tracing/set_event' # sudo sysctl fs.eggsfs.debug=1 # ./eggs/eggstests -kmod -filter 'mounted' -cfg fsTest.checkThreads=4 -cfg fsTest.numDirs=1 -cfg fsTest.numFiles=100 -short -binaries-dir $(pwd)/eggs diff --git a/kmod/span.c b/kmod/span.c index 2c03f93c..fed9ea55 100644 --- a/kmod/span.c +++ b/kmod/span.c @@ -20,401 +20,10 @@ #include "sysctl.h" #include "block_services.h" -EGGSFS_DEFINE_COUNTER(eggsfs_stat_cached_stripes); -EGGSFS_DEFINE_COUNTER(eggsfs_stat_dropped_stripes_refetched); -EGGSFS_DEFINE_COUNTER(eggsfs_stat_stripe_cache_hit); -EGGSFS_DEFINE_COUNTER(eggsfs_stat_stripe_cache_miss); -EGGSFS_DEFINE_COUNTER(eggsfs_stat_stripe_page_cache_hit); -EGGSFS_DEFINE_COUNTER(eggsfs_stat_stripe_page_cache_miss); - -// This currently also includes in-flight pages (i.e. pages that cannot be -// reclaimed), just because the code is a bit simpler this way. -atomic64_t eggsfs_stat_cached_stripe_pages = ATOMIC64_INIT(0); - -// These numbers do not mean anything in particular. We do want to avoid -// flickering sync drops though, therefore we go down to 25GiB from -// 50GiB, and similarly for free memory. -unsigned long eggsfs_stripe_cache_max_size_async = (50ull << 30); // 50GiB -unsigned long eggsfs_stripe_cache_min_avail_mem_async = (2ull << 30); // 2GiB -unsigned long eggsfs_stripe_cache_max_size_drop = (45ull << 30); // 45GiB -unsigned long eggsfs_stripe_cache_min_avail_mem_drop = (2ull << 30) + (500ull << 20); // 2GiB + 500MiB -unsigned long eggsfs_stripe_cache_max_size_sync = (100ull << 30); // 100GiB -unsigned long eggsfs_stripe_cache_min_avail_mem_sync = (1ull << 30); // 1GiB - static struct kmem_cache* eggsfs_block_span_cachep; static struct kmem_cache* eggsfs_inline_span_cachep; -static struct kmem_cache* eggsfs_stripe_cachep; -static struct kmem_cache* eggsfs_fetch_stripe_cachep; static struct kmem_cache* eggsfs_fetch_span_pages_cachep; -// -// STRIPE PART -// -struct eggsfs_stripe_lru { - spinlock_t lock; - struct list_head lru; -} ____cacheline_aligned; - -// These are global locks, but the sharding should alleviate contention. -#define EGGSFS_STRIPE_BITS 8 -#define EGGSFS_STRIPE_LRUS (1<ino, EGGSFS_STRIPE_BITS); - return &stripe_lrus[h%EGGSFS_STRIPE_LRUS]; -} - -static struct eggsfs_stripe* read_and_acquire_stripe(struct eggsfs_block_span* span, u8 stripe_ix) { - struct eggsfs_stripe_lru* lru = get_stripe_lru(&span->span); - bool lru_del = false; // for debug output - spin_lock(&lru->lock); - struct eggsfs_stripe *stripe = (struct eggsfs_stripe*)atomic64_read_acquire((atomic64_t*)&span->stripes[stripe_ix]); - if (stripe == NULL) { goto unlock; } - - if (unlikely(atomic_read(&stripe->refcount) < 0)) { - // if the reclaimer locked first, we could only get stripe == NULL. - BUG(); - } - if (atomic_inc_return(&stripe->refcount) == 1) { - // we're the first ones here, take it out of the LRU - list_del(&stripe->lru); - lru_del = true; - // We only register a cache hit when the stripe is taken from lru. - // All other cases would be concurrent access and such stripe would not - // be considered to be dropped. - eggsfs_counter_inc(eggsfs_stat_stripe_cache_hit); - } -unlock: - spin_unlock(&lru->lock); - if (stripe) { - eggsfs_debug("acquired stripe %d of inode %llu, lru_del:%d, addr %p", stripe->stripe_ix, stripe->span->span.ino, lru_del, stripe); - } - return stripe; -} - -void eggsfs_put_stripe(struct eggsfs_stripe* stripe, bool was_read) { - struct eggsfs_stripe_lru* lru = get_stripe_lru(&stripe->span->span); - bool lru_add = false; // for debug output - spin_lock(&lru->lock); - BUG_ON(atomic_read(&stripe->refcount) < 1); - stripe->touched = stripe->touched || was_read; - if (atomic_dec_return(&stripe->refcount) == 0) { // we need to put it back into the LRU - lru_add = true; - if (stripe->touched) { - list_add_tail(&stripe->lru, &lru->lru); - } else { - list_add(&stripe->lru, &lru->lru); - } - } - spin_unlock(&lru->lock); - eggsfs_debug("done for stripe %d of inode %llu, lru_add=%d, addr %p", stripe->stripe_ix, stripe->span->span.ino, lru_add, stripe); -} - -struct eggsfs_stripe* eggsfs_get_stripe(struct eggsfs_block_span* span, u32 offset) { - u32 stripe_size = eggsfs_stripe_size(span); - u32 stripe_ix = offset / stripe_size; - if (unlikely(stripe_ix >= span->num_stripes)) { return NULL; } - - struct eggsfs_stripe* stripe; - u64 iterations = 0; - eggsfs_debug("ino:%llu, stripe_size:%u, offset: %u, stripe_ix:%u", span->span.ino, stripe_size, offset, stripe_ix); -retry: - iterations++; - // This would indicate some serious contention where the stripe is being - // added/reclaimed by many different tasks at the same time. - if (unlikely(iterations == 10)) { - eggsfs_warn("we've been fetching the same stripe for %llu iterations, we're probably stuck", iterations); - } - - stripe = read_and_acquire_stripe(span, stripe_ix); - if (stripe != NULL) { return stripe; } - - // Not found or was errored, first allocate it and bootstrap it - { - struct eggsfs_stripe* new_stripe = kmem_cache_alloc(eggsfs_stripe_cachep, GFP_KERNEL); - if (new_stripe == NULL) { - eggsfs_warn("failed allocating stripe: %ld", PTR_ERR(stripe)); - return ERR_PTR(-ENOMEM); - } - xa_init(&new_stripe->pages); - eggsfs_latch_init(&new_stripe->latch); - atomic_set(&new_stripe->refcount, 1); // the caller - new_stripe->stripe_ix = stripe_ix; - new_stripe->touched = false; - new_stripe->span = span; - eggsfs_debug("created stripe at index %u for %llu, addr %p", new_stripe->stripe_ix, span->span.ino, new_stripe); - atomic64_inc((atomic64_t*)&span->span.refcount); - // This is a single place where stripe is set without holding an LRU lock, - // but it is not in any LRU at this point and the race can only happen with - // this same code path, which is guarded by `atomic64_cmpxchg_release`. - if ((struct eggsfs_stripe*)atomic64_cmpxchg_release((atomic64_t*)&span->stripes[stripe_ix], (s64)0, (s64)new_stripe) != NULL) { - // Somebody got there first, free up and retry - eggsfs_debug("failed adding stripe %p", new_stripe); - kmem_cache_free(eggsfs_stripe_cachep, new_stripe); - atomic64_dec((atomic64_t*)&span->span.refcount); - goto retry; - } - if (span->stripe_was_cached[stripe_ix]) { - eggsfs_counter_inc(eggsfs_stat_dropped_stripes_refetched); - } else { - // We know it will eventually come back to lru. - span->stripe_was_cached[stripe_ix] = 1; - } - eggsfs_counter_inc(eggsfs_stat_cached_stripes); - eggsfs_counter_inc(eggsfs_stat_stripe_cache_miss); - return new_stripe; - } -} - -// -// STRIPE RECLAIM PART -// - -// Implementation below with the rest of the span code. -static void put_span(struct eggsfs_span* span); - -static void free_stripe(struct eggsfs_stripe* stripe, u64* dropped_pages) { - u64 dropped = 0; - eggsfs_counter_dec(eggsfs_stat_cached_stripes); - struct page* page; - unsigned long page_ix; - xa_for_each(&stripe->pages, page_ix, page) { - put_page(page); - dropped++; - BUG_ON(xa_erase(&stripe->pages, page_ix) == NULL); - } - BUG_ON(atomic64_read(&stripe->latch.counter) & 1); // somebody still hanging onto this - - // decresing the refcount may happen much later after setting the stripe - // to NULL, but it is ok because if, in the mean time new stripe was added, - // the refcount would be incremented and this will balance it. - put_span(&stripe->span->span); - eggsfs_debug("freeing stripe %p", stripe); - kmem_cache_free(eggsfs_stripe_cachep, stripe); - if (dropped_pages != NULL) { - (*dropped_pages) += dropped; - } - atomic64_sub(dropped, &eggsfs_stat_cached_stripe_pages); -} - -// Needs to be called with the lru lock that is holding the stripe. -static void stripe_validate_and_remove_from_lru(struct eggsfs_stripe *candidate, struct eggsfs_stripe_lru *lru) { - BUG_ON(!spin_is_locked(&lru->lock)); - struct eggsfs_stripe *old_stripe = (struct eggsfs_stripe*)atomic64_cmpxchg_release((atomic64_t*)&candidate->span->stripes[candidate->stripe_ix], (s64)candidate, (s64)0); - // If we could read the item from the lru, nothing should be able to replace it. - BUG_ON(candidate != old_stripe); - // stripe obtained while holding the LRU lock, it must be readers == 0 and goes to -1 after our dec - if (atomic_dec_return(&candidate->refcount) == -1) { - list_del(&candidate->lru); - } else { - eggsfs_warn("refcount %d for ino %llu span %llu:%llu stripe %d", atomic_read(&candidate->refcount), candidate->span->span.ino, candidate->span->span.start, candidate->span->span.end, candidate->stripe_ix); - BUG(); - } -} - -// If it returns -1, there are no stripes to drop. Otherwise, returns the -// next index to pass in to reclaim the next stripe. -static int drop_one_stripe(int lru_ix, u64* examined_stripes, u64* dropped_pages) { - BUG_ON(lru_ix < 0 || lru_ix >= EGGSFS_STRIPE_LRUS); - - int i; - for (i = lru_ix; i < lru_ix + EGGSFS_STRIPE_LRUS; i++) { - struct eggsfs_stripe_lru* lru = &stripe_lrus[i%EGGSFS_STRIPE_LRUS]; - struct eggsfs_stripe* candidate = NULL; - - // Pick a candidate from the LRU, mark it as reclaiming, take it out of - // the LRU. - spin_lock(&lru->lock); - candidate = list_first_entry_or_null(&lru->lru, struct eggsfs_stripe, lru); - if (likely(candidate != NULL)) { - (*examined_stripes)++; - stripe_validate_and_remove_from_lru(candidate, lru); - } - spin_unlock(&lru->lock); - if (candidate) { - // At this point we're free to do what we please with the stripe - // (it's fully private to us). - free_stripe(candidate, dropped_pages); - - i++; - break; - } - } - - // we've fully gone around without finding anything - if (i == lru_ix + EGGSFS_STRIPE_LRUS) { return -1; } - - // We've found something - return i%EGGSFS_STRIPE_LRUS; -} - -static inline void drop_stripes_start(const char* type) { - trace_eggsfs_drop_stripes( - type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_stripe_pages), eggsfs_counter_get(&eggsfs_stat_cached_stripes), EGGSFS_DROP_STRIPES_START, 0, 0, 0 - ); -} - -static inline void drop_stripes_end(const char* type, u64 examined_stripes, u64 dropped_pages, int err) { - trace_eggsfs_drop_stripes( - type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_stripe_pages), eggsfs_counter_get(&eggsfs_stat_cached_stripes), EGGSFS_DROP_STRIPES_END, examined_stripes, dropped_pages, err - ); -} - -// returns the number of dropped pages -u64 eggsfs_drop_all_stripes(void) { - drop_stripes_start("all"); - u64 dropped_pages = 0; - u64 dropped_pages_last = 0; - u64 examined_stripes = 0; - s64 stripes_begin = eggsfs_counter_get(&eggsfs_stat_cached_stripes); - int lru_ix = 0; - int last_ix = 0; - int total_iterations = 0; - for (;;) { - dropped_pages_last = 0; - last_ix = drop_one_stripe(lru_ix, &examined_stripes, &dropped_pages_last); - if (last_ix < 0) { - // We have iterated through all lrus without freeing anything - if (dropped_pages_last == 0) { break; } - // We will continue again in the same lru. - } else { - lru_ix = last_ix; - } - dropped_pages += dropped_pages_last; - total_iterations++; - } - s64 stripes_end = eggsfs_counter_get(&eggsfs_stat_cached_stripes); - if (dropped_pages != 0 || total_iterations != 0) { - eggsfs_info("reclaimed %llu pages, %lld stripes (approx), iterations:%d", dropped_pages, stripes_begin-stripes_end, total_iterations); - } - drop_stripes_end("all", examined_stripes, dropped_pages, 0); - return dropped_pages; -} - -static DEFINE_MUTEX(drop_stripes_mu); - -// Return the number of dropped pages. -static s64 drop_stripes(const char* type) { - drop_stripes_start(type); - - mutex_lock(&drop_stripes_mu); - - u64 dropped_pages = 0; - u64 examined_stripes = 0; - int lru_ix = 0; - for (;;) { - u64 pages = atomic64_read(&eggsfs_stat_cached_stripe_pages); - if ( - pages*PAGE_SIZE < eggsfs_stripe_cache_max_size_drop && - si_mem_available()*PAGE_SIZE > eggsfs_stripe_cache_min_avail_mem_drop - ) { - break; - } - lru_ix = drop_one_stripe(lru_ix, &examined_stripes, &dropped_pages); - if (lru_ix < 0) { break; } - } - eggsfs_debug("dropped %llu pages, %llu stripes examined", dropped_pages, examined_stripes); - - mutex_unlock(&drop_stripes_mu); - drop_stripes_end(type, examined_stripes, dropped_pages, 0); - - return dropped_pages; -} - -static void reclaim_stripes_async(struct work_struct* work) { - if (!mutex_is_locked(&drop_stripes_mu)) { // somebody's already taking care of it - drop_stripes("async"); - } -} - -static DECLARE_WORK(reclaim_stripes_work, reclaim_stripes_async); - -static unsigned long eggsfs_stripe_shrinker_count(struct shrinker* shrinker, struct shrink_control* sc) { - u64 pages = atomic64_read(&eggsfs_stat_cached_stripe_pages); - if (pages == 0) { return SHRINK_EMPTY; } - return pages; -} - -static int eggsfs_stripe_shrinker_lru_ix = 0; - -static unsigned long eggsfs_stripe_shrinker_scan(struct shrinker* shrinker, struct shrink_control* sc) { - drop_stripes_start("shrinker"); - - u64 dropped_pages = 0; - u64 examined_stripes = 0; - int lru_ix = eggsfs_stripe_shrinker_lru_ix; - for (dropped_pages = 0; dropped_pages < sc->nr_to_scan;) { - lru_ix = drop_one_stripe(lru_ix, &examined_stripes, &dropped_pages); - if (lru_ix < 0) { break; } - } - eggsfs_stripe_shrinker_lru_ix = lru_ix >= 0 ? lru_ix : 0; - drop_stripes_end("shrinker", examined_stripes, dropped_pages, 0); - - return dropped_pages; -} - -static struct shrinker eggsfs_stripe_shrinker = { - .count_objects = eggsfs_stripe_shrinker_count, - .scan_objects = eggsfs_stripe_shrinker_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NONSLAB, // what's this? -}; - -// Returns whether we're low on memory -static bool reclaim_after_fetch_stripe(void) { - // Reclaim pages if we went over the limit - u64 pages = atomic64_read(&eggsfs_stat_cached_stripe_pages); - u64 free_pages = si_mem_available(); - bool low_on_memory = false; - if (pages*PAGE_SIZE > eggsfs_stripe_cache_max_size_sync || free_pages*PAGE_SIZE < eggsfs_stripe_cache_min_avail_mem_sync) { - low_on_memory = true; - // sync dropping, apply backpressure - drop_stripes("sync"); - } else if (pages*PAGE_SIZE > eggsfs_stripe_cache_max_size_async || free_pages*PAGE_SIZE < eggsfs_stripe_cache_min_avail_mem_async) { - low_on_memory = true; - // TODO Is it a good idea to do it on system_long_wq rather than eggsfs_wq? The freeing - // job might face heavy contention, so maybe yes? - // On the other end, this might make it harder to know when it's safe to unload - // the module by making sure that there's no work left on the queue. - queue_work(system_long_wq, &reclaim_stripes_work); - } - return low_on_memory; -} - -// -// FETCH STRIPE PART -// - -struct fetch_stripe_state { - struct eggsfs_stripe* stripe; - // Used to wait on every block being finished. Could be done faster with a wait_queue, but don't want to - // worry about smb subtleties. - struct semaphore sema; - // Used to release the stripe when we're done with it - s64 stripe_seqno; - // These will be filled in by the fetching (only D of them well be - // filled by fetching the blocks, the others might be filled in by the RS - // recovery) - struct list_head blocks_pages[EGGSFS_MAX_BLOCKS]; - atomic_t refcount; // to garbage collect - atomic_t err; // the result of the stripe fetching - atomic_t last_block_err; // to return something vaguely connected to what happened - static_assert(EGGSFS_MAX_BLOCKS <= 16); - // 00-16: blocks which are downloading. - // 16-32: blocks which have succeeded. - // 32-48: blocks which have failed. - atomic64_t blocks; - u8 prefetching; // used for logging -}; - #define DOWNLOADING_SHIFT 0 #define DOWNLOADING_MASK (0xFFFFull<>DOWNLOADING_SHIFT) & 0xFFFFull) @@ -433,478 +42,10 @@ struct fetch_stripe_state { #define FAILED_SET(__b, __i) (__b | 1ull<<(__i+FAILED_SHIFT)) #define FAILED_UNSET(__b, __i) (__b & ~(1ull<<(__i+FAILED_SHIFT))) -static void init_fetch_stripe(void* p) { - struct fetch_stripe_state* st = (struct fetch_stripe_state*)p; - - sema_init(&st->sema, 0); - int i; - for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) { - INIT_LIST_HEAD(&st->blocks_pages[i]); - } +inline u32 eggsfs_stripe_size(struct eggsfs_block_span* span) { + return span->cell_size * eggsfs_data_blocks(span->parity); } -// This must be called while already holding the span, which means -// that span_acquire will always succeed. -static struct fetch_stripe_state* new_fetch_stripe( - struct eggsfs_stripe* stripe, - u64 stripe_seqno, - bool prefetching -) { - struct fetch_stripe_state* st = (struct fetch_stripe_state*)kmem_cache_alloc(eggsfs_fetch_stripe_cachep, GFP_KERNEL); - if (!st) { return st; } - - st->stripe = stripe; - atomic_set(&st->refcount, 1); // the caller - atomic_set(&st->err, 0); - atomic_set(&st->last_block_err, 0); - st->stripe = stripe; - st->prefetching = prefetching; - atomic64_set(&st->blocks, 0); - st->stripe_seqno = stripe_seqno; - - return st; -} - -static void fetch_stripe_trace(struct fetch_stripe_state* st, u8 event, s8 block, int err) { - trace_eggsfs_fetch_stripe( - st->stripe->span->span.ino, - st->stripe->span->span.start, - st->stripe->stripe_ix, - st->stripe->span->parity, - st->prefetching, - event, - block, - err - ); -} - -static void free_fetch_stripe(struct fetch_stripe_state* st) { - // we're the last ones here - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_FREE, -1, atomic_read(&st->err)); - - // managing the stripe latch happens in eggsfs_get_stripe_page() - - while (down_trylock(&st->sema) == 0) {} // reset sema to zero for next usage - - // Free leftover blocks (also ensures the lists are all nice and empty for the - // next user) - int i; - for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) { - put_pages_list(&st->blocks_pages[i]); - } - kmem_cache_free(eggsfs_fetch_stripe_cachep, st); -} - -static void put_fetch_stripe(struct fetch_stripe_state* st) { - int remaining = atomic_dec_return(&st->refcount); - eggsfs_debug("st=%p remaining=%d", st, remaining); - BUG_ON(remaining < 0); - if (remaining > 0) { return; } - free_fetch_stripe(st); -} - -static void hold_fetch_stripe(struct fetch_stripe_state* st) { - int holding = atomic_inc_return(&st->refcount); - eggsfs_debug("st=%p holding=%d", st, holding) - WARN_ON(holding < 2); -} - -static void store_pages(struct fetch_stripe_state* st) { - struct eggsfs_block_span* span = st->stripe->span; - int D = eggsfs_data_blocks(span->parity); - - int i, j; - - u32 pages_per_block = span->cell_size/PAGE_SIZE; - - u16 blocks = SUCCEEDED_GET(atomic64_read(&st->blocks)); - BUG_ON(__builtin_popcountll(blocks) != D); - if (unlikely(blocks != (1ull<blocks_pages[i]; - for (j = 0; j < pages_per_block; j++) { - struct page* page = alloc_page(GFP_KERNEL); - if (!page) { - atomic_set(&st->err, -ENOMEM); - return; - } - list_add_tail(&page->lru, i_pages); - } - // compute - int err = eggsfs_recover(span->parity, blocks, 1ull<blocks_pages); - if (err) { - atomic_set(&st->err, err); - return; - } - } - } - - // compute crc - kernel_fpu_begin(); - u32 crc = 0; - u32 num_pages = 0; - for (i = 0; i < D; i++) { - struct page* page; - list_for_each_entry(page, &st->blocks_pages[i], lru) { - num_pages++; - char* page_buf = kmap_atomic(page); - crc = eggsfs_crc32c(crc, page_buf, PAGE_SIZE); - kunmap_atomic(page_buf); - } - } - kernel_fpu_end(); - - // if crc is wrong, bail immediately - if (crc != span->stripes_crc[st->stripe->stripe_ix]) { - eggsfs_warn("bad crc for file %016llx, stripe %u, expected %08x, got %08x, expected %u pages, got %u pages", span->span.ino, st->stripe->stripe_ix, span->stripes_crc[st->stripe->stripe_ix], crc, pages_per_block*D, num_pages); - atomic_set(&st->err, -EIO); - return; - } - - //u32 curr_page = pages_per_block*D*st->stripe->stripe_ix; <-- seems wrong and meant for span - // We start from 0 on every stripe. We could also switch to the above and track all pages with indices relative to the whole span (would need to change readers) - u32 curr_page = 0; - - // Now move pages to the span. Note that we can't add them - // as we compute the CRC because `xa_store` might allocate. - for (i = 0; i < D; i++) { - struct page* page; - struct page* tmp; - list_for_each_entry_safe(page, tmp, &st->blocks_pages[i], lru) { - // We don't want the page to be managed by the page cache - list_del(&page->lru); - struct page* old_page = xa_store(&st->stripe->pages, curr_page, page, GFP_KERNEL); - if (IS_ERR(old_page)) { - atomic_set(&st->err, PTR_ERR(old_page)); - eggsfs_info("xa_store failed: %ld, error stored", PTR_ERR(old_page)); - // We need to remove all the pages stored so far, and - // this errored one. - put_page(page); - u32 page_ix; - for (page_ix = 0; page_ix < curr_page; page_ix++) { - page = xa_erase(&st->stripe->pages, page_ix); - BUG_ON(page == NULL); - put_page(page); - } - return; - } - BUG_ON(old_page != NULL); // the latch protects against this - curr_page++; - } - } - - // We know that we added all of them given the BUG_ON(old_page != NULL) - // above. - atomic64_add(pages_per_block*D, &eggsfs_stat_cached_stripe_pages); - if (crc != span->stripes_crc[st->stripe->stripe_ix]) { - eggsfs_warn("bad crc for file %016llx, stripe %u, expected %08x, got %08x", span->span.ino, st->stripe->stripe_ix, span->stripes_crc[st->stripe->stripe_ix], crc); - atomic_set(&st->err, -EIO); - } -} - -static void block_done(void* data, u64 block_id, struct list_head* pages, int err); - -// Starts loading up D blocks as necessary. -static int fetch_blocks(struct fetch_stripe_state* st) { - int i; - - struct eggsfs_block_span* span = st->stripe->span; - int D = eggsfs_data_blocks(span->parity); - int P = eggsfs_parity_blocks(span->parity); - int B = eggsfs_blocks(span->parity); - -#define LOG_STR "file=%016llx span=%llu stripe=%u D=%d P=%d downloading=%04x(%llu) succeeded=%04x(%llu) failed=%04x(%llu) last_block_err=%d" -#define LOG_ARGS span->span.ino, span->span.start, st->stripe->stripe_ix, D, P, downloading, __builtin_popcountll(downloading), succeeded, __builtin_popcountll(succeeded), failed, __builtin_popcountll(failed), atomic_read(&st->last_block_err) - - for (;;) { - s64 blocks = atomic64_read(&st->blocks); - u16 downloading = DOWNLOADING_GET(blocks); - u16 succeeded = SUCCEEDED_GET(blocks); - u16 failed = FAILED_GET(blocks); - if (__builtin_popcountll(downloading)+__builtin_popcountll(succeeded) >= D) { // we managed to get out of the woods - eggsfs_debug("we have enough blocks, terminating " LOG_STR, LOG_ARGS); - return 0; - } - if (__builtin_popcountll(failed) > P) { // nowhere to go from here but tears - eggsfs_info("we're out of blocks, giving up " LOG_STR, LOG_ARGS); - int err = atomic_read(&st->last_block_err); - BUG_ON(err == 0); - return err; - } - - // Find the next block to download. If we're mirrored, download - // blocks at random, so that we can use mirrored files to avoid - // hotspot for super busy files. - if (D == 1) { - int start = prandom_u32()%B; - int j; - for (j = start; j < start + B; j++) { - i = j%B; - if (!((1ull<blocks, blocks, DOWNLOADING_SET(blocks, i)) != blocks) { - eggsfs_debug("skipping %d " LOG_STR, i, LOG_ARGS); - // somebody else got here first (can happen if some blocks complete before this loop finishes) first, keep going - continue; - } - - // ok, we're responsible for this now, start - struct eggsfs_block* block = &span->blocks[i]; - - // lands at correct cell in a block - // +------ stripe is cells at certain index across all data blocks. - //+----+-v--+----+----+----+ - //|cell|cell|cell|....|cell| <--- Block - //+----+----+----+----+----+ - u32 block_offset = st->stripe->stripe_ix * span->cell_size; - - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_START, i, 0); - hold_fetch_stripe(st); - eggsfs_debug("block start st=%p block_service=%016llx block_id=%016llx", st, block->id, block->id); - struct eggsfs_block_service bs; - eggsfs_get_block_service(block->bs, &bs); - // Fetches a single cell from the block - - struct list_head pages; - INIT_LIST_HEAD(&pages); - int block_err = eggsfs_fetch_block_pages_with_crc( - &block_done, - (void*)st, - &bs, &pages, span->span.ino, block->id, block->crc, block_offset, span->cell_size - ); - if (block_err) { - put_pages_list(&pages); - atomic_set(&st->last_block_err, block_err); - put_fetch_stripe(st); - eggsfs_debug("loading block failed %d " LOG_STR, i, LOG_ARGS); - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, block_err); - // mark it as failed and not downloading - blocks = atomic64_read(&st->blocks); - while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {} - } else { - eggsfs_debug("loading block %d " LOG_STR, i, LOG_ARGS); - } - BUG_ON(!list_empty(&pages)); - } - -#undef LOG_STR -#undef LOG_ARGS -} - -static void block_done(void* data, u64 block_id, struct list_head* pages, int err) { - int i; - - struct fetch_stripe_state* st = (struct fetch_stripe_state*)data; - struct eggsfs_block_span* span = st->stripe->span; - - int D = eggsfs_data_blocks(span->parity); - int P = eggsfs_parity_blocks(span->parity); - int B = eggsfs_blocks(span->parity); - u32 pages_per_block = span->cell_size/PAGE_SIZE; - - for (i = 0; i < B; i++) { - if (st->stripe->span->blocks[i].id == block_id) { break; } - } - // eggsfs_info("block ix %d, B=%d", i, B); - BUG_ON(i == B); - - eggsfs_debug( - "st=%p block=%d block_id=%016llx file=%016llx span=%llu stripe=%u D=%d P=%d err=%d", - st, i, block_id, span->span.ino, span->span.start, st->stripe->stripe_ix, D, P, err - ); - - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, err); - - bool finished = false; - if (err) { - s64 blocks = atomic64_read(&st->blocks); - eggsfs_info("fetching block %016llx (index:%d, D=%d, P=%d, blocks:%lld) in file %016llx failed: %d", block_id, i, D, P, blocks, span->span.ino, err); - atomic_set(&st->last_block_err, err); - // it failed, try to restore order - eggsfs_debug("block %016llx ix %d failed with %d", block_id, i, err); - // mark it as failed and not downloading - while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {} - err = fetch_blocks(st); - // If we failed to restore order, mark the whole thing as failed, - // and if we were the first ones to notice the error, finish - // the overall thing. Other failing requests will just drop the - // reference. - if (err) { - int old_err = 0; - finished = atomic_try_cmpxchg(&st->err, &old_err, err); - eggsfs_debug("failed finished=%d", finished); - } - } else { - // it succeeded, grab all the pages - BUG_ON(!list_empty(&st->blocks_pages[i])); - struct page* page; - struct page* tmp; - u32 fetched_pages = 0; - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - list_add_tail(&page->lru, &st->blocks_pages[i]); - fetched_pages++; - } - BUG_ON(fetched_pages != pages_per_block); - // mark as fetched, see if we're the last ones - s64 blocks = atomic64_read(&st->blocks); - while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, SUCCEEDED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {} - finished = __builtin_popcountll(SUCCEEDED_GET(SUCCEEDED_SET(blocks, i))) == D; // blocks = last old one, we need to reapply - } - - if (finished) { - eggsfs_debug("finished"); - // we're the last one to finish, we need to store the pages in the span, - // if we have no errors - err = atomic_read(&st->err); - if (err == 0) { store_pages(st); } - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_END, -1, err); - // Wake up waiters. The trace above needs to happen before because it - // references the stripe. If the caller is allowed to continue, - // the stripe may get reclaimed and it results in null pointer deref. - up(&st->sema); - } - - // drop our reference - put_fetch_stripe(st); -} - -static struct fetch_stripe_state* start_fetch_stripe( - struct eggsfs_stripe* stripe, s64 stripe_seqno, bool prefetching -) { - int err; - - trace_eggsfs_fetch_stripe( - stripe->span->span.ino, - stripe->span->span.start, - stripe->stripe_ix, - stripe->span->parity, - prefetching, - EGGSFS_FETCH_STRIPE_START, - -1, - 0 - ); - - struct fetch_stripe_state* st = NULL; - - int D = eggsfs_data_blocks(stripe->span->parity); - int P = eggsfs_parity_blocks(stripe->span->parity); - if (D > EGGSFS_MAX_DATA || P > EGGSFS_MAX_PARITY) { - eggsfs_error("got out of bounds parity of RS(%d,%d) for span %llu in file %016llx", D, P, stripe->span->span.start, stripe->span->span.ino); - err = -EIO; - goto out_err; - } - - st = new_fetch_stripe(stripe, stripe_seqno, prefetching); - - // start fetching blocks - eggsfs_debug("fetching blocks"); - err = fetch_blocks(st); - if (err) { goto out_err; } - - eggsfs_debug("fetch stripe started"); - return st; - -out_err: - fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_END, -1, err); - if (st) { put_fetch_stripe(st); } - return ERR_PTR(err); -} - -struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix) { - u64 seqno; - struct page* page; - bool low_on_memory = false; - int err = 0; - int D = eggsfs_data_blocks(stripe->span->parity); - - eggsfs_debug("reading page from stripe %u at index %u of %u", stripe->stripe_ix, page_ix, D); - trace_eggsfs_get_stripe_page_enter(stripe->span->span.ino, stripe->span->span.start, stripe->stripe_ix*D*page_ix*PAGE_SIZE); - -#define GET_PAGE_EXIT(p) do { \ - int __err = IS_ERR(p) ? PTR_ERR(p) : 0; \ - trace_eggsfs_get_stripe_page_exit(stripe->span->span.ino, stripe->span->span.start, stripe->stripe_ix*D*page_ix*PAGE_SIZE, __err); \ - return p; \ - } while(0) - -retry: - page = xa_load(&stripe->pages, page_ix); - if (page != NULL) { - eggsfs_counter_inc(eggsfs_stat_stripe_page_cache_hit); - GET_PAGE_EXIT(page); - } - - if (!eggsfs_latch_try_acquire(&stripe->latch, seqno)) { - eggsfs_latch_wait(&stripe->latch, seqno); - goto retry; - } - - // We're the first ones here - page = xa_load(&stripe->pages, page_ix); - if (page != NULL) { - // somebody got to it first - eggsfs_latch_release(&stripe->latch, seqno); - GET_PAGE_EXIT(page); - } - - eggsfs_debug("about to start fetch stripe"); - struct fetch_stripe_state* st = NULL; - st = start_fetch_stripe(stripe, seqno, false); - if (IS_ERR(st)) { - err = PTR_ERR(st); - st = NULL; - goto out_err; - } - eggsfs_debug("started fetch stripe, waiting"); - - // Wait for all the block requests - down(&st->sema); - - eggsfs_debug("done"); - - err = atomic_read(&st->err); - if (err) { goto out_err; } - - // We're finally done - page = xa_load(&stripe->pages, page_ix); - eggsfs_debug("loaded page %p for stripe %u at %u", page, stripe->stripe_ix, page_ix); - BUG_ON(page == NULL); - // Only track the miss when we go and fetch the stripe. - eggsfs_counter_inc(eggsfs_stat_stripe_page_cache_miss); - -out: - eggsfs_latch_release(&stripe->latch, seqno); - if (st != NULL) { put_fetch_stripe(st); } - - // Reclaim if needed - low_on_memory = reclaim_after_fetch_stripe(); - - GET_PAGE_EXIT(err == 0 ? page : ERR_PTR(err)); - -out_err: - eggsfs_info("getting stripe page failed, err=%d", err); - goto out; - -#undef GET_PAGE_EXIT -} - -// -// New page fetch -// - struct fetch_span_pages_state { struct eggsfs_block_span* block_span; // Used to wait on every block being finished. Could be done faster with a wait_queue, but don't want to @@ -1664,7 +805,6 @@ static bool insert_span(struct rb_root* spans, struct eggsfs_span* span) { } static void put_span(struct eggsfs_span* span) { - if (atomic_dec_return(&span->refcount) > 0) { return; } if (span->storage_class == EGGSFS_INLINE_STORAGE) { kmem_cache_free(eggsfs_inline_span_cachep, EGGSFS_INLINE_SPAN(span)); } else { @@ -1686,7 +826,6 @@ void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 sto struct eggsfs_block_span* span = kmem_cache_alloc(eggsfs_block_span_cachep, GFP_KERNEL); if (!span) { ctx->err = -ENOMEM; return; } - atomic_set(&span->span.refcount, 1); if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) { eggsfs_warn("D=%d > %d || P=%d > %d", eggsfs_data_blocks(parity), EGGSFS_MAX_DATA, eggsfs_data_blocks(parity), EGGSFS_MAX_PARITY); @@ -1705,10 +844,6 @@ void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 sto span->num_stripes = stripes; span->parity = parity; int i; - for (i = 0; i < stripes; i++) { - span->stripes[i] = NULL; - } - eggsfs_debug("adding normal span"); insert_span(&ctx->spans, &span->span); } @@ -1751,7 +886,6 @@ void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, struct eggsfs_inline_span* span = kmem_cache_alloc(eggsfs_inline_span_cachep, GFP_KERNEL); if (!span) { ctx->err = -ENOMEM; return; } - atomic_set(&span->span.refcount, 1); span->span.ino = ctx->enode->inode.i_ino; @@ -1845,30 +979,9 @@ retry: #undef GET_SPAN_EXIT } -static void free_span_stripes(struct eggsfs_block_span* block_span) { - - u8 i; - struct eggsfs_stripe_lru *lru; - struct eggsfs_stripe *stripe; - - for (i = 0; i < block_span->num_stripes; i++) { - lru = get_stripe_lru(&block_span->span); - spin_lock(&lru->lock); - stripe = block_span->stripes[i]; - if (stripe != NULL) { - stripe_validate_and_remove_from_lru(stripe, lru); - free_stripe(stripe, NULL); - } - spin_unlock(&lru->lock); - } -} - void eggsfs_unlink_span(struct eggsfs_inode* enode, struct eggsfs_span* span) { down_write(&enode->file.spans_lock); rb_erase(&span->node, &enode->file.spans); - if (span->storage_class != EGGSFS_INLINE_STORAGE) { - free_span_stripes(EGGSFS_BLOCK_SPAN(span)); - } put_span(span); up_write(&enode->file.spans_lock); } @@ -1893,15 +1006,7 @@ void eggsfs_unlink_spans(struct eggsfs_inode* enode) { // int eggsfs_span_init(void) { - int i; - for (i = 0; i < EGGSFS_STRIPE_LRUS; i++) { - spin_lock_init(&stripe_lrus[i].lock); - INIT_LIST_HEAD(&stripe_lrus[i].lru); - } - - int err = register_shrinker(&eggsfs_stripe_shrinker); - if (err) { return err; } - + int err; eggsfs_block_span_cachep = kmem_cache_create( "eggsfs_block_span_cache", sizeof(struct eggsfs_block_span), @@ -1911,7 +1016,7 @@ int eggsfs_span_init(void) { ); if (!eggsfs_block_span_cachep) { err = -ENOMEM; - goto out_shrinker; + return err; } eggsfs_inline_span_cachep = kmem_cache_create_usercopy( @@ -1927,31 +1032,6 @@ int eggsfs_span_init(void) { err = -ENOMEM; goto out_block; } - - eggsfs_stripe_cachep = kmem_cache_create( - "eggsfs_stripe_cache", - sizeof(struct eggsfs_stripe), - 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL - ); - if (!eggsfs_stripe_cachep) { - err = -ENOMEM; - goto out_inline; - } - - eggsfs_fetch_stripe_cachep = kmem_cache_create( - "eggsfs_fetch_stripe_cache", - sizeof(struct fetch_stripe_state), - 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - &init_fetch_stripe - ); - if (!eggsfs_fetch_stripe_cachep) { - err = -ENOMEM; - goto out_stripe; - } - eggsfs_fetch_span_pages_cachep = kmem_cache_create( "eggsfs_fetch_span_pages_cache", sizeof(struct fetch_span_pages_state), @@ -1961,35 +1041,20 @@ int eggsfs_span_init(void) { ); if (!eggsfs_fetch_span_pages_cachep) { err = -ENOMEM; - goto out_fetch_stripe; + goto out_inline; } return 0; -out_fetch_stripe: - kmem_cache_destroy(eggsfs_fetch_stripe_cachep); -out_stripe: - kmem_cache_destroy(eggsfs_stripe_cachep); out_inline: kmem_cache_destroy(eggsfs_inline_span_cachep); out_block: kmem_cache_destroy(eggsfs_block_span_cachep); -out_shrinker: - unregister_shrinker(&eggsfs_stripe_shrinker); return err; } void eggsfs_span_exit(void) { eggsfs_debug("span exit"); - unregister_shrinker(&eggsfs_stripe_shrinker); - // All the stripes should be freed up when removing spans from inodes, - // but it seems like the inode cache is not always purged before calling - // the exit function. - - u64 pages = eggsfs_drop_all_stripes(); - if (pages > 0) eggsfs_info("expected to drop 0 pages, but dropped %llu", pages); - kmem_cache_destroy(eggsfs_block_span_cachep); - kmem_cache_destroy(eggsfs_inline_span_cachep); - kmem_cache_destroy(eggsfs_stripe_cachep); - kmem_cache_destroy(eggsfs_fetch_stripe_cachep); kmem_cache_destroy(eggsfs_fetch_span_pages_cachep); + kmem_cache_destroy(eggsfs_inline_span_cachep); + kmem_cache_destroy(eggsfs_block_span_cachep); } diff --git a/kmod/span.h b/kmod/span.h index 88beb080..48ea9a24 100644 --- a/kmod/span.h +++ b/kmod/span.h @@ -7,22 +7,6 @@ #include "counter.h" #include "block.h" #include "rs.h" - -EGGSFS_DECLARE_COUNTER(eggsfs_stat_cached_stripes); -EGGSFS_DECLARE_COUNTER(eggsfs_stat_dropped_stripes_refetched); -EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_cache_hit); -EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_cache_miss); -EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_page_cache_hit); -EGGSFS_DECLARE_COUNTER(eggsfs_stat_stripe_page_cache_miss); -extern atomic64_t eggsfs_stat_cached_stripe_pages; - -extern unsigned long eggsfs_stripe_cache_max_size_async; -extern unsigned long eggsfs_stripe_cache_min_avail_mem_async; -extern unsigned long eggsfs_stripe_cache_max_size_sync; -extern unsigned long eggsfs_stripe_cache_min_avail_mem_sync; -extern unsigned long eggsfs_stripe_cache_max_size_drop; -extern unsigned long eggsfs_stripe_cache_min_avail_mem_drop; - struct eggsfs_span { // All the spans operations are independent from `struct eggsfs_inode`. // So we need the inode here. @@ -32,11 +16,6 @@ struct eggsfs_span { // To be in the inode tree. Note that we might _not_ be in the // tree once the inode releases us. struct rb_node node; - // The inode and the stripes hold references to us. So the inode - // needs to unlink us and all the stripes need to be gone after - // this can be freed. Inline spans obviously only are referenced - // by inodes. - atomic_t refcount; // Used to determine which type of span this is enclosed in. u8 storage_class; }; @@ -58,46 +37,15 @@ struct eggsfs_block { u32 crc; }; -struct eggsfs_stripe { - // Holds reference to it (we can always dereference it from - // the stripe). - struct eggsfs_block_span* span; - // Full list of contents if they have been fetched. - struct xarray pages; - // To synchronize fillin in the pages for the first time. - struct eggsfs_latch latch; - struct list_head lru; - // When refcount = 0, this might still be alive, but in an LRU. - // When refcount < 0, we're currently reclaiming this stripe. - // Going from 0 to 1, and from 0 to -1 involves taking the respective - // LRU lock. This field is not atomic_t because _all changes to this - // value go through the LRU lock_, since they must be paired with - // manipulating the LRU list. - atomic_t refcount; - u8 stripe_ix; // Which stripe it is - bool touched; // Whether the stripe was touched before putting it back -}; - struct eggsfs_block_span { struct eggsfs_span span; struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS]; - // If NULL, the stripe is currently not initialized. - // The value has to be accessed with the lock of the containg stripe_lru. - // It is set without LRU lock in a single place when creating a new stripe. - // There should be no new code paths that manipulate it without locking - // the LRU first. - struct eggsfs_stripe* stripes[EGGSFS_MAX_STRIPES]; - bool stripe_was_cached[EGGSFS_MAX_STRIPES]; u32 cell_size; u32 stripes_crc[EGGSFS_MAX_STRIPES]; u8 num_stripes; u8 parity; }; -static inline u32 eggsfs_stripe_size(struct eggsfs_block_span* span) { - return span->cell_size * eggsfs_data_blocks(span->parity); -} - #define EGGSFS_BLOCK_SPAN(_span) ({ \ BUG_ON((_span)->storage_class == EGGSFS_EMPTY_STORAGE || (_span)->storage_class == EGGSFS_INLINE_STORAGE); \ container_of(_span, struct eggsfs_block_span, span); \ @@ -113,25 +61,11 @@ struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset); // Remove the span from the inode tree, decrementing the refcount. void eggsfs_unlink_span(struct eggsfs_inode* enode, struct eggsfs_span* span); -// Drop all the spans in a specific file. The span might linger around -// if the stripes are being read (since they have references to it too). -// +// Drop all the spans in a specific file. // This function must be called before inode eviction, otherwise we'll -// leak spans and stripes. +// leak spans. void eggsfs_unlink_spans(struct eggsfs_inode* enode); -// Gets a stripe within a span. offset is the offset inside the span. -// Once this returns, all the pages will be filled in already. The stripe -// needs to be put back with `eggsfs_put_stripe` when you're done with it. -struct eggsfs_stripe* eggsfs_get_stripe(struct eggsfs_block_span* span, u32 offset); - -// Makes the stripe available for reclamation. -void eggsfs_put_stripe(struct eggsfs_stripe* stripe, bool was_read); - -// The page_ix is the page number inside the stripe. The first call demanding -// a page will pull in all the pages for the stripe. -struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix); - // Fetches pages from block services to fill the supplied list of pages. // @pages - original list of pages to be filled. The list is expected to // contain pages with page->index values increasing by 1. @@ -140,7 +74,7 @@ struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix); // and removed from the list. // @extra_pages - empty list to add fetched pages that fall outside of the // requested range. -// Pages are taken out of the list, added to required blocks for fetching, +// Pages are taken out of the list, added to required blocks for fetching, // filled with data fetched from block services and then assembled back // maintaining the original order to match file offsets. Each supplied page // has page->index prefilled as file_offset/PAGE_SIZE. Fetches are done in @@ -158,16 +92,13 @@ struct page* eggsfs_get_stripe_page(struct eggsfs_stripe* stripe, u32 page_ix); // even returning error aggressively is fine. int eggsfs_span_get_pages(struct eggsfs_block_span* block_span, struct address_space* mapping, struct list_head *pages, unsigned nr_pages, struct list_head *extra_pages); -// Drops all cached stripes not being currently used. Returns number of -// freed pages. -u64 eggsfs_drop_all_stripes(void); - // Callbacks for metadata void eggsfs_file_spans_cb_span( void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs ); + void eggsfs_file_spans_cb_block( void* data, int block_ix, // block service stuff diff --git a/kmod/sysctl.c b/kmod/sysctl.c index a176bd98..e2f3c941 100644 --- a/kmod/sysctl.c +++ b/kmod/sysctl.c @@ -30,11 +30,6 @@ int eggsfs_debug_output = 0; } \ return 0; -static int drop_cached_stripes; -static int eggsfs_drop_stripes_sysctl(struct ctl_table* table, int write, void __sysctl_buffer* buffer, size_t* len, loff_t* ppos) { - eggsfs_do_sysctl(eggsfs_drop_all_stripes); -} - static int drop_fetch_block_sockets; static int eggsfs_drop_fetch_block_sockets_sysctl(struct ctl_table* table, int write, void __sysctl_buffer* buffer, size_t* len, loff_t* ppos) { eggsfs_do_sysctl(eggsfs_drop_fetch_block_sockets); @@ -107,14 +102,6 @@ static struct ctl_table eggsfs_cb_sysctls[] = { .extra2 = &eggsfs_rs_cpu_level_max, }, - { - .procname = "drop_cached_stripes", - .data = &drop_cached_stripes, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = eggsfs_drop_stripes_sysctl, - }, - { .procname = "drop_fetch_block_sockets", .data = &drop_fetch_block_sockets, @@ -147,16 +134,8 @@ static struct ctl_table eggsfs_cb_sysctls[] = { EGGSFS_CTL_UINT(max_write_span_attempts), EGGSFS_CTL_UINT(atime_update_interval_sec), EGGSFS_CTL_UINT(file_io_timeout_sec), - EGGSFS_CTL_UINT(page_level_reads), EGGSFS_CTL_UINT(disable_ftruncate), - EGGSFS_CTL_ULONG(stripe_cache_max_size_async), - EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_async), - EGGSFS_CTL_ULONG(stripe_cache_max_size_sync), - EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_sync), - EGGSFS_CTL_ULONG(stripe_cache_max_size_drop), - EGGSFS_CTL_ULONG(stripe_cache_min_avail_mem_drop), - { .procname = "mtu", .data = &eggsfs_mtu, diff --git a/kmod/sysfs.c b/kmod/sysfs.c index 79863a21..80d1f295 100644 --- a/kmod/sysfs.c +++ b/kmod/sysfs.c @@ -67,23 +67,9 @@ static umode_t eggsfs_stat_visible(struct kobject *kobj, struct attribute *attr, #define EGGSFS_SYSFS_ATOMIC64_PTR(_name) (&eggsfs_sysfs_atomic64_##_name.kobj_attr.attr) EGGSFS_SYSFS_COUNTER(dir_revalidations); -EGGSFS_SYSFS_COUNTER(stripe_cache_hit); -EGGSFS_SYSFS_COUNTER(stripe_cache_miss); -EGGSFS_SYSFS_COUNTER(stripe_page_cache_hit); -EGGSFS_SYSFS_COUNTER(stripe_page_cache_miss); -EGGSFS_SYSFS_COUNTER(cached_stripes); -EGGSFS_SYSFS_COUNTER(dropped_stripes_refetched); -EGGSFS_SYSFS_ATOMIC64(cached_stripe_pages); static struct attribute* eggsfs_stat_attrs[] = { EGGSFS_SYSFS_COUNTER_PTR(dir_revalidations), - EGGSFS_SYSFS_COUNTER_PTR(stripe_cache_hit), - EGGSFS_SYSFS_COUNTER_PTR(stripe_cache_miss), - EGGSFS_SYSFS_COUNTER_PTR(stripe_page_cache_hit), - EGGSFS_SYSFS_COUNTER_PTR(stripe_page_cache_miss), - EGGSFS_SYSFS_COUNTER_PTR(cached_stripes), - EGGSFS_SYSFS_COUNTER_PTR(dropped_stripes_refetched), - EGGSFS_SYSFS_ATOMIC64_PTR(cached_stripe_pages), NULL }; diff --git a/kmod/trace.h b/kmod/trace.h index 5d557b0a..af952041 100644 --- a/kmod/trace.h +++ b/kmod/trace.h @@ -436,45 +436,6 @@ TRACE_EVENT(eggsfs_span_add, TP_printk("file_id=%016llx offset=%llu", __entry->file_id, __entry->offset) ); -#define EGGSFS_DROP_STRIPES_START 0 -#define EGGSFS_DROP_STRIPES_END 1 - -TRACE_EVENT(eggsfs_drop_stripes, - TP_PROTO(const char* type, long mem_available, u64 cached_pages, u64 cached_stripes, u8 event, u64 examined_stripes, u64 dropped_pages, int err), - TP_ARGS( type, mem_available, cached_pages, cached_stripes, event, examined_stripes, dropped_pages, err), - - TP_STRUCT__entry( - __field(const char*, type) - __field(long, mem_available) - __field(u64, cached_pages) - __field(u64, cached_stripes) - __field(u64, dropped_pages) - __field(u64, examined_stripes) - __field(int, err) - __field(u8, event) - ), - TP_fast_assign( - __entry->type = type; - __entry->mem_available = mem_available; - __entry->cached_pages = cached_pages; - __entry->cached_stripes = cached_stripes; - __entry->dropped_pages = dropped_pages; - __entry->examined_stripes = examined_stripes; - __entry->err = err; - __entry->event = event; - ), - TP_printk( - "type=%s mem_available=%ld cached_pages=%llu cached_stripes=%llu event=%s examined_stripes=%llu dropped_pages=%llu err=%d", - __entry->type, __entry->mem_available, __entry->cached_pages, __entry->cached_stripes, - __print_symbolic( - __entry->event, - { EGGSFS_DROP_STRIPES_START, "start" }, - { EGGSFS_DROP_STRIPES_END, "end" } - ), - __entry->examined_stripes, __entry->dropped_pages, __entry->err - ) -); - TRACE_EVENT(eggsfs_get_span_enter, TP_PROTO(u64 file_id, u64 offset), TP_ARGS( file_id, offset), @@ -506,87 +467,6 @@ TRACE_EVENT(eggsfs_get_span_exit, TP_printk("file_id=%016llx offset=%llu, err=%d", __entry->file_id, __entry->offset, __entry->err) ); -TRACE_EVENT(eggsfs_get_stripe_page_enter, - TP_PROTO(u64 file_id, u64 stripe_offset, u32 page_offset), - TP_ARGS( file_id, stripe_offset, page_offset), - - TP_STRUCT__entry( - __field(u64, file_id) - __field(u64, stripe_offset) - __field(u32, page_offset) - ), - TP_fast_assign( - __entry->file_id = file_id; - __entry->stripe_offset = stripe_offset; - __entry->page_offset = page_offset; - ), - TP_printk("file_id=%016llx stripe_offset=%llu page_offset=%u", __entry->file_id, __entry->stripe_offset, __entry->page_offset) -); - -TRACE_EVENT(eggsfs_get_stripe_page_exit, - TP_PROTO(u64 file_id, u64 stripe_offset, u32 page_offset, int err), - TP_ARGS( file_id, stripe_offset, page_offset, err), - - TP_STRUCT__entry( - __field(u64, file_id) - __field(u64, stripe_offset) - __field(u32, page_offset) - __field(int, err) - ), - TP_fast_assign( - __entry->file_id = file_id; - __entry->stripe_offset = stripe_offset; - __entry->page_offset = page_offset; - __entry->err = err; - ), - TP_printk("file_id=%016llx stripe_offset=%llu page_offset=%u err=%d", __entry->file_id, __entry->stripe_offset, __entry->page_offset, __entry->err) -); - -#define EGGSFS_FETCH_STRIPE_START 0 -#define EGGSFS_FETCH_STRIPE_BLOCK_START 1 -#define EGGSFS_FETCH_STRIPE_BLOCK_DONE 2 -#define EGGSFS_FETCH_STRIPE_END 3 -#define EGGSFS_FETCH_STRIPE_FREE 4 - -TRACE_EVENT(eggsfs_fetch_stripe, - TP_PROTO(u64 file_id, u64 span_offset, u8 stripe, u8 parity, bool prefetching, u8 event, s8 block, int err), - TP_ARGS( file_id, span_offset, stripe, parity, prefetching, event, block, err), - - TP_STRUCT__entry( - __field(u64, file_id) - __field(u64, span_offset) - __field(int, err) - __field(s8, block) // negative = non-block event - __field(u8, stripe) - __field(bool, prefetching) - __field(u8, event) - __field(u8, parity) - ), - TP_fast_assign( - __entry->file_id = file_id; - __entry->span_offset = span_offset; - __entry->stripe = stripe; - __entry->prefetching = prefetching; - __entry->event = event; - __entry->err = err; - __entry->block = block; - __entry->parity = parity; - ), - TP_printk( - "file_id=%016llx span_offset=%llu stripe=%u D=%u P=%u prefetching=%d event=%s block=%d err=%d", - __entry->file_id, __entry->span_offset, __entry->stripe, __entry->parity&0xF, __entry->parity>>4, (int)__entry->prefetching, - __print_symbolic( - __entry->event, - { EGGSFS_FETCH_STRIPE_START, "start" }, - { EGGSFS_FETCH_STRIPE_BLOCK_START, "block_start" }, - { EGGSFS_FETCH_STRIPE_BLOCK_DONE, "block_done" }, - { EGGSFS_FETCH_STRIPE_END, "end" }, - { EGGSFS_FETCH_STRIPE_FREE, "free" } - ), - __entry->block, __entry->err - ) -); - #define EGGSFS_UPSERT_BLOCKSERVICE_MATCH 0 #define EGGSFS_UPSERT_BLOCKSERVICE_NOMATCH 1 #define EGGSFS_UPSERT_BLOCKSERVICE_NEW 2