mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-05-04 08:49:29 -05:00
1254 lines
46 KiB
C
1254 lines
46 KiB
C
#include <linux/module.h>
|
|
|
|
#include "bincode.h"
|
|
#include "inode.h"
|
|
#include "log.h"
|
|
#include "span.h"
|
|
#include "metadata.h"
|
|
#include "rs.h"
|
|
#include "latch.h"
|
|
#include "crc.h"
|
|
#include "err.h"
|
|
#include "counter.h"
|
|
#include "wq.h"
|
|
#include "trace.h"
|
|
#include "intrshims.h"
|
|
#include "sysctl.h"
|
|
|
|
// Number of block spans currently cached (inline spans are not counted).
EGGSFS_DEFINE_COUNTER(eggsfs_stat_cached_spans);

// This currently also includes in-flight pages (i.e. pages that cannot be
// reclaimed), just because the code is a bit simpler this way.
atomic64_t eggsfs_stat_cached_span_pages = ATOMIC64_INIT(0);

// These numbers do not mean anything in particular. We do want to avoid
// flickering sync drops though, therefore the "drop" thresholds are below
// the async ones, and similarly for free memory.
// NOTE(review): the inline comment on eggsfs_span_cache_max_size_drop says
// "25GiB" but the value is (45ull << 30) = 45GiB -- confirm which one is
// intended before relying on either.
unsigned long eggsfs_span_cache_max_size_async = (50ull << 30); // 50GiB
unsigned long eggsfs_span_cache_min_avail_mem_async = (2ull << 30); // 2GiB
unsigned long eggsfs_span_cache_max_size_drop = (45ull << 30); // 25GiB
unsigned long eggsfs_span_cache_min_avail_mem_drop = (2ull << 30) + (500ull << 20); // 2GiB + 500MiB
unsigned long eggsfs_span_cache_max_size_sync = (100ull << 30); // 100GiB
unsigned long eggsfs_span_cache_min_avail_mem_sync = (1ull << 30); // 1GiB
|
|
|
|
// Slab caches for the two span flavors and the per-stripe fetch state.
static struct kmem_cache* eggsfs_block_span_cachep;
static struct kmem_cache* eggsfs_inline_span_cachep;
static struct kmem_cache* eggsfs_fetch_stripe_cachep;
// One shard of the global span LRU: a spinlock and the list it protects.
// Cacheline-aligned so that shards do not false-share.
struct eggsfs_span_lru {
    spinlock_t lock;
    struct list_head lru;
} ____cacheline_aligned;

// These are global locks, but the sharding should alleviate contention.
#define EGGSFS_SPAN_BITS 8
#define EGGSFS_SPAN_LRUS (1<<EGGSFS_SPAN_BITS) // 256
static struct eggsfs_span_lru span_lrus[EGGSFS_SPAN_LRUS];
|
|
|
|
static struct eggsfs_span_lru* get_span_lru(struct eggsfs_span* span) {
|
|
int h;
|
|
h = hash_64(span->ino, EGGSFS_SPAN_BITS);
|
|
h ^= hash_64(span->start, EGGSFS_SPAN_BITS);
|
|
return &span_lrus[h%EGGSFS_SPAN_LRUS];
|
|
}
|
|
|
|
// Finds the span containing `offset` in the file's red-black tree, keyed by
// [start, end) intervals. Returns NULL if no span covers the offset.
static struct eggsfs_span* lookup_span(struct rb_root* spans, u64 offset) {
    struct rb_node* node;

    for (node = spans->rb_node; node != NULL;) {
        struct eggsfs_span* candidate = container_of(node, struct eggsfs_span, node);
        if (offset < candidate->start) {
            node = node->rb_left;
        } else if (offset >= candidate->end) {
            node = node->rb_right;
        } else {
            // offset falls inside [start, end)
            eggsfs_debug("off=%llu span=%p", offset, candidate);
            return candidate;
        }
    }

    eggsfs_debug("off=%llu no_span", offset);
    return NULL;
}
|
|
|
|
static bool insert_span(struct rb_root* spans, struct eggsfs_span* span) {
|
|
struct rb_node** new = &(spans->rb_node);
|
|
struct rb_node* parent = NULL;
|
|
while (*new) {
|
|
struct eggsfs_span* this = container_of(*new, struct eggsfs_span, node);
|
|
parent = *new;
|
|
if (span->end <= this->start) { new = &((*new)->rb_left); }
|
|
else if (span->start >= this->end) { new = &((*new)->rb_right); }
|
|
else {
|
|
// TODO: be loud if there are overlaping spans
|
|
eggsfs_debug("span=%p (%llu-%llu) already present", span, span->start, span->end);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
rb_link_node(&span->node, parent, new);
|
|
rb_insert_color(&span->node, spans);
|
|
eggsfs_debug("span=%p (%llu-%llu) inserted", span, span->start, span->end);
|
|
return true;
|
|
}
|
|
|
|
static void init_block_span(void* p) {
|
|
struct eggsfs_block_span* span = (struct eggsfs_block_span*)p;
|
|
|
|
xa_init(&span->pages);
|
|
int i;
|
|
for (i = 0; i < EGGSFS_MAX_STRIPES; i++) {
|
|
eggsfs_latch_init(&span->stripe_latches[i]);
|
|
}
|
|
}
|
|
|
|
// Frees a span (either flavor). For block spans this also releases every
// cached page and updates the global counters. If `dropped_pages` is
// non-NULL it is incremented once per released page. The caller must own
// the span exclusively (e.g. by having set refcount to -1 as reclaimer).
static void free_span(struct eggsfs_span* span, u64* dropped_pages) {
    if (span->storage_class == EGGSFS_INLINE_STORAGE) {
        kmem_cache_free(eggsfs_inline_span_cachep, EGGSFS_INLINE_SPAN(span));
    } else {
        struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
        eggsfs_counter_dec(eggsfs_stat_cached_spans);
        struct page* page;
        unsigned long page_ix;
        // Drop every cached page of the span.
        xa_for_each(&block_span->pages, page_ix, page) {
            if (dropped_pages) { (*dropped_pages)++; }
            put_page(page);
            xa_erase(&block_span->pages, page_ix);
            atomic64_dec(&eggsfs_stat_cached_span_pages);
        }
        int i;
        for (i = 0; i < EGGSFS_MAX_STRIPES; i++) {
            BUG_ON(atomic64_read(&block_span->stripe_latches[i].counter) & 1); // nobody still hanging onto this
        }
        kmem_cache_free(eggsfs_block_span_cachep, block_span);
    }
}
|
|
|
|
// Context threaded through the eggsfs_file_spans_cb_* callbacks while
// fetching a file's span list from the shard.
struct get_span_ctx {
    struct eggsfs_inode* enode; // file whose spans are being fetched
    struct list_head spans;     // freshly allocated spans, not yet in tree/LRU
    int err;                    // first error seen; later callbacks become no-ops
};
|
|
|
|
// Callback invoked once per block span while fetching a file's span list.
// Validates the parity parameters, allocates a block span, fills in its
// geometry and stripe CRCs, and queues it on ctx->spans (the blocks
// themselves are filled in later by eggsfs_file_spans_cb_block). On failure
// sets ctx->err, which makes all subsequent callbacks no-ops.
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs) {
    eggsfs_debug("offset=%llu size=%u crc=%08x storage_class=%d parity=%d stripes=%d cell_size=%u", offset, size, crc, storage_class, parity, stripes, cell_size);

    struct get_span_ctx* ctx = (struct get_span_ctx*)data;
    if (ctx->err) { return; }

    // Validate before allocating: the previous version allocated first and
    // then leaked the span on this error path. Also report the parity block
    // count for P (it used to print the data block count twice).
    if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) {
        eggsfs_warn("D=%d > %d || P=%d > %d", eggsfs_data_blocks(parity), EGGSFS_MAX_DATA, eggsfs_parity_blocks(parity), EGGSFS_MAX_PARITY);
        ctx->err = -EIO;
        return;
    }

    struct eggsfs_block_span* span = kmem_cache_alloc(eggsfs_block_span_cachep, GFP_KERNEL);
    if (!span) { ctx->err = -ENOMEM; return; }

    span->span.ino = ctx->enode->inode.i_ino;
    span->span.start = offset;
    span->span.end = offset + size;
    span->span.storage_class = storage_class;
    span->enode = ctx->enode; // no ihold: `span->enode` is cleared before eviction

    span->cell_size = cell_size;
    memcpy(span->stripes_crc, stripes_crcs, sizeof(uint32_t)*stripes);
    span->stripes = stripes;
    span->parity = parity;
    // important, this will have readers skip over this until it ends up in the LRU
    span->refcount = -1;

    eggsfs_debug("adding normal span");
    list_add_tail(&span->span.lru, &ctx->spans);
}
|
|
|
|
// Callback invoked once per block of the most recently added block span
// (eggsfs_file_spans_cb_span runs first for each span): records the block
// service endpoints and the block's identity/CRC so it can be fetched later.
void eggsfs_file_spans_cb_block(
    void* data, int block_ix,
    u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2, u8 flags,
    u64 block_id, u32 crc
) {
    struct get_span_ctx* ctx = (struct get_span_ctx*)data;
    if (ctx->err) { return; } // an earlier callback failed; do nothing

    // The span this block belongs to is the last one appended to ctx->spans.
    struct eggsfs_span* span = list_last_entry(&ctx->spans, struct eggsfs_span, lru);
    struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);

    struct eggsfs_block* block = &block_span->blocks[block_ix];
    block->bs.id = bs_id;
    block->bs.ip1 = ip1;
    block->bs.port1 = port1;
    block->bs.ip2 = ip2;
    block->bs.port2 = port2;
    block->bs.flags = flags;
    block->id = block_id;
    block->crc = crc;
}
|
|
|
|
// Callback for inline spans: the file data is embedded directly in the span
// record, so we copy the body into a freshly allocated inline span and queue
// it on ctx->spans.
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) {
    eggsfs_debug("offset=%llu size=%u len=%u body=%*pE", offset, size, len, len, body);

    struct get_span_ctx* ctx = (struct get_span_ctx*)data;
    if (ctx->err) { return; }

    struct eggsfs_inline_span* inline_span = kmem_cache_alloc(eggsfs_inline_span_cachep, GFP_KERNEL);
    if (inline_span == NULL) {
        ctx->err = -ENOMEM;
        return;
    }

    inline_span->span.ino = ctx->enode->inode.i_ino;
    inline_span->span.start = offset;
    inline_span->span.end = offset + size;
    inline_span->span.storage_class = EGGSFS_INLINE_STORAGE;
    inline_span->len = len;
    memcpy(inline_span->body, body, len);

    eggsfs_debug("adding inline span");

    list_add_tail(&inline_span->span.lru, &ctx->spans);
}
|
|
|
|
// Takes a reader reference on a span. Inline spans are not refcounted, so
// acquiring them always succeeds. For block spans, the first reader takes
// the span out of its LRU shard so it cannot be reclaimed while in use.
// returns whether we could acquire it
static bool span_acquire(struct eggsfs_span* span) {
    if (span->storage_class == EGGSFS_INLINE_STORAGE) { return true; }

    struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
    struct eggsfs_span_lru* lru = get_span_lru(span);

    spin_lock_bh(&lru->lock);
    if (unlikely(block_span->refcount < 0)) { // the reclaimer got here before us.
        spin_unlock_bh(&lru->lock);
        return false;
    }
    block_span->refcount++;
    if (block_span->refcount == 1) {
        // we're the first ones here, take it out of the LRU
        list_del(&span->lru);
    }
    spin_unlock_bh(&lru->lock);

    return true;
}
|
|
|
|
// Looks up and acquires a span. Returns -EBUSY if
|
|
// the span is there, but it could not be acquired (contention
|
|
// with LRU reclaimer). Returns NULL if the span is not there.
|
|
static struct eggsfs_span* lookup_and_acquire_span(struct eggsfs_inode* enode, u64 offset) {
|
|
struct eggsfs_inode_file* file = &enode->file;
|
|
int err = down_read_killable(&enode->file.spans_lock);
|
|
if (err) { return ERR_PTR(err); }
|
|
|
|
struct eggsfs_span* span = lookup_span(&file->spans, offset);
|
|
if (likely(span)) {
|
|
if (unlikely(!span_acquire(span))) { err = -EBUSY; }
|
|
}
|
|
up_read(&enode->file.spans_lock);
|
|
|
|
return span;
|
|
}
|
|
|
|
// Releases a span acquired via span_acquire()/eggsfs_get_span(). When the
// last reader goes away the span is re-inserted into its LRU shard so the
// reclaimer can find it again. Inline spans are not refcounted.
// NOTE(review): `was_read` is currently unused -- presumably it was meant to
// feed `block_span->touched` below; confirm against the callers.
void eggsfs_put_span(struct eggsfs_span* span, bool was_read) {
    if (span->storage_class == EGGSFS_INLINE_STORAGE) { return; }

    struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
    struct eggsfs_span_lru* lru = get_span_lru(span);

    spin_lock_bh(&lru->lock);
    BUG_ON(block_span->refcount < 1); // must have been acquired
    block_span->refcount--;
    if (block_span->refcount == 0) { // we need to put it back into the LRU, or free it
        if (block_span->touched) {
            // touched: tail of the LRU, reclaimed last
            list_add_tail(&span->lru, &lru->lru);
        } else {
            // untouched: head of the LRU, reclaimed first
            list_add(&span->lru, &lru->lru);
        }
    }
    spin_unlock_bh(&lru->lock);
}
|
|
|
|
// Returns the span covering `offset` for `enode`, acquired for the caller
// (release with eggsfs_put_span). Returns NULL when `offset` is past EOF,
// or an ERR_PTR on failure. If the span list is not cached yet, the whole
// list is fetched from the shard and the lookup is retried.
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset) {
    struct eggsfs_inode_file* file = &enode->file;
    int err;

    eggsfs_debug("ino=%016lx, pid=%d, off=%llu getting span", enode->inode.i_ino, get_current()->pid, offset);

    trace_eggsfs_get_span_enter(enode->inode.i_ino, offset);

    #define GET_SPAN_EXIT(s) do { \
        trace_eggsfs_get_span_exit(enode->inode.i_ino, offset, IS_ERR(s) ? PTR_ERR(s) : 0); \
        return s; \
    } while(0)

    // This helps below: it means that we _must_ have a span. So if we
    // get NULL at any point, we can retry, because it means we're conflicting
    // with a reclaimer.
    if (offset >= enode->inode.i_size) { GET_SPAN_EXIT(NULL); }

    u64 iterations = 0;

retry:
    iterations++;
    // Sadly while this should be somewhat uncommon it can happen when we're busy
    // looping while another task is between inserting span in the span tree and
    // adding it to the LRU.
    if (unlikely(iterations == 10)) {
        eggsfs_warn("we've been fetching the same span for %llu iterations, we're probably stuck on a yet-to-be enabled span we just fetched", iterations);
    }

    // Check if we already have the span
    {
        struct eggsfs_span* span = lookup_and_acquire_span(enode, offset);
        if (unlikely(IS_ERR(span))) {
            int err = PTR_ERR(span);
            if (err == -EBUSY) { goto retry; } // we couldn't acquire the span
            GET_SPAN_EXIT(span); // some other error
        } else if (likely(span != NULL)) {
            GET_SPAN_EXIT(span);
        }
    }

    // We need to fetch the spans.
    err = down_write_killable(&file->spans_lock);
    if (err) { GET_SPAN_EXIT(ERR_PTR(err)); }

    // Check if somebody got to it first, in which case we can acquire it and
    // return immediately.
    {
        struct eggsfs_span* span = lookup_span(&file->spans, offset);
        if (unlikely(span)) {
            if (unlikely(!span_acquire(span))) {
                up_write(&file->spans_lock);
                goto retry;
            }
            up_write(&file->spans_lock);
            GET_SPAN_EXIT(span);
        }
    }

    // We always get the full set of spans, this simplifies prefetching (we just
    // blindly assume the span is there, and if it's been reclaimed in the meantime
    // we just don't care).
    u64 spans_offset;
    for (spans_offset = 0;;) {
        // fetch next batch of spans
        u64 next_offset;
        struct get_span_ctx ctx = { .err = 0, .enode = enode };
        INIT_LIST_HEAD(&ctx.spans);
        err = eggsfs_error_to_linux(eggsfs_shard_file_spans(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, spans_offset, &next_offset, &ctx
        ));
        err = err ?: ctx.err;
        if (unlikely(err)) {
            eggsfs_debug("failed to get file spans at %llu err=%d", spans_offset, err);
            // Free whatever spans the callbacks had already allocated.
            // Bug fix: the NULL check was missing here, so any error would
            // dereference NULL (and loop forever) once the list was drained.
            for (;;) {
                struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru);
                if (span == NULL) { break; }
                list_del(&span->lru);
                free_span(span, NULL);
            }
            up_write(&file->spans_lock);
            GET_SPAN_EXIT(ERR_PTR(err));
        }
        // add them to enode spans and LRU
        for (;;) {
            struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru);
            if (span == NULL) { break; }
            list_del(&span->lru);
            if (!insert_span(&file->spans, span)) {
                // Span is already cached
                free_span(span, NULL);
            } else {
                if (span->storage_class != EGGSFS_INLINE_STORAGE) {
                    // Not already cached, must add it to the LRU
                    eggsfs_counter_inc(eggsfs_stat_cached_spans);
                    struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
                    struct eggsfs_span_lru* lru = get_span_lru(span);
                    spin_lock_bh(&lru->lock);
                    block_span->refcount = 0; // now visible to readers
                    list_add(&span->lru, &lru->lru);
                    spin_unlock_bh(&lru->lock);
                }
            }
        }
        // stop if we're done
        if (next_offset == 0) { break; }
        spans_offset = next_offset;
    }

    up_write(&file->spans_lock);

    // We now restart, we know that the span must be there (unless the shard is broken).
    // It might get reclaimed in the meantime though.
    goto retry;

    #undef GET_SPAN_EXIT
}
|
|
|
|
// If it returns -1, there are no spans to drop. Otherwise, returns the
// next index to pass in to reclaim the next span.
// Walks at most once around all LRU shards starting at `lru_ix`, reclaims
// the first idle span it finds, and counts examined spans / dropped pages
// through the out parameters.
static int drop_one_span(int lru_ix, u64* examined_spans, u64* dropped_pages) {
    BUG_ON(lru_ix < 0 || lru_ix >= EGGSFS_SPAN_LRUS);

    int i;
    for (i = lru_ix; i < lru_ix + EGGSFS_SPAN_LRUS; i++) {
        struct eggsfs_span_lru* lru = &span_lrus[i%EGGSFS_SPAN_LRUS];

        struct eggsfs_span* candidate = NULL;
        struct eggsfs_inode* enode = NULL;

        // Pick a candidate from the LRU, mark it as reclaiming, take it out of
        // the LRU.
        spin_lock_bh(&lru->lock);
        candidate = list_first_entry_or_null(&lru->lru, struct eggsfs_span, lru);
        if (unlikely(candidate == NULL)) {
            goto unlock;
        } else {
            (*examined_spans)++;
            BUG_ON(candidate->storage_class == EGGSFS_INLINE_STORAGE); // no inline spans in LRU
            struct eggsfs_block_span* candidate_blocks = EGGSFS_BLOCK_SPAN(candidate);
            if (likely(candidate_blocks->enode)) {
                // There's a span of time between the last inode reference being dropped
                // and the eviction function being called where we do have this enode
                // here but we also don't have references to it anymore. In that case
                // we let the enode eviction take care of this (we can't do it ourselves
                // because we can't safely remove the span from the enode afterwards).
                if (likely(atomic_inc_not_zero(&candidate_blocks->enode->inode.i_count))) { // conditional ihold
                    enode = candidate_blocks->enode;
                } else {
                    candidate = NULL; // skip
                    goto unlock;
                }
            }
            // we just got this from LRU, it must be readers == 0
            BUG_ON(candidate_blocks->refcount != 0);
            candidate_blocks->refcount = -1; // mark as being reclaimed
            list_del(&candidate->lru);
        }
    unlock:
        spin_unlock_bh(&lru->lock);

        if (candidate) {
            // Take it out of the spans for the file
            if (likely(enode)) {
                down_write(&enode->file.spans_lock);
                rb_erase(&candidate->node, &enode->file.spans);
                up_write(&enode->file.spans_lock);
                iput(&enode->inode);
            }

            // At this point we're free to do what we please with the span
            // (it's fully private to us).
            free_span(candidate, dropped_pages);

            i++;
            break;
        }
    }

    // we've fully gone around without finding anything
    if (i == lru_ix + EGGSFS_SPAN_LRUS) { return -1; }

    // We've found something
    return i%EGGSFS_SPAN_LRUS;
}
|
|
|
|
// Trace helper: snapshots available memory and cache counters at the start
// of a span-drop pass of the given `type` ("all", "async", "shrinker", ...).
static inline void drop_spans_enter(const char* type) {
    trace_eggsfs_drop_spans_enter(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans));
}
|
|
|
|
// Trace helper: snapshots the same counters plus the number of pages that
// were dropped at the end of a span-drop pass.
static inline void drop_spans_exit(const char* type, u64 dropped_pages) {
    trace_eggsfs_drop_spans_exit(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans), dropped_pages);
}
|
|
|
|
// returns the number of dropped pages
|
|
u64 eggsfs_drop_all_spans(void) {
|
|
drop_spans_enter("all");
|
|
|
|
u64 dropped_pages = 0;
|
|
u64 examined_spans = 0;
|
|
s64 spans_begin = eggsfs_counter_get(&eggsfs_stat_cached_spans);
|
|
int lru_ix = 0;
|
|
for (;;) {
|
|
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
|
|
if (lru_ix < 0) { break; }
|
|
}
|
|
s64 spans_end = eggsfs_counter_get(&eggsfs_stat_cached_spans);
|
|
eggsfs_info("reclaimed %llu pages, %lld spans (approx)", dropped_pages, spans_begin-spans_end);
|
|
drop_spans_exit("all", dropped_pages);
|
|
return dropped_pages;
|
|
}
|
|
|
|
static DEFINE_MUTEX(drop_spans_mu);
|
|
|
|
static u64 drop_spans(const char* type) {
|
|
mutex_lock(&drop_spans_mu);
|
|
|
|
drop_spans_enter(type);
|
|
|
|
u64 dropped_pages = 0;
|
|
u64 examined_spans = 0;
|
|
int lru_ix = 0;
|
|
for (;;) {
|
|
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
|
|
if (
|
|
pages*PAGE_SIZE < eggsfs_span_cache_max_size_drop &&
|
|
si_mem_available()*PAGE_SIZE > eggsfs_span_cache_min_avail_mem_drop
|
|
) {
|
|
break;
|
|
}
|
|
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
|
|
if (lru_ix < 0) { break; }
|
|
}
|
|
eggsfs_debug("dropped %llu pages, %llu spans examined", dropped_pages, examined_spans);
|
|
|
|
drop_spans_exit(type, dropped_pages);
|
|
|
|
mutex_unlock(&drop_spans_mu);
|
|
|
|
return dropped_pages;
|
|
}
|
|
|
|
// Async reclaim entry point, scheduled through `reclaim_spans_work`.
// The mutex_is_locked() check is a best-effort (racy) skip: if a drop is
// already in flight we bail out; drop_spans() itself takes the mutex, so
// this is an optimization, not a correctness requirement.
static void reclaim_spans_async(struct work_struct* work) {
    if (!mutex_is_locked(&drop_spans_mu)) { // somebody's already taking care of it
        drop_spans("async");
    }
}

static DECLARE_WORK(reclaim_spans_work, reclaim_spans_async);
|
|
|
|
static unsigned long eggsfs_span_shrinker_count(struct shrinker* shrinker, struct shrink_control* sc) {
|
|
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
|
|
if (pages == 0) { return SHRINK_EMPTY; }
|
|
return pages;
|
|
}
|
|
|
|
// Shard index at which the next shrinker round resumes scanning.
static int eggsfs_span_shrinker_lru_ix = 0;
static u64 eggsfs_span_shrinker_pages_round = 25600; // We drop at most 100MiB in one shrinker round
|
|
|
|
static unsigned long eggsfs_span_shrinker_scan(struct shrinker* shrinker, struct shrink_control* sc) {
|
|
drop_spans_enter("shrinker");
|
|
u64 dropped_pages = 0;
|
|
u64 examined_spans = 0;
|
|
int lru_ix = eggsfs_span_shrinker_lru_ix;
|
|
for (dropped_pages = 0; dropped_pages < eggsfs_span_shrinker_pages_round;) {
|
|
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
|
|
if (lru_ix < 0) { break; }
|
|
}
|
|
eggsfs_span_shrinker_lru_ix = lru_ix >= 0 ? lru_ix : 0;
|
|
drop_spans_exit("shrinker", dropped_pages);
|
|
return dropped_pages;
|
|
}
|
|
|
|
// Registers the span cache with the kernel's memory-pressure shrinker
// machinery (count = how much we hold, scan = reclaim some of it).
static struct shrinker eggsfs_span_shrinker = {
    .count_objects = eggsfs_span_shrinker_count,
    .scan_objects = eggsfs_span_shrinker_scan,
    .seeks = DEFAULT_SEEKS,
    // NOTE(review): SHRINKER_NONSLAB appears intended because we cache raw
    // pages rather than slab objects -- confirm against the shrinker API of
    // the targeted kernel version.
    .flags = SHRINKER_NONSLAB,
};
|
|
|
|
// Drops all cached spans of one file (called when the inode goes away).
// Spans currently being reclaimed force a full retry; spans still held by
// readers are detached from the inode and left to a later reclaim pass.
void eggsfs_drop_file_spans(struct eggsfs_inode* enode) {
again:
    down_write(&enode->file.spans_lock);

    for (;;) {
        struct rb_node* node = rb_first(&enode->file.spans);
        if (node == NULL) { break; } // tree exhausted

        struct eggsfs_span* span = rb_entry(node, struct eggsfs_span, node);
        struct eggsfs_block_span* block_span = NULL;
        bool can_free_span = true;
        if (span->storage_class != EGGSFS_INLINE_STORAGE) {
            block_span = EGGSFS_BLOCK_SPAN(span);
            struct eggsfs_span_lru* lru = get_span_lru(span);
            spin_lock_bh(&lru->lock);
            if (unlikely(block_span->refcount < 0)) {
                // This is being reclaimed, we wait until the reclaimer cleans
                // it up for us. This is very unlikely: the reclaiming would have
                // had to start before the last inode reference was dropped.
                spin_unlock_bh(&lru->lock);
                up_write(&enode->file.spans_lock);
                goto again;
            } else if (unlikely(block_span->refcount > 0)) {
                // Note that the assumption here is that the reader will correctly put
                // the span back in the LRU once done.
                eggsfs_warn("span at %llu for inode %016lx still has %d readers, will linger on until next reclaim", span->start, enode->inode.i_ino, block_span->refcount);
                can_free_span = false;
            } else {
                // Make ourselves the reclaimer. Note that we could just let the
                // reclaimer take care of it eventually, but this is the very
                // common case and it'll be faster.
                list_del(&span->lru);
                block_span->refcount = -1;
            }
            block_span->enode = NULL; // detach from the dying inode
            spin_unlock_bh(&lru->lock);
        }

        rb_erase(node, &enode->file.spans);

        if (can_free_span) { free_span(span, NULL); }
    }

    up_write(&enode->file.spans_lock);
}
|
|
|
|
// Bookkeeping for one in-flight stripe fetch, shared by all of its block
// requests. Lifetime is managed by `refcount`: one reference per outstanding
// block request plus one for the initiator.
struct fetch_stripe_state {
    // ihold'd
    struct eggsfs_inode *enode;
    struct semaphore sema; // used to wait on every block being finished. could be done faster with a wait_queue, but don't want to worry about SMP subtleties.
    s64 stripe_seqno; // used to release the stripe when we're done with it
    struct eggsfs_block_span* span; // we hold this (via span acquire) until we're done.
    // these will be filled in by the fetching (only D of them will be
    // filled by fetching the blocks, the others might be filled in by the RS
    // recovery)
    struct list_head blocks_pages[EGGSFS_MAX_BLOCKS];
    atomic_t refcount; // to garbage collect
    atomic_t err; // the result of the stripe fetching
    static_assert(EGGSFS_MAX_BLOCKS <= 16);
    // Three 16-bit per-block bitmaps packed into one atomic u64 so that
    // state transitions can be done with a single cmpxchg:
    // 00-16: blocks which are downloading.
    // 16-32: blocks which have succeeded.
    // 32-48: blocks which have failed.
    atomic64_t blocks;
    u8 stripe; // which stripe we're into
    u8 prefetching; // used for logging
};
|
|
|
|
// Accessors for the three 16-bit bitmaps packed into
// fetch_stripe_state.blocks (see the layout comment on the struct). The
// SET/UNSET variants return the new value; they do not modify in place.
#define DOWNLOADING_SHIFT 0
#define DOWNLOADING_MASK (0xFFFFull<<DOWNLOADING_SHIFT)
#define DOWNLOADING_GET(__b) ((__b>>DOWNLOADING_SHIFT) & 0xFFFFull)
#define DOWNLOADING_SET(__b, __i) (__b | 1ull<<(__i+DOWNLOADING_SHIFT))
#define DOWNLOADING_UNSET(__b, __i) (__b & ~(1ull<<(__i+DOWNLOADING_SHIFT)))

#define SUCCEEDED_SHIFT 16
#define SUCCEEDED_MASK (0xFFFFull<<SUCCEEDED_SHIFT)
#define SUCCEEDED_GET(__b) ((__b>>SUCCEEDED_SHIFT) & 0xFFFFull)
#define SUCCEEDED_SET(__b, __i) (__b | 1ull<<(__i+SUCCEEDED_SHIFT))
#define SUCCEEDED_UNSET(__b, __i) (__b & ~(1ull<<(__i+SUCCEEDED_SHIFT)))

#define FAILED_SHIFT 32
#define FAILED_MASK (0xFFFFull<<FAILED_SHIFT)
#define FAILED_GET(__b) ((__b>>FAILED_SHIFT) & 0xFFFFull)
#define FAILED_SET(__b, __i) (__b | 1ull<<(__i+FAILED_SHIFT))
#define FAILED_UNSET(__b, __i) (__b & ~(1ull<<(__i+FAILED_SHIFT)))
|
|
|
|
static void init_fetch_stripe(void* p) {
|
|
struct fetch_stripe_state* st = (struct fetch_stripe_state*)p;
|
|
|
|
sema_init(&st->sema, 0);
|
|
int i;
|
|
for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) {
|
|
INIT_LIST_HEAD(&st->blocks_pages[i]);
|
|
}
|
|
}
|
|
|
|
// Allocates and initializes the shared state for fetching one stripe.
// Returns NULL on allocation failure. The state starts with refcount 1
// (owned by the caller) and takes its own reference on the span.
// This must be called while already holding the span, which means
// that span_acquire will always succeed.
static struct fetch_stripe_state* new_fetch_stripe(
    struct eggsfs_inode* enode,
    struct eggsfs_block_span* span,
    u8 stripe,
    u64 stripe_seqno,
    bool prefetching
) {
    struct fetch_stripe_state* st = (struct fetch_stripe_state*)kmem_cache_alloc(eggsfs_fetch_stripe_cachep, GFP_KERNEL);
    if (!st) { return st; }

    // Cannot fail, see comment above.
    BUG_ON(!span_acquire(&span->span));

    st->enode = enode;
    st->stripe_seqno = stripe_seqno;
    st->span = span;
    atomic_set(&st->refcount, 1); // the caller
    atomic_set(&st->err, 0);
    st->stripe = stripe;
    st->prefetching = prefetching;
    atomic64_set(&st->blocks, 0);

    eggsfs_in_flight_begin(enode);

    return st;
}
|
|
|
|
// Emits a tracepoint describing a stripe-fetch event (start, block done,
// end, free, ...) for the given fetch state.
static void fetch_stripe_trace(struct fetch_stripe_state* st, u8 event, s8 block, int err) {
    struct eggsfs_block_span* span = st->span;
    trace_eggsfs_fetch_stripe(st->enode->inode.i_ino, span->span.start, st->stripe,
                              span->parity, st->prefetching, event, block, err);
}
|
|
|
|
// Tears down a fetch state once its last reference is dropped: releases the
// stripe latch, resets the semaphore and page lists so the slab-cached
// object is pristine for its next user, and drops our span/in-flight refs.
static void free_fetch_stripe(struct fetch_stripe_state* st) {
    // we're the last ones here
    fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_FREE, -1, atomic_read(&st->err));

    // Note that we release this here, rather than when `remaining == 0`, since
    // there's no guarantee that we reach `remaining == 0` (we might fail to start
    // enough requests)
    eggsfs_latch_release(&st->span->stripe_latches[st->stripe], st->stripe_seqno);

    while (down_trylock(&st->sema) == 0) {} // reset sema to zero for next usage

    // Free leftover blocks (also ensures the lists are all nice and empty for the
    // next user)
    int i;
    for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) {
        put_pages_list(&st->blocks_pages[i]);
    }

    eggsfs_in_flight_end(st->enode);

    // We don't need the span anymore
    eggsfs_put_span(&st->span->span, !st->prefetching);

    kmem_cache_free(eggsfs_fetch_stripe_cachep, st);
}
|
|
|
|
static void put_fetch_stripe(struct fetch_stripe_state* st) {
|
|
if (atomic_dec_return(&st->refcount) > 0) { return; }
|
|
free_fetch_stripe(st);
|
|
}
|
|
|
|
// Takes an extra reference on the fetch state. The WARN catches the illegal
// case of resurrecting a state whose refcount had already dropped to zero.
static void hold_fetch_stripe(struct fetch_stripe_state* st) {
    WARN_ON(atomic_inc_return(&st->refcount) < 2);
}
|
|
|
|
// Called once exactly D data blocks of the stripe have been fetched: if the
// successful blocks are not the first D, missing data blocks are rebuilt via
// RS recovery; then the pages are CRC-checked against the stripe CRC and
// published into the span's page xarray. Errors are recorded in st->err.
static void store_pages(struct fetch_stripe_state* st) {
    struct eggsfs_block_span* span = st->span;
    int D = eggsfs_data_blocks(span->parity);

    int i, j;

    // we need to recover data using RS
    u16 blocks = SUCCEEDED_GET(atomic64_read(&st->blocks));
    eggsfs_debug("blocks=%d", blocks);
    BUG_ON(__builtin_popcountll(blocks) != D); // caller guarantees exactly D successes
    if (unlikely(blocks != (1ull<<D)-1)) { // some data block is missing: reconstruct it
        u32 pages = 0;
        { // get the number of pages
            struct list_head* tmp;
            // eggsfs_info("picking %d", __builtin_ctz(st->blocks));
            list_for_each(tmp, &st->blocks_pages[__builtin_ctz(blocks)]) { // just pick the first one to count pages
                pages++;
            }
        }
        for (i = 0; i < D; i++) { // fill in missing data blocks
            if ((1ull<<i) & blocks) { continue; } // we have this one already
            // eggsfs_info("recovering %d, pages=%u", i, pages);
            // allocate the pages
            struct list_head* i_pages = &st->blocks_pages[i];
            for (j = 0; j < pages; j++) {
                struct page* page = alloc_page(GFP_KERNEL);
                if (!page) {
                    atomic_set(&st->err, -ENOMEM);
                    return;
                }
                list_add_tail(&page->lru, i_pages);
            }
            // compute
            int err = eggsfs_recover(span->parity, blocks, 1ull<<i, pages, st->blocks_pages);
            if (err) {
                atomic_set(&st->err, err);
                return;
            }
        }
    }

    // Page index range of this stripe within the whole span.
    u32 start_page = (span->cell_size/PAGE_SIZE)*D*st->stripe;
    u32 end_page = (span->cell_size/PAGE_SIZE)*D*((int)st->stripe + 1);
    u32 curr_page = start_page;
    u32 crc = 0;
    kernel_fpu_begin(); // crc
    // NOTE(review): xa_store(..., GFP_KERNEL) below may sleep while we are
    // inside a kernel_fpu_begin() section -- confirm this is safe on the
    // targeted kernels.
    // move the pages to the span
    for (i = 0; i < D; i++) {
        struct page* page;
        struct page* tmp;
        list_for_each_entry_safe(page, tmp, &st->blocks_pages[i], lru) {
            list_del(&page->lru);
            // eggsfs_info("storing page for span %p at %u", span, curr_page);
            char* page_buf = kmap_atomic(page);
            crc = eggsfs_crc32c(crc, page_buf, PAGE_SIZE);
            kunmap_atomic(page_buf);
            struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
            if (IS_ERR(old_page)) {
                atomic_set(&st->err, PTR_ERR(old_page));
                eggsfs_debug("xa_store failed: %ld", PTR_ERR(old_page));
                put_page(page); // we removed it from the list
                kernel_fpu_end();
                return;
            }
            BUG_ON(old_page != NULL); // the latch protects against this
            atomic64_inc(&eggsfs_stat_cached_span_pages);
            curr_page++;
        }
        // eggsfs_info("i=%d curr_page=%u", i, curr_page);
    }
    kernel_fpu_end();
    if (curr_page != end_page) {
        eggsfs_info("curr_page=%u end_page=%u", curr_page, end_page);
    }
    BUG_ON(curr_page != end_page);
    if (crc != span->stripes_crc[st->stripe]) {
        eggsfs_warn("bad crc for file %016lx, stripe %u, expected %08x, got %08x", st->enode->inode.i_ino, st->stripe, span->stripes_crc[st->stripe], crc);
        atomic_set(&st->err, -EIO);
    }
}
|
|
|
|
static void block_done(void* data, u64 block_id, struct list_head* pages, int err);
|
|
|
|
// Starts loading up D blocks as necessary.
// Scans the packed downloading/succeeded/failed bitmaps and launches block
// fetches until D blocks are in flight or done, or until more than P blocks
// have failed (at which point the stripe is unrecoverable). Completions re-
// enter through block_done(), which may call this again to replace failures.
static int fetch_blocks(struct fetch_stripe_state* st) {
    int i;
    int err = 0;

    struct eggsfs_block_span* span = st->span;
    int D = eggsfs_data_blocks(span->parity);
    int P = eggsfs_parity_blocks(span->parity);
    int B = eggsfs_blocks(span->parity);

#define LOG_STR "span=%llu file=%016llx D=%d P=%d downloading=%04x(%llu) succeeded=%04x(%llu) failed=%04x(%llu) err=%d"
#define LOG_ARGS span->span.start, span->span.ino, D, P, downloading, __builtin_popcountll(downloading), succeeded, __builtin_popcountll(succeeded), failed, __builtin_popcountll(failed), err

    hold_fetch_stripe(st); // extra hold to make sure that the stripe survives this loop
    for (;;) {
        s64 blocks = atomic64_read(&st->blocks);
        u16 downloading = DOWNLOADING_GET(blocks);
        u16 succeeded = SUCCEEDED_GET(blocks);
        u16 failed = FAILED_GET(blocks);
        if (__builtin_popcountll(downloading)+__builtin_popcountll(succeeded) >= D) { // we managed to get out of the woods
            eggsfs_debug("we have enough blocks, terminating " LOG_STR, LOG_ARGS);
            goto out;
        }
        if (__builtin_popcountll(failed) > P) { // nowhere to go from here but tears
            eggsfs_debug("we're out of blocks, giving up " LOG_STR, LOG_ARGS);
            err = err ?: -EIO;
            goto out;
        }

        // find the next block to download
        for (i = 0; i < B; i++) {
            if (!((1ull<<i) & (downloading|failed|succeeded))) { break; }
        }
        BUG_ON(i == B); // something _must_ be there given the checks above

        eggsfs_debug("next block to be fetched is %d " LOG_STR, i, LOG_ARGS);

        // try to set ourselves as selected
        if (atomic64_cmpxchg(&st->blocks, blocks, DOWNLOADING_SET(blocks, i)) != blocks) {
            eggsfs_debug("skipping %d " LOG_STR, i, LOG_ARGS);
            // somebody else got here first (can happen if some blocks complete before this loop finishes) first, keep going
            continue;
        }

        // ok, we're responsible for this now, start
        struct eggsfs_block* block = &st->span->blocks[i];
        u32 block_offset = st->stripe * span->cell_size;
        fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_START, i, 0);
        hold_fetch_stripe(st);
        // NOTE(review): this inner `err` shadows the outer one, so a failure
        // to start a fetch is only surfaced if it later pushes us past P
        // failed blocks -- confirm this is intended.
        int err = eggsfs_fetch_block(
            &block_done,
            (void*)st,
            &block->bs, block->id, block_offset, span->cell_size
        );
        if (err) {
            put_fetch_stripe(st);
            eggsfs_debug("loading block failed %d " LOG_STR, i, LOG_ARGS);
            fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, err);
            // mark it as failed and not downloading
            blocks = atomic64_read(&st->blocks);
            while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
        } else {
            eggsfs_debug("loading block %d " LOG_STR, i, LOG_ARGS);
        }
    }

out:
    put_fetch_stripe(st);
    return err;

#undef LOG_STR
#undef LOG_ARGS
}
|
|
|
|
// Completion callback for one block fetch. On failure, flips the block's
// bitmap state and calls fetch_blocks() to start a replacement; on success,
// steals the fetched pages. Whichever completion brings the success count to
// D runs store_pages() and wakes the waiter. Always drops this request's
// reference on the fetch state.
static void block_done(void* data, u64 block_id, struct list_head* pages, int err) {
    int i;

    eggsfs_debug("block complete %016llx", block_id);
    struct fetch_stripe_state* st = (struct fetch_stripe_state*)data;
    struct eggsfs_block_span* span = st->span;
    int B = eggsfs_blocks(span->parity);
    int D = eggsfs_data_blocks(span->parity);

    // Map the block id back to its index within the span.
    for (i = 0; i < B; i++) {
        if (st->span->blocks[i].id == block_id) { break; }
    }
    // eggsfs_info("block ix %d, B=%d", i, B);
    BUG_ON(i == B);

    fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, err);

    bool finished = false;
    if (err) {
        // it failed, try to restore order
        eggsfs_debug("block %016llx ix %d failed with %d", block_id, i, err);
        // mark it as failed and not downloading
        s64 blocks = atomic64_read(&st->blocks);
        while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
        err = fetch_blocks(st);
        // if we failed to restore order, mark the whole thing as failed
        if (err) { atomic_cmpxchg(&st->err, 0, err); }
    } else {
        // mark as fetched, see if we're the last ones
        s64 blocks = atomic64_read(&st->blocks);
        while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, SUCCEEDED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
        finished = __builtin_popcountll(SUCCEEDED_GET(SUCCEEDED_SET(blocks, i))) == D;
        // it succeeded, grab all the pages
        struct page* page;
        struct page* tmp;
        list_for_each_entry_safe(page, tmp, pages, lru) {
            list_del(&page->lru);
            list_add_tail(&page->lru, &st->blocks_pages[i]);
        }
    }

    if (finished) {
        eggsfs_debug("finished");
        // we're the last one to finish, we need to store the pages in the span
        store_pages(st);
        // and wake up waiters, if any
        up(&st->sema);
        fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_END, -1, atomic_read(&st->err));
    }
    put_fetch_stripe(st); // one refcount per request
    eggsfs_debug("done");
}
|
|
|
|
// Allocates a fetch_stripe_state for (span, stripe) and kicks off the block
// downloads. `stripe_seqno` is the seqno of the stripe latch the caller has
// already acquired (released by the fetch machinery on completion).
// Returns the state (caller holds a reference) or an ERR_PTR on failure.
static struct fetch_stripe_state* start_fetch_stripe(
    struct eggsfs_block_span* span, u8 stripe, s64 stripe_seqno, bool prefetching
) {
    int err;

    BUG_ON(!span->enode); // must still be alive here
    struct eggsfs_inode* enode = span->enode;

    trace_eggsfs_fetch_stripe(
        enode->inode.i_ino,
        span->span.start,
        stripe,
        span->parity,
        prefetching,
        EGGSFS_FETCH_STRIPE_START,
        -1,
        0
    );

    struct fetch_stripe_state* st = NULL;

    // Sanity-check the parity before using it to size per-block arrays.
    int D = eggsfs_data_blocks(span->parity);
    int B = eggsfs_blocks(span->parity);
    if (D > EGGSFS_MAX_DATA || B > EGGSFS_MAX_BLOCKS) {
        eggsfs_error("got out of bounds parity of RS(%d,%d) for span %llu in file %016lx", D, B-D, span->span.start, enode->inode.i_ino);
        err = -EIO;
        goto out_err;
    }

    // NOTE(review): the return value of new_fetch_stripe is not checked; if it
    // can fail (e.g. slab allocation), fetch_blocks below would dereference a
    // bad pointer — confirm that new_fetch_stripe cannot return NULL/ERR_PTR.
    st = new_fetch_stripe(enode, span, stripe, stripe_seqno, prefetching);

    // start fetching blocks
    eggsfs_debug("fetching blocks");
    err = fetch_blocks(st);
    if (err) { goto out_err; }

    eggsfs_debug("fetch stripe started");
    return st;

out_err:
    fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_END, -1, err);
    if (st) { put_fetch_stripe(st); }
    return ERR_PTR(err);
}
|
|
|
|
// Best-effort prefetch of the stripe containing file offset `off`, used to
// warm the cache in the direction of sequential access. Gives up silently on
// any contention, reclaim race, or error. `off` may fall outside `block_span`,
// in which case the containing span is looked up first.
static void do_prefetch(struct eggsfs_block_span* block_span, u64 off) {
    BUG_ON(!block_span->enode); // must still be alive at this point
    struct eggsfs_inode* enode = block_span->enode;

    if (off < block_span->span.start || off >= block_span->span.end) {
        // Another span, we need to load it. We give up if we can't at any occurrence.
        struct eggsfs_span* span = lookup_and_acquire_span(enode, off);
        if (unlikely(span == NULL || IS_ERR(span))) { return; }
        if (span->storage_class == EGGSFS_INLINE_STORAGE) {
            // NOTE(review): we return while still holding whatever reference
            // lookup_and_acquire_span took — check whether inline spans need a
            // matching put here.
            return; // nothing to do, it's an inline span
        }
        block_span = EGGSFS_BLOCK_SPAN(span);
    } else if (unlikely(!span_acquire(&block_span->span))) {
        return; // it's being reclaimed, don't bother
    }

    // At this point we've acquired the span. We now need to check if there's something to get

    int D = eggsfs_data_blocks(block_span->parity);
    u32 page_ix = (off - block_span->span.start)/PAGE_SIZE;
    u32 span_offset = page_ix * PAGE_SIZE;
    u32 stripe = span_offset / (block_span->cell_size*D);
    // NOTE(review): `>` vs `>=` — confirm whether `stripes` is a count or the
    // last valid index (eggsfs_get_span_page uses the same comparison).
    if (stripe > block_span->stripes) {
        eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, block_span->stripes);
        goto out_span;
    }

    struct page* page = xa_load(&block_span->pages, page_ix);
    if (page != NULL) { // already there
        goto out_span;
    }

    s64 seqno;
    if (!eggsfs_latch_try_acquire(&block_span->stripe_latches[stripe], seqno)) { // somebody else is working on it
        goto out_span;
    }

    // Re-check under the latch: the page may have appeared in the meantime.
    page = xa_load(&block_span->pages, page_ix);
    if (page != NULL) { // somebody got to it first
        eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
        goto out_span;
    }

    // Finally start the work.
    struct fetch_stripe_state* st = start_fetch_stripe(block_span, stripe, seqno, true);
    if (IS_ERR(st)) {
        eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
        eggsfs_debug("prefetch failed err=%ld", PTR_ERR(st));
        goto out_span;
    }

    // We want the state to go once all the block fetches are done.
    put_fetch_stripe(st);

out_span:
    // The fetch state has ownership itself
    eggsfs_put_span(&block_span->span, false);
    return;
}
|
|
|
|
// Returns whether we're low on memory
|
|
static bool reclaim_after_fetch_stripe(void) {
|
|
// Reclaim pages if we went over the limit
|
|
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
|
|
u64 free_pages = si_mem_available();
|
|
bool low_on_memory = false;
|
|
if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_sync || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_sync) {
|
|
low_on_memory = true;
|
|
// sync dropping, apply backpressure
|
|
if (!mutex_lock_killable(&drop_spans_mu)) {
|
|
drop_spans("sync");
|
|
mutex_unlock(&drop_spans_mu);
|
|
}
|
|
} else if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_async || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_async) {
|
|
low_on_memory = true;
|
|
// don't bother submitting if another span dropper is running already
|
|
if (!mutex_is_locked(&drop_spans_mu)) {
|
|
// TODO Is it a good idea to do it on system_long_wq rather than eggsfs_wq? The freeing
|
|
// job might face heavy contention, so maybe yes?
|
|
// On the other end, this might make it harder to know when it's safe to unload
|
|
// the module by making sure that there's no work left on the queue.
|
|
queue_work(system_long_wq, &reclaim_spans_work);
|
|
}
|
|
}
|
|
return low_on_memory;
|
|
}
|
|
|
|
// Heuristic read-ahead. A per-file "prefetch section" (packed begin/end pair
// in file->prefetch_section) tracks the region the reader has been touching;
// when a fetched stripe pushes against a boundary of that section we prefetch
// the next (or previous) stripe. [stripe_start, stripe_end) are the file
// offsets of the stripe that was just fetched; `low_on_memory` (from
// reclaim_after_fetch_stripe) disables prefetching entirely.
//
// NOTE(review): the section stores 32-bit offsets, so offsets past 4GiB
// truncate on assignment below (present in the original too) — verify intent.
static void maybe_prefetch(
    struct eggsfs_block_span* span, bool low_on_memory, u64 stripe_start, u64 stripe_end
) {
    BUG_ON(!span->enode); // must still be alive here
    struct eggsfs_inode* enode = span->enode;

    eggsfs_debug("file=%016llx eggsfs_prefetch=%d low_on_memory=%d stripe_start=%llu stripe_end=%llu", span->span.ino, (int)eggsfs_prefetch, (int)low_on_memory, stripe_start, stripe_end);

    if (!eggsfs_prefetch || low_on_memory) { return; }

    struct eggsfs_inode_file* file = &enode->file;
    u64 prefetch_section = atomic64_read(&file->prefetch_section);
    u32 prefetch_section_begin = prefetch_section & (uint32_t)(-1);
    u32 prefetch_section_end = (prefetch_section>>32) & (uint32_t)(-1);
    bool prefetch = false;
    u64 off = 0;
    if (stripe_start > prefetch_section_begin && stripe_end < prefetch_section_end) {
        // we're comfortably inside, nothing to do.
        // (Bug fix: this was `} if (...)` — the missing `else` let in-bounds
        // reads fall through into the overlap branch and prefetch spuriously.)
        eggsfs_debug("within bounds, not prefetching");
        return;
    } else if (stripe_end < prefetch_section_begin || stripe_start > prefetch_section_end) {
        eggsfs_debug("out of bounds, resetting prefetch");
        // We're totally out of bounds: reset the section to this stripe. The
        // reset is persisted via the cmpxchg below (previously the new bounds
        // were computed but never stored, making the reset a no-op), so later
        // reads in the new region can push its boundaries.
        prefetch_section_begin = stripe_start;
        prefetch_section_end = stripe_end;
    } else {
        // there is some overlap, but we're pushing the boundary, prefetch the next or previous thing
        if (stripe_end > prefetch_section_end) { // going rightwards
            eggsfs_debug("prefetching forwards");
            prefetch_section_end = stripe_end;
            off = stripe_end;
        } else { // going leftwards
            eggsfs_debug("prefetching backwards");
            prefetch_section_begin = stripe_start;
            off = stripe_start-1;
        }
        prefetch = true;
    }
    u64 new_prefetch_section = prefetch_section_begin | ((u64)prefetch_section_end << 32);
    // Only one of the racing readers wins the update (and gets to prefetch);
    // losers just drop their work.
    if (likely(atomic64_cmpxchg(&file->prefetch_section, prefetch_section, new_prefetch_section) == prefetch_section)) {
        if (prefetch) { do_prefetch(span, off); }
    }
}
|
|
|
|
// Returns the page at `page_ix` within the span, fetching the containing
// stripe from the block services if it isn't cached yet. Returns the page or
// an ERR_PTR. On a successful return the exit macro may also trigger
// read-ahead via maybe_prefetch; cache reclaim runs after every actual fetch.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* block_span, u32 page_ix) {
    struct eggsfs_span* span = &block_span->span;

    trace_eggsfs_get_span_page_enter(span->ino, span->start, page_ix*PAGE_SIZE);

    // We need to load the stripe
    int D = eggsfs_data_blocks(block_span->parity);
    u32 span_offset = page_ix * PAGE_SIZE;
    u32 stripe = span_offset / (block_span->cell_size*D);
    bool low_on_memory = false;
    int err = 0;

    // Single exit point: on success, consider prefetching the stripe bounds we
    // just touched, then emit the exit tracepoint and return. (No comments
    // inside the macro: `//` would swallow the continuation backslashes.)
#define GET_PAGE_EXIT(p) do { \
        int __err = IS_ERR(p) ? PTR_ERR(p) : 0; \
        if (likely(__err == 0)) { \
            maybe_prefetch(block_span, low_on_memory, span->start + stripe*(block_span->cell_size*D), span->start + (stripe+1)*(block_span->cell_size*D)); \
        } \
        trace_eggsfs_get_span_page_exit(span->ino, span->start, page_ix*PAGE_SIZE, __err); \
        return p; \
    } while(0)

    // TODO better error?
    // NOTE(review): `>` vs `>=` — confirm whether `stripes` is a count or the
    // last valid index (do_prefetch uses the same comparison).
    if (stripe > block_span->stripes) {
        eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, block_span->stripes);
        GET_PAGE_EXIT(ERR_PTR(-EIO));
    }

    // this should be guaranteed by the caller but we rely on it below, so let's check
    if (block_span->cell_size%PAGE_SIZE != 0) {
        eggsfs_warn("cell_size=%u, PAGE_SIZE=%lu, block_span->cell_size%%PAGE_SIZE=%lu", block_span->cell_size, PAGE_SIZE, block_span->cell_size%PAGE_SIZE);
        GET_PAGE_EXIT(ERR_PTR(-EIO));
    }

    struct page* page;

again:
    // Fast path: the page is already cached in the span's xarray.
    page = xa_load(&block_span->pages, page_ix);
    if (page != NULL) { GET_PAGE_EXIT(page); }

    // There is no matching release here because the latch is released
    // when the fetchers finish (even if this call is interrupted)
    s64 seqno;
    if (!eggsfs_latch_try_acquire(&block_span->stripe_latches[stripe], seqno)) {
        // Somebody else is already fetching this stripe: wait for them, then
        // retry the lookup. (This `err` deliberately shadows the outer one.)
        int err = eggsfs_latch_wait_killable(&block_span->stripe_latches[stripe], seqno);
        if (err) { GET_PAGE_EXIT(ERR_PTR(err)); }
        goto again;
    }

    // Re-check under the latch before starting a fetch ourselves.
    page = xa_load(&block_span->pages, page_ix);
    if (page != NULL) {
        // somebody got to it first
        eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
        GET_PAGE_EXIT(page);
    }

    eggsfs_debug("about to start fetch stripe");
    struct fetch_stripe_state* st = NULL;
    st = start_fetch_stripe(block_span, stripe, seqno, false);
    if (IS_ERR(st)) {
        err = PTR_ERR(st);
        st = NULL;
        goto out_err;
    }
    eggsfs_debug("started fetch stripe, waiting");

    // Wait for all the block requests
    err = down_killable(&st->sema);
    if (err) { goto out_err; }

    eggsfs_debug("done");

    err = atomic_read(&st->err);
    if (err) { goto out_err; }

    // We're finally done
    page = xa_load(&block_span->pages, page_ix);
    // eggsfs_info("loaded page %p for span %p at %u", page, span, page_ix);
    BUG_ON(page == NULL);

out:
    if (st != NULL) { put_fetch_stripe(st); }

    // Reclaim if needed
    low_on_memory = reclaim_after_fetch_stripe();

    // On the error paths `page` is never read (err != 0 selects the ERR_PTR).
    GET_PAGE_EXIT(err == 0 ? page : ERR_PTR(err));

out_err:
    eggsfs_debug("getting span page failed, err=%d", err);
    goto out;

#undef GET_PAGE_EXIT

}
|
|
|
|
int eggsfs_span_init(void) {
|
|
int i;
|
|
for (i = 0; i < EGGSFS_SPAN_LRUS; i++) {
|
|
spin_lock_init(&span_lrus[i].lock);
|
|
INIT_LIST_HEAD(&span_lrus[i].lru);
|
|
}
|
|
|
|
int err = register_shrinker(&eggsfs_span_shrinker);
|
|
if (err) { return err; }
|
|
|
|
eggsfs_block_span_cachep = kmem_cache_create(
|
|
"eggsfs_block_span_cache",
|
|
sizeof(struct eggsfs_block_span),
|
|
0,
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
|
|
&init_block_span
|
|
);
|
|
if (!eggsfs_block_span_cachep) {
|
|
err = -ENOMEM;
|
|
goto out_shrinker;
|
|
}
|
|
|
|
eggsfs_inline_span_cachep = kmem_cache_create(
|
|
"eggsfs_inline_span_cache",
|
|
sizeof(struct eggsfs_inline_span),
|
|
0,
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
|
|
NULL
|
|
);
|
|
if (!eggsfs_block_span_cachep) {
|
|
err = -ENOMEM;
|
|
goto out_block;
|
|
}
|
|
|
|
eggsfs_fetch_stripe_cachep = kmem_cache_create(
|
|
"eggsfs_fetch_stripe_cache",
|
|
sizeof(struct fetch_stripe_state),
|
|
0,
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
|
|
&init_fetch_stripe
|
|
);
|
|
if (!eggsfs_fetch_stripe_cachep) {
|
|
err = -ENOMEM;
|
|
goto out_inline;
|
|
}
|
|
|
|
return 0;
|
|
|
|
out_inline:
|
|
kmem_cache_destroy(eggsfs_fetch_stripe_cachep);
|
|
out_block:
|
|
kmem_cache_destroy(eggsfs_block_span_cachep);
|
|
out_shrinker:
|
|
unregister_shrinker(&eggsfs_span_shrinker);
|
|
return err;
|
|
}
|
|
|
|
void eggsfs_span_exit(void) {
|
|
kmem_cache_destroy(eggsfs_fetch_stripe_cachep);
|
|
kmem_cache_destroy(eggsfs_inline_span_cachep);
|
|
kmem_cache_destroy(eggsfs_block_span_cachep);
|
|
unregister_shrinker(&eggsfs_span_shrinker);
|
|
}
|