// ternfs-XTXMarkets/kmod/span.c

#include <linux/module.h>
#include "bincode.h"
#include "inode.h"
#include "log.h"
#include "span.h"
#include "metadata.h"
#include "rs.h"
#include "latch.h"
#include "crc.h"
#include "err.h"
#include "counter.h"
#include "wq.h"
#include "trace.h"
#include "intrshims.h"
#include "sysctl.h"
EGGSFS_DEFINE_COUNTER(eggsfs_stat_cached_spans);
// This currently also includes in-flight pages (i.e. pages that cannot be
// reclaimed), just because the code is a bit simpler this way.
atomic64_t eggsfs_stat_cached_span_pages = ATOMIC64_INIT(0);
// These numbers do not mean anything in particular. We do want to avoid
// flickering sync drops though, therefore once we cross 50GiB we drop
// down to 45GiB, and similarly for free memory.
unsigned long eggsfs_span_cache_max_size_async = (50ull << 30); // 50GiB
unsigned long eggsfs_span_cache_min_avail_mem_async = (2ull << 30); // 2GiB
unsigned long eggsfs_span_cache_max_size_drop = (45ull << 30); // 45GiB
unsigned long eggsfs_span_cache_min_avail_mem_drop = (2ull << 30) + (500ull << 20); // 2GiB + 500MiB
unsigned long eggsfs_span_cache_max_size_sync = (100ull << 30); // 100GiB
unsigned long eggsfs_span_cache_min_avail_mem_sync = (1ull << 30); // 1GiB
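// Illustrative summary of the thresholds above (assuming 4KiB pages; not
// part of the original source): the async dropper is queued once the cache
// exceeds 50GiB (13,107,200 pages) or available memory falls below 2GiB;
// drop_spans() then frees spans until the cache is back under 45GiB and
// more than ~2.5GiB is available again, so we don't flicker around a single
// threshold. Past 100GiB cached (or under 1GiB available) we drop
// synchronously, applying backpressure to readers.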
static struct kmem_cache* eggsfs_block_span_cachep;
static struct kmem_cache* eggsfs_inline_span_cachep;
static struct kmem_cache* eggsfs_fetch_stripe_cachep;
struct eggsfs_span_lru {
spinlock_t lock;
struct list_head lru;
} ____cacheline_aligned;
// These are global locks, but the sharding should alleviate contention.
#define EGGSFS_SPAN_BITS 8
#define EGGSFS_SPAN_LRUS (1<<EGGSFS_SPAN_BITS) // 256
static struct eggsfs_span_lru span_lrus[EGGSFS_SPAN_LRUS];
static struct eggsfs_span_lru* get_span_lru(struct eggsfs_span* span) {
int h;
h = hash_64(span->ino, EGGSFS_SPAN_BITS);
h ^= hash_64(span->start, EGGSFS_SPAN_BITS);
return &span_lrus[h%EGGSFS_SPAN_LRUS];
}
static struct eggsfs_span* lookup_span(struct rb_root* spans, u64 offset) {
struct rb_node* node = spans->rb_node;
while (node) {
struct eggsfs_span* span = container_of(node, struct eggsfs_span, node);
if (offset < span->start) { node = node->rb_left; }
else if (offset >= span->end) { node = node->rb_right; }
else {
eggsfs_debug("off=%llu span=%p", offset, span);
return span;
}
}
eggsfs_debug("off=%llu no_span", offset);
return NULL;
}
static bool insert_span(struct rb_root* spans, struct eggsfs_span* span) {
struct rb_node** new = &(spans->rb_node);
struct rb_node* parent = NULL;
while (*new) {
struct eggsfs_span* this = container_of(*new, struct eggsfs_span, node);
parent = *new;
if (span->end <= this->start) { new = &((*new)->rb_left); }
else if (span->start >= this->end) { new = &((*new)->rb_right); }
else {
// TODO: be loud if there are overlapping spans
eggsfs_debug("span=%p (%llu-%llu) already present", span, span->start, span->end);
return false;
}
}
rb_link_node(&span->node, parent, new);
rb_insert_color(&span->node, spans);
eggsfs_debug("span=%p (%llu-%llu) inserted", span, span->start, span->end);
return true;
}
static void init_block_span(void* p) {
struct eggsfs_block_span* span = (struct eggsfs_block_span*)p;
xa_init(&span->pages);
int i;
for (i = 0; i < EGGSFS_MAX_STRIPES; i++) {
eggsfs_latch_init(&span->stripe_latches[i]);
}
}
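// Note: as a kmem_cache constructor this runs when a slab object is first
// created, not on every kmem_cache_alloc(). free_span() below therefore
// returns objects to this constructed state (pages erased from the xarray,
// latches released) before handing them back to the cache.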
static void free_span(struct eggsfs_span* span, u64* dropped_pages) {
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
kmem_cache_free(eggsfs_inline_span_cachep, EGGSFS_INLINE_SPAN(span));
} else {
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
eggsfs_counter_dec(eggsfs_stat_cached_spans);
struct page* page;
unsigned long page_ix;
xa_for_each(&block_span->pages, page_ix, page) {
if (dropped_pages) { (*dropped_pages)++; }
put_page(page);
xa_erase(&block_span->pages, page_ix);
atomic64_dec(&eggsfs_stat_cached_span_pages);
}
int i;
for (i = 0; i < EGGSFS_MAX_STRIPES; i++) {
BUG_ON(atomic64_read(&block_span->stripe_latches[i].counter) & 1); // nobody still hanging onto this
}
kmem_cache_free(eggsfs_block_span_cachep, block_span);
}
}
struct get_span_ctx {
struct eggsfs_inode* enode;
struct list_head spans;
int err;
};
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs) {
eggsfs_debug("offset=%llu size=%u crc=%08x storage_class=%d parity=%d stripes=%d cell_size=%u", offset, size, crc, storage_class, parity, stripes, cell_size);
struct get_span_ctx* ctx = (struct get_span_ctx*)data;
if (ctx->err) { return; }
// check parity bounds before allocating so the error path can't leak the span
if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) {
eggsfs_warn("D=%d > %d || P=%d > %d", eggsfs_data_blocks(parity), EGGSFS_MAX_DATA, eggsfs_parity_blocks(parity), EGGSFS_MAX_PARITY);
ctx->err = -EIO;
return;
}
struct eggsfs_block_span* span = kmem_cache_alloc(eggsfs_block_span_cachep, GFP_KERNEL);
if (!span) { ctx->err = -ENOMEM; return; }
span->span.ino = ctx->enode->inode.i_ino;
span->span.start = offset;
span->span.end = offset + size;
span->span.storage_class = storage_class;
span->enode = ctx->enode; // no ihold: `span->enode` is cleared before eviction
span->cell_size = cell_size;
memcpy(span->stripes_crc, stripes_crcs, sizeof(uint32_t)*stripes);
span->stripes = stripes;
span->parity = parity;
// Important: this makes readers skip over the span until it ends up in the LRU
span->refcount = -1;
eggsfs_debug("adding normal span");
list_add_tail(&span->span.lru, &ctx->spans);
}
void eggsfs_file_spans_cb_block(
void* data, int block_ix,
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2, u8 flags,
u64 block_id, u32 crc
) {
struct get_span_ctx* ctx = (struct get_span_ctx*)data;
if (ctx->err) { return; }
struct eggsfs_span* span = list_last_entry(&ctx->spans, struct eggsfs_span, lru);
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
struct eggsfs_block* block = &block_span->blocks[block_ix];
block->bs.id = bs_id;
block->bs.ip1 = ip1;
block->bs.port1 = port1;
block->bs.ip2 = ip2;
block->bs.port2 = port2;
block->bs.flags = flags;
block->id = block_id;
block->crc = crc;
}
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) {
eggsfs_debug("offset=%llu size=%u len=%u body=%*pE", offset, size, len, len, body);
struct get_span_ctx* ctx = (struct get_span_ctx*)data;
if (ctx->err) { return; }
struct eggsfs_inline_span* span = kmem_cache_alloc(eggsfs_inline_span_cachep, GFP_KERNEL);
if (!span) { ctx->err = -ENOMEM; return; }
span->span.ino = ctx->enode->inode.i_ino;
span->span.start = offset;
span->span.end = offset + size;
span->span.storage_class = EGGSFS_INLINE_STORAGE;
span->len = len;
memcpy(span->body, body, len);
eggsfs_debug("adding inline span");
list_add_tail(&span->span.lru, &ctx->spans);
}
// returns whether we could acquire it
static bool span_acquire(struct eggsfs_span* span) {
if (span->storage_class == EGGSFS_INLINE_STORAGE) { return true; }
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
struct eggsfs_span_lru* lru = get_span_lru(span);
spin_lock_bh(&lru->lock);
if (unlikely(block_span->refcount < 0)) { // the reclaimer got here before us.
spin_unlock_bh(&lru->lock);
return false;
}
block_span->refcount++;
if (block_span->refcount == 1) {
// we're the first ones here, take it out of the LRU
list_del(&span->lru);
}
spin_unlock_bh(&lru->lock);
return true;
}
// Looks up and acquires a span. Returns -EBUSY if
// the span is there, but it could not be acquired (contention
// with LRU reclaimer). Returns NULL if the span is not there.
static struct eggsfs_span* lookup_and_acquire_span(struct eggsfs_inode* enode, u64 offset) {
struct eggsfs_inode_file* file = &enode->file;
int err = down_read_killable(&enode->file.spans_lock);
if (err) { return ERR_PTR(err); }
struct eggsfs_span* span = lookup_span(&file->spans, offset);
if (likely(span)) {
if (unlikely(!span_acquire(span))) { span = ERR_PTR(-EBUSY); }
}
up_read(&enode->file.spans_lock);
return span;
}
void eggsfs_put_span(struct eggsfs_span* span, bool was_read) {
if (span->storage_class == EGGSFS_INLINE_STORAGE) { return; }
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
struct eggsfs_span_lru* lru = get_span_lru(span);
spin_lock_bh(&lru->lock);
BUG_ON(block_span->refcount < 1);
block_span->refcount--;
if (block_span->refcount == 0) { // we need to put it back into the LRU, or free it
if (block_span->touched) {
list_add_tail(&span->lru, &lru->lru);
} else {
list_add(&span->lru, &lru->lru);
}
}
spin_unlock_bh(&lru->lock);
}
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset) {
struct eggsfs_inode_file* file = &enode->file;
int err;
eggsfs_debug("ino=%016lx, pid=%d, off=%llu getting span", enode->inode.i_ino, get_current()->pid, offset);
trace_eggsfs_get_span_enter(enode->inode.i_ino, offset);
#define GET_SPAN_EXIT(s) do { \
trace_eggsfs_get_span_exit(enode->inode.i_ino, offset, IS_ERR(s) ? PTR_ERR(s) : 0); \
return s; \
} while(0)
// This helps below: it means that we _must_ have a span. So if we
// get NULL at any point, we can retry, because it means we're conflicting
// with a reclaimer.
if (offset >= enode->inode.i_size) { GET_SPAN_EXIT(NULL); }
u64 iterations = 0;
retry:
iterations++;
// Sadly, while this should be somewhat uncommon, it can happen when we
// busy-loop while another task is between inserting the span into the span
// tree and adding it to the LRU.
if (unlikely(iterations == 10)) {
eggsfs_warn("we've been fetching the same span for %llu iterations, we're probably stuck on a not-yet-enabled span we just fetched", iterations);
}
// Check if we already have the span
{
struct eggsfs_span* span = lookup_and_acquire_span(enode, offset);
if (unlikely(IS_ERR(span))) {
int err = PTR_ERR(span);
if (err == -EBUSY) { goto retry; } // we couldn't acquire the span
GET_SPAN_EXIT(span); // some other error
} else if (likely(span != NULL)) {
GET_SPAN_EXIT(span);
}
}
// We need to fetch the spans.
err = down_write_killable(&file->spans_lock);
if (err) { GET_SPAN_EXIT(ERR_PTR(err)); }
// Check if somebody got to it first, in which case we can acquire it and
// return immediately.
{
struct eggsfs_span* span = lookup_span(&file->spans, offset);
if (unlikely(span)) {
if (unlikely(!span_acquire(span))) {
up_write(&file->spans_lock);
goto retry;
}
up_write(&file->spans_lock);
GET_SPAN_EXIT(span);
}
}
// We always get the full set of spans; this simplifies prefetching (we just
// blindly assume the span is there, and if it's been reclaimed in the meantime
// we just don't care).
u64 spans_offset;
for (spans_offset = 0;;) {
// fetch next batch of spans
u64 next_offset;
struct get_span_ctx ctx = { .err = 0, .enode = enode };
INIT_LIST_HEAD(&ctx.spans);
err = eggsfs_error_to_linux(eggsfs_shard_file_spans(
(struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, spans_offset, &next_offset,&ctx
));
err = err ?: ctx.err;
if (unlikely(err)) {
eggsfs_debug("failed to get file spans at %llu err=%d", spans_offset, err);
for (;;) {
struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru);
if (span == NULL) { break; } // otherwise we'd deref NULL and never terminate
list_del(&span->lru);
free_span(span, NULL);
}
up_write(&file->spans_lock);
GET_SPAN_EXIT(ERR_PTR(err));
}
// add them to enode spans and LRU
for (;;) {
struct eggsfs_span* span = list_first_entry_or_null(&ctx.spans, struct eggsfs_span, lru);
if (span == NULL) { break; }
list_del(&span->lru);
if (!insert_span(&file->spans, span)) {
// Span is already cached
free_span(span, NULL);
} else {
if (span->storage_class != EGGSFS_INLINE_STORAGE) {
// Not already cached, must add it to the LRU
eggsfs_counter_inc(eggsfs_stat_cached_spans);
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
struct eggsfs_span_lru* lru = get_span_lru(span);
spin_lock_bh(&lru->lock);
block_span->refcount = 0;
list_add(&span->lru, &lru->lru);
spin_unlock_bh(&lru->lock);
}
}
}
// stop if we're done
if (next_offset == 0) { break; }
spans_offset = next_offset;
}
up_write(&file->spans_lock);
// We now restart, we know that the span must be there (unless the shard is broken).
// It might get reclaimed in the meantime though.
goto retry;
#undef GET_SPAN_EXIT
}
// If it returns -1, there are no spans to drop. Otherwise, returns the
// next index to pass in to reclaim the next span.
static int drop_one_span(int lru_ix, u64* examined_spans, u64* dropped_pages) {
BUG_ON(lru_ix < 0 || lru_ix >= EGGSFS_SPAN_LRUS);
int i;
for (i = lru_ix; i < lru_ix + EGGSFS_SPAN_LRUS; i++) {
struct eggsfs_span_lru* lru = &span_lrus[i%EGGSFS_SPAN_LRUS];
struct eggsfs_span* candidate = NULL;
struct eggsfs_inode* enode = NULL;
// Pick a candidate from the LRU, mark it as reclaiming, take it out of
// the LRU.
spin_lock_bh(&lru->lock);
candidate = list_first_entry_or_null(&lru->lru, struct eggsfs_span, lru);
if (unlikely(candidate == NULL)) {
goto unlock;
} else {
(*examined_spans)++;
BUG_ON(candidate->storage_class == EGGSFS_INLINE_STORAGE); // no inline spans in LRU
struct eggsfs_block_span* candidate_blocks = EGGSFS_BLOCK_SPAN(candidate);
if (likely(candidate_blocks->enode)) {
// There's a span of time between the last inode reference being dropped
// and the eviction function being called where we do have this enode
// here but we also don't have references to it anymore. In that case
// we let the enode eviction take care of this (we can't do it ourselves
// because we can't safely remove the span from the enode afterwards).
if (likely(atomic_inc_not_zero(&candidate_blocks->enode->inode.i_count))) { // conditional ihold
enode = candidate_blocks->enode;
} else {
candidate = NULL; // skip
goto unlock;
}
}
// we just got this from the LRU, so its refcount must be 0
BUG_ON(candidate_blocks->refcount != 0);
candidate_blocks->refcount = -1;
list_del(&candidate->lru);
}
unlock:
spin_unlock_bh(&lru->lock);
if (candidate) {
// Take it out of the spans for the file
if (likely(enode)) {
down_write(&enode->file.spans_lock);
rb_erase(&candidate->node, &enode->file.spans);
up_write(&enode->file.spans_lock);
iput(&enode->inode);
}
// At this point we're free to do what we please with the span
// (it's fully private to us).
free_span(candidate, dropped_pages);
i++;
break;
}
}
// we've fully gone around without finding anything
if (i == lru_ix + EGGSFS_SPAN_LRUS) { return -1; }
// We've found something
return i%EGGSFS_SPAN_LRUS;
}
static inline void drop_spans_enter(const char* type) {
trace_eggsfs_drop_spans_enter(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans));
}
static inline void drop_spans_exit(const char* type, u64 dropped_pages) {
trace_eggsfs_drop_spans_exit(type, PAGE_SIZE*si_mem_available(), atomic64_read(&eggsfs_stat_cached_span_pages), eggsfs_counter_get(&eggsfs_stat_cached_spans), dropped_pages);
}
// returns the number of dropped pages
u64 eggsfs_drop_all_spans(void) {
drop_spans_enter("all");
u64 dropped_pages = 0;
u64 examined_spans = 0;
s64 spans_begin = eggsfs_counter_get(&eggsfs_stat_cached_spans);
int lru_ix = 0;
for (;;) {
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
if (lru_ix < 0) { break; }
}
s64 spans_end = eggsfs_counter_get(&eggsfs_stat_cached_spans);
eggsfs_info("reclaimed %llu pages, %lld spans (approx)", dropped_pages, spans_begin-spans_end);
drop_spans_exit("all", dropped_pages);
return dropped_pages;
}
static DEFINE_MUTEX(drop_spans_mu);
static u64 drop_spans(const char* type) {
mutex_lock(&drop_spans_mu);
drop_spans_enter(type);
u64 dropped_pages = 0;
u64 examined_spans = 0;
int lru_ix = 0;
for (;;) {
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
if (
pages*PAGE_SIZE < eggsfs_span_cache_max_size_drop &&
si_mem_available()*PAGE_SIZE > eggsfs_span_cache_min_avail_mem_drop
) {
break;
}
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
if (lru_ix < 0) { break; }
}
eggsfs_debug("dropped %llu pages, %llu spans examined", dropped_pages, examined_spans);
drop_spans_exit(type, dropped_pages);
mutex_unlock(&drop_spans_mu);
return dropped_pages;
}
static void reclaim_spans_async(struct work_struct* work) {
if (!mutex_is_locked(&drop_spans_mu)) { // if it's locked, somebody's already taking care of it
drop_spans("async");
}
}
static DECLARE_WORK(reclaim_spans_work, reclaim_spans_async);
static unsigned long eggsfs_span_shrinker_count(struct shrinker* shrinker, struct shrink_control* sc) {
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
if (pages == 0) { return SHRINK_EMPTY; }
return pages;
}
static int eggsfs_span_shrinker_lru_ix = 0;
static u64 eggsfs_span_shrinker_pages_round = 25600; // We drop at most 100MiB in one shrinker round
static unsigned long eggsfs_span_shrinker_scan(struct shrinker* shrinker, struct shrink_control* sc) {
drop_spans_enter("shrinker");
u64 dropped_pages = 0;
u64 examined_spans = 0;
int lru_ix = eggsfs_span_shrinker_lru_ix;
for (dropped_pages = 0; dropped_pages < eggsfs_span_shrinker_pages_round;) {
lru_ix = drop_one_span(lru_ix, &examined_spans, &dropped_pages);
if (lru_ix < 0) { break; }
}
eggsfs_span_shrinker_lru_ix = lru_ix >= 0 ? lru_ix : 0;
drop_spans_exit("shrinker", dropped_pages);
return dropped_pages;
}
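// A note on the shrinker contract (standard kernel API, summarized here for
// context): .count_objects reports an estimate of how much is reclaimable
// (here, cached span pages), and the kernel then calls .scan_objects to free
// some of it. This implementation ignores sc->nr_to_scan and instead frees
// at most eggsfs_span_shrinker_pages_round pages per call, resuming
// round-robin from the LRU shard it last stopped at.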
static struct shrinker eggsfs_span_shrinker = {
.count_objects = eggsfs_span_shrinker_count,
.scan_objects = eggsfs_span_shrinker_scan,
.seeks = DEFAULT_SEEKS,
.flags = SHRINKER_NONSLAB, // what we reclaim (span pages) is not slab memory
};
void eggsfs_drop_file_spans(struct eggsfs_inode* enode) {
again:
down_write(&enode->file.spans_lock);
for (;;) {
struct rb_node* node = rb_first(&enode->file.spans);
if (node == NULL) { break; }
struct eggsfs_span* span = rb_entry(node, struct eggsfs_span, node);
struct eggsfs_block_span* block_span = NULL;
bool can_free_span = true;
if (span->storage_class != EGGSFS_INLINE_STORAGE) {
block_span = EGGSFS_BLOCK_SPAN(span);
struct eggsfs_span_lru* lru = get_span_lru(span);
spin_lock_bh(&lru->lock);
if (unlikely(block_span->refcount < 0)) {
// This is being reclaimed, we wait until the reclaimer cleans
// it up for us. This is very unlikely: the reclaiming would have
// had to start before the last inode reference was dropped.
spin_unlock_bh(&lru->lock);
up_write(&enode->file.spans_lock);
goto again;
} else if (unlikely(block_span->refcount > 0)) {
// Note that the assumption here is that the reader will correctly put
// the span back in the LRU once done.
eggsfs_warn("span at %llu for inode %016lx still has %d readers, will linger on until next reclaim", span->start, enode->inode.i_ino, block_span->refcount);
can_free_span = false;
} else {
// Make ourselves the reclaimer. Note that we could just let the
// reclaimer take care of it eventually, but this is the very
// common case and it'll be faster.
list_del(&span->lru);
block_span->refcount = -1;
}
block_span->enode = NULL;
spin_unlock_bh(&lru->lock);
}
rb_erase(node, &enode->file.spans);
if (can_free_span) { free_span(span, NULL); }
}
up_write(&enode->file.spans_lock);
}
struct fetch_stripe_state {
// ihold'd
struct eggsfs_inode *enode;
struct semaphore sema; // used to wait on every block being finished. could be done faster with a wait_queue, but we don't want to worry about SMP subtleties.
s64 stripe_seqno; // used to release the stripe when we're done with it
struct eggsfs_block_span* span; // we hold this (via span acquire) until we're done.
// these will be filled in by the fetching (only D of them will be
// filled by fetching the blocks, the others might be filled in by the RS
// recovery)
struct list_head blocks_pages[EGGSFS_MAX_BLOCKS];
atomic_t refcount; // to garbage collect
atomic_t err; // the result of the stripe fetching
static_assert(EGGSFS_MAX_BLOCKS <= 16);
// 00-16: blocks which are downloading.
// 16-32: blocks which have succeeded.
// 32-48: blocks which have failed.
atomic64_t blocks;
u8 stripe; // which stripe we're into
u8 prefetching; // used for logging
};
#define DOWNLOADING_SHIFT 0
#define DOWNLOADING_MASK (0xFFFFull<<DOWNLOADING_SHIFT)
#define DOWNLOADING_GET(__b) ((__b>>DOWNLOADING_SHIFT) & 0xFFFFull)
#define DOWNLOADING_SET(__b, __i) (__b | 1ull<<(__i+DOWNLOADING_SHIFT))
#define DOWNLOADING_UNSET(__b, __i) (__b & ~(1ull<<(__i+DOWNLOADING_SHIFT)))
#define SUCCEEDED_SHIFT 16
#define SUCCEEDED_MASK (0xFFFFull<<SUCCEEDED_SHIFT)
#define SUCCEEDED_GET(__b) ((__b>>SUCCEEDED_SHIFT) & 0xFFFFull)
#define SUCCEEDED_SET(__b, __i) (__b | 1ull<<(__i+SUCCEEDED_SHIFT))
#define SUCCEEDED_UNSET(__b, __i) (__b & ~(1ull<<(__i+SUCCEEDED_SHIFT)))
#define FAILED_SHIFT 32
#define FAILED_MASK (0xFFFFull<<FAILED_SHIFT)
#define FAILED_GET(__b) ((__b>>FAILED_SHIFT) & 0xFFFFull)
#define FAILED_SET(__b, __i) (__b | 1ull<<(__i+FAILED_SHIFT))
#define FAILED_UNSET(__b, __i) (__b & ~(1ull<<(__i+FAILED_SHIFT)))
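// Illustrative state transition (example values, not from the original
// source): say block 0 has already succeeded and block 2 is downloading,
// i.e. blocks == DOWNLOADING_SET(SUCCEEDED_SET(0, 0), 2)
//            == (1ull<<2) | (1ull<<(0+SUCCEEDED_SHIFT)).
// When block 2 completes, block_done() computes
//   SUCCEEDED_SET(DOWNLOADING_UNSET(blocks, 2), 2)
// and applies it with atomic64_try_cmpxchg, retrying if a concurrent
// completion changed another block's bits in the meantime.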
static void init_fetch_stripe(void* p) {
struct fetch_stripe_state* st = (struct fetch_stripe_state*)p;
sema_init(&st->sema, 0);
int i;
for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) {
INIT_LIST_HEAD(&st->blocks_pages[i]);
}
}
// This must be called while already holding the span, which means
// that span_acquire will always succeed.
static struct fetch_stripe_state* new_fetch_stripe(
struct eggsfs_inode* enode,
struct eggsfs_block_span* span,
u8 stripe,
u64 stripe_seqno,
bool prefetching
) {
struct fetch_stripe_state* st = (struct fetch_stripe_state*)kmem_cache_alloc(eggsfs_fetch_stripe_cachep, GFP_KERNEL);
if (!st) { return st; }
BUG_ON(!span_acquire(&span->span));
st->enode = enode;
st->stripe_seqno = stripe_seqno;
st->span = span;
atomic_set(&st->refcount, 1); // the caller
atomic_set(&st->err, 0);
st->stripe = stripe;
st->prefetching = prefetching;
atomic64_set(&st->blocks, 0);
eggsfs_in_flight_begin(enode);
return st;
}
static void fetch_stripe_trace(struct fetch_stripe_state* st, u8 event, s8 block, int err) {
trace_eggsfs_fetch_stripe(
st->enode->inode.i_ino,
st->span->span.start,
st->stripe,
st->span->parity,
st->prefetching,
event,
block,
err
);
}
static void free_fetch_stripe(struct fetch_stripe_state* st) {
// we're the last ones here
fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_FREE, -1, atomic_read(&st->err));
// Note that we release this here, rather than when `remaining == 0`, since
// there's no guarantee that we reach `remaining == 0` (we might fail to start
// enough requests)
eggsfs_latch_release(&st->span->stripe_latches[st->stripe], st->stripe_seqno);
while (down_trylock(&st->sema) == 0) {} // reset sema to zero for next usage
// Free leftover blocks (also ensures the lists are all nice and empty for the
// next user)
int i;
for (i = 0; i < EGGSFS_MAX_BLOCKS; i++) {
put_pages_list(&st->blocks_pages[i]);
}
eggsfs_in_flight_end(st->enode);
// We don't need the span anymore
eggsfs_put_span(&st->span->span, !st->prefetching);
kmem_cache_free(eggsfs_fetch_stripe_cachep, st);
}
static void put_fetch_stripe(struct fetch_stripe_state* st) {
if (atomic_dec_return(&st->refcount) > 0) { return; }
free_fetch_stripe(st);
}
static void hold_fetch_stripe(struct fetch_stripe_state* st) {
WARN_ON(atomic_inc_return(&st->refcount) < 2);
}
static void store_pages(struct fetch_stripe_state* st) {
struct eggsfs_block_span* span = st->span;
int D = eggsfs_data_blocks(span->parity);
int i, j;
// we may need to recover missing data blocks using RS
u16 blocks = SUCCEEDED_GET(atomic64_read(&st->blocks));
eggsfs_debug("blocks=%d", blocks);
BUG_ON(__builtin_popcountll(blocks) != D);
if (unlikely(blocks != (1ull<<D)-1)) {
u32 pages = 0;
{ // get the number of pages
struct list_head* tmp;
// eggsfs_info("picking %d", __builtin_ctz(st->blocks));
list_for_each(tmp, &st->blocks_pages[__builtin_ctz(blocks)]) { // just pick the first one to count pages
pages++;
}
}
for (i = 0; i < D; i++) { // fill in missing data blocks
if ((1ull<<i) & blocks) { continue; } // we have this one already
// eggsfs_info("recovering %d, pages=%u", i, pages);
// allocate the pages
struct list_head* i_pages = &st->blocks_pages[i];
for (j = 0; j < pages; j++) {
struct page* page = alloc_page(GFP_KERNEL);
if (!page) {
atomic_set(&st->err, -ENOMEM);
return;
}
list_add_tail(&page->lru, i_pages);
}
// compute
int err = eggsfs_recover(span->parity, blocks, 1ull<<i, pages, st->blocks_pages);
if (err) {
atomic_set(&st->err, err);
return;
}
}
}
u32 start_page = (span->cell_size/PAGE_SIZE)*D*st->stripe;
u32 end_page = (span->cell_size/PAGE_SIZE)*D*((int)st->stripe + 1);
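// Illustrative arithmetic (assuming 4KiB pages; example values not from the
// original source): with cell_size = 1MiB (256 pages) and D = 10, stripe 2
// covers span pages [5120, 7680), i.e. 256*10*2 up to 256*10*3.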
u32 curr_page = start_page;
u32 crc = 0;
kernel_fpu_begin(); // crc
// move the pages to the span
for (i = 0; i < D; i++) {
struct page* page;
struct page* tmp;
list_for_each_entry_safe(page, tmp, &st->blocks_pages[i], lru) {
list_del(&page->lru);
// eggsfs_info("storing page for span %p at %u", span, curr_page);
char* page_buf = kmap_atomic(page);
crc = eggsfs_crc32c(crc, page_buf, PAGE_SIZE);
kunmap_atomic(page_buf);
struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
if (IS_ERR(old_page)) {
atomic_set(&st->err, PTR_ERR(old_page));
eggsfs_debug("xa_store failed: %ld", PTR_ERR(old_page));
put_page(page); // we removed it from the list
kernel_fpu_end();
return;
}
BUG_ON(old_page != NULL); // the latch protects against this
atomic64_inc(&eggsfs_stat_cached_span_pages);
curr_page++;
}
// eggsfs_info("i=%d curr_page=%u", i, curr_page);
}
kernel_fpu_end();
if (curr_page != end_page) {
eggsfs_info("curr_page=%u end_page=%u", curr_page, end_page);
}
BUG_ON(curr_page != end_page);
if (crc != span->stripes_crc[st->stripe]) {
eggsfs_warn("bad crc for file %016lx, stripe %u, expected %08x, got %08x", st->enode->inode.i_ino, st->stripe, span->stripes_crc[st->stripe], crc);
atomic_set(&st->err, -EIO);
}
}
static void block_done(void* data, u64 block_id, struct list_head* pages, int err);
// Starts loading up D blocks as necessary.
static int fetch_blocks(struct fetch_stripe_state* st) {
int i;
int err = 0;
struct eggsfs_block_span* span = st->span;
int D = eggsfs_data_blocks(span->parity);
int P = eggsfs_parity_blocks(span->parity);
int B = eggsfs_blocks(span->parity);
#define LOG_STR "span=%llu file=%016llx D=%d P=%d downloading=%04x(%d) succeeded=%04x(%d) failed=%04x(%d) err=%d"
#define LOG_ARGS span->span.start, span->span.ino, D, P, downloading, __builtin_popcountll(downloading), succeeded, __builtin_popcountll(succeeded), failed, __builtin_popcountll(failed), err
hold_fetch_stripe(st); // extra hold to make sure that the stripe survives this loop
for (;;) {
s64 blocks = atomic64_read(&st->blocks);
u16 downloading = DOWNLOADING_GET(blocks);
u16 succeeded = SUCCEEDED_GET(blocks);
u16 failed = FAILED_GET(blocks);
if (__builtin_popcountll(downloading)+__builtin_popcountll(succeeded) >= D) { // we managed to get out of the woods
eggsfs_debug("we have enough blocks, terminating " LOG_STR, LOG_ARGS);
goto out;
}
if (__builtin_popcountll(failed) > P) { // nowhere to go from here but tears
eggsfs_debug("we're out of blocks, giving up " LOG_STR, LOG_ARGS);
err = err ?: -EIO;
goto out;
}
// find the next block to download
for (i = 0; i < B; i++) {
if (!((1ull<<i) & (downloading|failed|succeeded))) { break; }
}
BUG_ON(i == B); // something _must_ be there given the checks above
eggsfs_debug("next block to be fetched is %d " LOG_STR, i, LOG_ARGS);
// try to set ourselves as selected
if (atomic64_cmpxchg(&st->blocks, blocks, DOWNLOADING_SET(blocks, i)) != blocks) {
eggsfs_debug("skipping %d " LOG_STR, i, LOG_ARGS);
// somebody else got here first (can happen if some blocks complete before this loop finishes), keep going
continue;
}
// ok, we're responsible for this now, start
struct eggsfs_block* block = &st->span->blocks[i];
u32 block_offset = st->stripe * span->cell_size;
fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_START, i, 0);
hold_fetch_stripe(st);
int err = eggsfs_fetch_block( // local err: a failure here only marks this block as failed, and the loop then tries another one
&block_done,
(void*)st,
&block->bs, block->id, block_offset, span->cell_size
);
if (err) {
put_fetch_stripe(st);
eggsfs_debug("loading block failed %d " LOG_STR, i, LOG_ARGS);
fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, err);
// mark it as failed and not downloading
blocks = atomic64_read(&st->blocks);
while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
} else {
eggsfs_debug("loading block %d " LOG_STR, i, LOG_ARGS);
}
}
out:
put_fetch_stripe(st);
return err;
#undef LOG_STR
#undef LOG_ARGS
}
static void block_done(void* data, u64 block_id, struct list_head* pages, int err) {
int i;
eggsfs_debug("block complete %016llx", block_id);
struct fetch_stripe_state* st = (struct fetch_stripe_state*)data;
struct eggsfs_block_span* span = st->span;
int B = eggsfs_blocks(span->parity);
int D = eggsfs_data_blocks(span->parity);
for (i = 0; i < B; i++) {
if (st->span->blocks[i].id == block_id) { break; }
}
// eggsfs_info("block ix %d, B=%d", i, B);
BUG_ON(i == B);
fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_BLOCK_DONE, i, err);
bool finished = false;
if (err) {
// it failed, try to restore order
eggsfs_debug("block %016llx ix %d failed with %d", block_id, i, err);
// mark it as failed and not downloading
s64 blocks = atomic64_read(&st->blocks);
while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, FAILED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
err = fetch_blocks(st);
// if we failed to restore order, mark the whole thing as failed
if (err) { atomic_cmpxchg(&st->err, 0, err); }
} else {
// mark as fetched, see if we're the last ones
s64 blocks = atomic64_read(&st->blocks);
while (unlikely(!atomic64_try_cmpxchg(&st->blocks, &blocks, SUCCEEDED_SET(DOWNLOADING_UNSET(blocks, i), i)))) {}
finished = __builtin_popcountll(SUCCEEDED_GET(SUCCEEDED_SET(blocks, i))) == D;
// it succeeded, grab all the pages
struct page* page;
struct page* tmp;
list_for_each_entry_safe(page, tmp, pages, lru) {
list_del(&page->lru);
list_add_tail(&page->lru, &st->blocks_pages[i]);
}
}
if (finished) {
eggsfs_debug("finished");
// we're the last one to finish, we need to store the pages in the span
store_pages(st);
// and wake up waiters, if any
up(&st->sema);
fetch_stripe_trace(st, EGGSFS_FETCH_STRIPE_END, -1, atomic_read(&st->err));
}
put_fetch_stripe(st); // one refcount per request
eggsfs_debug("done");
}
static struct fetch_stripe_state* start_fetch_stripe(
struct eggsfs_block_span* span, u8 stripe, s64 stripe_seqno, bool prefetching
) {
int err;
BUG_ON(!span->enode); // must still be alive here
struct eggsfs_inode* enode = span->enode;
trace_eggsfs_fetch_stripe(
enode->inode.i_ino,
span->span.start,
stripe,
span->parity,
prefetching,
EGGSFS_FETCH_STRIPE_START,
-1,
0
);
struct fetch_stripe_state* st = NULL;
int D = eggsfs_data_blocks(span->parity);
int B = eggsfs_blocks(span->parity);
if (D > EGGSFS_MAX_DATA || B > EGGSFS_MAX_BLOCKS) {
eggsfs_error("got out of bounds parity of RS(%d,%d) for span %llu in file %016lx", D, B-D, span->span.start, enode->inode.i_ino);
err = -EIO;
goto out_err;
}
st = new_fetch_stripe(enode, span, stripe, stripe_seqno, prefetching);
if (!st) { err = -ENOMEM; goto out_err; } // new_fetch_stripe returns NULL on allocation failure
// start fetching blocks
eggsfs_debug("fetching blocks");
err = fetch_blocks(st);
if (err) { goto out_err; }
eggsfs_debug("fetch stripe started");
return st;
out_err:
// st may be NULL here (bounds check or alloc failure), so trace with locals
// rather than through fetch_stripe_trace()
trace_eggsfs_fetch_stripe(enode->inode.i_ino, span->span.start, stripe, span->parity, prefetching, EGGSFS_FETCH_STRIPE_END, -1, err);
if (st) { put_fetch_stripe(st); }
return ERR_PTR(err);
}
static void do_prefetch(struct eggsfs_block_span* block_span, u64 off) {
BUG_ON(!block_span->enode); // must still be alive at this point
struct eggsfs_inode* enode = block_span->enode;
if (off < block_span->span.start || off >= block_span->span.end) {
// Another span, we need to load it. We give up if we can't at any occurrence.
struct eggsfs_span* span = lookup_and_acquire_span(enode, off);
if (unlikely(span == NULL || IS_ERR(span))) { return; }
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
return; // nothing to do, it's an inline span
}
block_span = EGGSFS_BLOCK_SPAN(span);
} else if (unlikely(!span_acquire(&block_span->span))) {
return; // it's being reclaimed, don't bother
}
// At this point we've acquired the span. We now need to check if there's something to get
int D = eggsfs_data_blocks(block_span->parity);
u32 page_ix = (off - block_span->span.start)/PAGE_SIZE;
u32 span_offset = page_ix * PAGE_SIZE;
u32 stripe = span_offset / (block_span->cell_size*D);
if (stripe >= block_span->stripes) { // valid stripe indices are 0..stripes-1
eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, block_span->stripes);
goto out_span;
}
struct page* page = xa_load(&block_span->pages, page_ix);
if (page != NULL) { // already there
goto out_span;
}
s64 seqno;
if (!eggsfs_latch_try_acquire(&block_span->stripe_latches[stripe], seqno)) { // somebody else is working on it
goto out_span;
}
page = xa_load(&block_span->pages, page_ix);
if (page != NULL) { // somebody got to it first
eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
goto out_span;
}
// Finally start the work.
struct fetch_stripe_state* st = start_fetch_stripe(block_span, stripe, seqno, true);
if (IS_ERR(st)) {
eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
eggsfs_debug("prefetch failed err=%ld", PTR_ERR(st));
goto out_span;
}
// We want the state to be freed once all the block fetches are done.
put_fetch_stripe(st);
out_span:
// the fetch state holds its own span reference (taken in new_fetch_stripe),
// so we can drop ours here
eggsfs_put_span(&block_span->span, false);
return;
}
// Returns whether we're low on memory
static bool reclaim_after_fetch_stripe(void) {
// Reclaim pages if we went over the limit
u64 pages = atomic64_read(&eggsfs_stat_cached_span_pages);
u64 free_pages = si_mem_available();
bool low_on_memory = false;
if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_sync || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_sync) {
low_on_memory = true;
// sync dropping, apply backpressure
if (!mutex_lock_killable(&drop_spans_mu)) {
drop_spans("sync");
mutex_unlock(&drop_spans_mu);
}
} else if (pages*PAGE_SIZE > eggsfs_span_cache_max_size_async || free_pages*PAGE_SIZE < eggsfs_span_cache_min_avail_mem_async) {
low_on_memory = true;
// don't bother submitting if another span dropper is running already
if (!mutex_is_locked(&drop_spans_mu)) {
// TODO Is it a good idea to do it on system_long_wq rather than eggsfs_wq? The freeing
// job might face heavy contention, so maybe yes?
// On the other hand, this might make it harder to know when it's safe to unload
// the module by making sure that there's no work left on the queue.
queue_work(system_long_wq, &reclaim_spans_work);
}
}
return low_on_memory;
}
static void maybe_prefetch(
struct eggsfs_block_span* span, bool low_on_memory, u64 stripe_start, u64 stripe_end
) {
BUG_ON(!span->enode); // must still be alive here
struct eggsfs_inode* enode = span->enode;
eggsfs_debug("file=%016llx eggsfs_prefetch=%d low_on_memory=%d stripe_start=%llu stripe_end=%llu", span->span.ino, (int)eggsfs_prefetch, (int)low_on_memory, stripe_start, stripe_end);
if (!eggsfs_prefetch || low_on_memory) { return; }
struct eggsfs_inode_file* file = &enode->file;
u64 prefetch_section = atomic64_read(&file->prefetch_section);
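// prefetch_section packs the currently-prefetched window into a single
// atomic64: the begin offset in the low 32 bits and the end offset in the
// high 32 bits, so the window can be read and advanced with one cmpxchg.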
u32 prefetch_section_begin = prefetch_section & (uint32_t)(-1);
u32 prefetch_section_end = (prefetch_section>>32) & (uint32_t)(-1);
if (stripe_start > prefetch_section_begin && stripe_end < prefetch_section_end) {
eggsfs_debug("within bounds, not prefetching");
// we're comfortably inside, nothing to do
} else if (stripe_end < prefetch_section_begin || stripe_start > prefetch_section_end) {
eggsfs_debug("out of bounds, resetting prefetch");
// we're totally out of bounds, reset
prefetch_section_begin = stripe_start;
prefetch_section_end = stripe_end;
} else {
u64 off;
// there is some overlap, but we're pushing the boundary, prefetch the next or previous thing
if (stripe_end > prefetch_section_end) { // going rightwards
eggsfs_debug("prefetching forwards");
prefetch_section_end = stripe_end;
off = stripe_end;
} else { // going leftwards
eggsfs_debug("prefetching backwards");
prefetch_section_begin = stripe_start;
off = stripe_start-1;
}
u64 new_prefetch_section = prefetch_section_begin | ((u64)prefetch_section_end << 32);
if (likely(atomic64_cmpxchg(&file->prefetch_section, prefetch_section, new_prefetch_section) == prefetch_section)) {
do_prefetch(span, off);
}
}
}
struct page* eggsfs_get_span_page(struct eggsfs_block_span* block_span, u32 page_ix) {
struct eggsfs_span* span = &block_span->span;
trace_eggsfs_get_span_page_enter(span->ino, span->start, page_ix*PAGE_SIZE);
// We need to load the stripe
int D = eggsfs_data_blocks(block_span->parity);
u32 span_offset = page_ix * PAGE_SIZE;
u32 stripe = span_offset / (block_span->cell_size*D);
bool low_on_memory = false;
int err = 0;
#define GET_PAGE_EXIT(p) do { \
int __err = IS_ERR(p) ? PTR_ERR(p) : 0; \
if (likely(__err == 0)) { \
maybe_prefetch(block_span, low_on_memory, span->start + stripe*(block_span->cell_size*D), span->start + (stripe+1)*(block_span->cell_size*D)); \
} \
trace_eggsfs_get_span_page_exit(span->ino, span->start, page_ix*PAGE_SIZE, __err); \
return p; \
} while(0)
// TODO better error?
if (stripe >= block_span->stripes) { // valid stripe indices are 0..stripes-1
eggsfs_warn("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, block_span->stripes);
GET_PAGE_EXIT(ERR_PTR(-EIO));
}
// this should be guaranteed by the caller but we rely on it below, so let's check
if (block_span->cell_size%PAGE_SIZE != 0) {
eggsfs_warn("cell_size=%u, PAGE_SIZE=%lu, block_span->cell_size%%PAGE_SIZE=%lu", block_span->cell_size, PAGE_SIZE, block_span->cell_size%PAGE_SIZE);
GET_PAGE_EXIT(ERR_PTR(-EIO));
}
struct page* page;
again:
page = xa_load(&block_span->pages, page_ix);
if (page != NULL) { GET_PAGE_EXIT(page); }
// There is no matching release here because the latch is released
// when the fetchers finish (even if this call is interrupted)
s64 seqno;
if (!eggsfs_latch_try_acquire(&block_span->stripe_latches[stripe], seqno)) {
int err = eggsfs_latch_wait_killable(&block_span->stripe_latches[stripe], seqno);
if (err) { GET_PAGE_EXIT(ERR_PTR(err)); }
goto again;
}
page = xa_load(&block_span->pages, page_ix);
if (page != NULL) {
// somebody got to it first
eggsfs_latch_release(&block_span->stripe_latches[stripe], seqno);
GET_PAGE_EXIT(page);
}
eggsfs_debug("about to start fetch stripe");
struct fetch_stripe_state* st = NULL;
st = start_fetch_stripe(block_span, stripe, seqno, false);
if (IS_ERR(st)) {
err = PTR_ERR(st);
st = NULL;
goto out_err;
}
eggsfs_debug("started fetch stripe, waiting");
// Wait for all the block requests
err = down_killable(&st->sema);
if (err) { goto out_err; }
eggsfs_debug("done");
err = atomic_read(&st->err);
if (err) { goto out_err; }
// We're finally done
page = xa_load(&block_span->pages, page_ix);
// eggsfs_info("loaded page %p for span %p at %u", page, span, page_ix);
BUG_ON(page == NULL);
out:
if (st != NULL) { put_fetch_stripe(st); }
// Reclaim if needed
low_on_memory = reclaim_after_fetch_stripe();
GET_PAGE_EXIT(err == 0 ? page : ERR_PTR(err));
out_err:
eggsfs_debug("getting span page failed, err=%d", err);
goto out;
#undef GET_PAGE_EXIT
}
int eggsfs_span_init(void) {
int i;
for (i = 0; i < EGGSFS_SPAN_LRUS; i++) {
spin_lock_init(&span_lrus[i].lock);
INIT_LIST_HEAD(&span_lrus[i].lru);
}
int err = register_shrinker(&eggsfs_span_shrinker);
if (err) { return err; }
eggsfs_block_span_cachep = kmem_cache_create(
"eggsfs_block_span_cache",
sizeof(struct eggsfs_block_span),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
&init_block_span
);
if (!eggsfs_block_span_cachep) {
err = -ENOMEM;
goto out_shrinker;
}
eggsfs_inline_span_cachep = kmem_cache_create(
"eggsfs_inline_span_cache",
sizeof(struct eggsfs_inline_span),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL
);
if (!eggsfs_inline_span_cachep) {
err = -ENOMEM;
goto out_block;
}
eggsfs_fetch_stripe_cachep = kmem_cache_create(
"eggsfs_fetch_stripe_cache",
sizeof(struct fetch_stripe_state),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
&init_fetch_stripe
);
if (!eggsfs_fetch_stripe_cachep) {
err = -ENOMEM;
goto out_inline;
}
return 0;
out_inline:
kmem_cache_destroy(eggsfs_inline_span_cachep);
out_block:
kmem_cache_destroy(eggsfs_block_span_cachep);
out_shrinker:
unregister_shrinker(&eggsfs_span_shrinker);
return err;
}
void eggsfs_span_exit(void) {
// unregister the shrinker before destroying the caches it frees spans into
unregister_shrinker(&eggsfs_span_shrinker);
kmem_cache_destroy(eggsfs_fetch_stripe_cachep);
kmem_cache_destroy(eggsfs_inline_span_cachep);
kmem_cache_destroy(eggsfs_block_span_cachep);
}