Implement RS recovery, although it won't really be used now...

...since it only relies on block service flags, and we don't
set them right now.
This commit is contained in:
Francesco Mazzoli
2023-06-02 10:32:51 +00:00
parent efb92be31a
commit cd86e632e2
18 changed files with 373 additions and 169 deletions

View File

@@ -7,9 +7,8 @@
#include "rs.h"
#define die(...) do { fprintf(stderr, __VA_ARGS__); raise(SIGABRT); } while(false)
#define rs_malloc malloc
#define rs_free free
#define rs_warn(fmt, ...) fprintf(stderr, "rs: " fmt "\n" __VA_OPT__(,) __VA_ARGS__); raise(SIGABRT);
#define die(...) do { rs_warn(__VA_ARGS__); raise(SIGABRT); } while (0)
// See `valgrind.h`
static uint64_t rs_valgrind_client_request(uint64_t defaultResult, uint64_t reqID, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) {
@@ -102,14 +101,15 @@ struct rs* rs_get(uint8_t parity) {
}
struct rs* r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
if (__builtin_expect(r == nullptr, 0)) {
r = rs_new_core(parity);
r = (struct rs*)malloc(RS_SIZE(rs_data_blocks(parity), rs_parity_blocks(parity)));
rs_new_core(parity, r);
if (r == nullptr) {
die("could not allocate RS data");
}
struct rs* expected = nullptr;
if (!__atomic_compare_exchange_n(&rs_cached[parity], &expected, r, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// somebody else got to it first
rs_delete_core(r);
free(r);
r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
}
}
@@ -194,12 +194,18 @@ void rs_recover(
uint32_t want_block,
uint8_t* want
) {
rs_recover_core(
r, size, have_blocks, have, want_block, want,
[](int D, uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_funcs[D](size, have, want, mat);
}
);
int D = rs_data_blocks(r->parity);
u8* mat = (u8*)malloc(RS_RECOVER_MAT_SIZE(D));
if (mat == NULL) {
free(mat);
die("could not allocate mat");
}
if (!rs_recover_mat(r, have_blocks, want_block, mat)) {
free(mat);
die("could not get recover matrix");
}
rs_recover_matmul_funcs[D](size, have, want, mat);
free(mat);
}
__attribute__((constructor))

View File

@@ -193,7 +193,8 @@ static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
case RS_CPU_GFNI:
return _7_0[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
rs_warn("bad CPU level %d", level);
return false;
}
}
@@ -364,12 +365,12 @@ static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
} \
} while (0)
static struct rs* rs_new_core(u8 parity) {
#define RS_SIZE(D, P) (sizeof(struct rs) + (D+P)*D + D*P*32)
static void rs_new_core(u8 parity, struct rs* r) {
int B = rs_blocks_core(parity);
int D = rs_data_blocks_core(parity);
int P = rs_parity_blocks_core(parity);
struct rs* r = (struct rs*)rs_malloc(sizeof(struct rs) + B*D + D*P*32);
if (r == NULL) { return NULL; }
r->parity = parity;
r->matrix = (u8*)(r + 1);
r->expanded_matrix = r->matrix + B*D;
@@ -381,35 +382,31 @@ static struct rs* rs_new_core(u8 parity) {
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
}
}
return r;
}
static void rs_delete_core(struct rs* r) {
rs_free(r);
}
// This is actually enough space to hold two matrices,
// we need the space for the second temporarily.
#define RS_RECOVER_MAT_SIZE(D) (D*D + D*D)
static void rs_recover_core(
static bool rs_recover_mat(
struct rs* r,
u64 size,
u32 have_blocks,
const u8** have,
u32 want_block,
u8* want,
void (*recover_func)(int D, u64 size, const u8** have, u8* want, const u8* mat)
u8* scratch // out mat
) {
int D = rs_data_blocks_core(r->parity);
int B = rs_blocks_core(r->parity);
// Create some space
u8* scratch = (u8*)rs_malloc(D*D + D*D);
u8* mat_1 = scratch;
u8* mat_2 = scratch + D*D;
// Preliminary checks
int i, j, d, b;
if ((have_blocks >> B) || (want_block >> B)) {
die("have_blocks=%08x or want_block=%08x out of bounds wrt B=%d\n", have_blocks, want_block, B);
rs_warn("have_blocks=%08x or want_block=%08x out of bounds wrt B=%d", have_blocks, want_block, B);
return false;
}
if (have_blocks & want_block) {
die("have_blocks=%08x contains want_block=%08x\n", have_blocks, want_block);
rs_warn("have_blocks=%08x contains want_block=%08x", have_blocks, want_block);
return false;
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
@@ -423,7 +420,8 @@ static void rs_recover_core(
// [DxD] matrix going from what we have to the original data blocks
u8* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
rs_warn("unexpected singular matrix");
return false;
}
data_to_have = NULL;
// [Dx1] matrix going from the data blocks to the block we want
@@ -438,7 +436,6 @@ static void rs_recover_core(
}
}
// want = have_to_want * have
recover_func(D, size, have, want, have_to_want);
// We're done.
rs_free(scratch);
return true;
}

View File

@@ -209,8 +209,8 @@ struct BlockServiceCache {
AES128Key secretKey;
std::array<uint8_t, 16> failureDomain;
std::array<uint8_t, 4> ip1;
uint16_t port1;
std::array<uint8_t, 4> ip2;
uint16_t port1;
uint16_t port2;
uint8_t storageClass;
};

View File

@@ -137,7 +137,7 @@ func checkFileData(id any, from int, to int, actualData []byte, expectedData []b
if err := os.WriteFile(actualPath, actualData, 0644); err != nil {
panic(fmt.Errorf("mismatching data (%v,%v), could not create data file", from, to))
}
panic(fmt.Errorf("mismatching data (%v,%v), expected data is in %v, found data is in %v", from, to, expectedPath, actualPath))
panic(fmt.Errorf("mismatching data (%v,%v) for file %v, expected data is in %v, found data is in %v", from, to, id, expectedPath, actualPath))
}
}

View File

@@ -38,6 +38,3 @@ clean:
bincode_tests: bincode_tests.c bincodegen.h bincode.h
gcc -Wall -g -O2 -fsanitize=undefined,address -fno-sanitize-recover=all bincode_tests.c -o bincode_tests
rs_tests: rs_tests.c rs.c rs.h intrshims.h
gcc -Wall -g -O2 -fsanitize=undefined,address -fno-sanitize-recover=all rs_tests.c -o rs_tests

View File

@@ -142,4 +142,12 @@ static const u8 EGGSFS_FLASH_STORAGE = 3;
#define EGGSFS_MAX_FILENAME 255
#define EGGSFS_BLOCK_SERVICE_STALE 1u
#define EGGSFS_BLOCK_SERVICE_NO_READ (1u<<1)
#define EGGSFS_BLOCK_SERVICE_NO_WRITE (1u<<2)
#define EGGSFS_BLOCK_SERVICE_DECOMMISSIONED (1u<<3)
#define EGGSFS_BLOCK_SERVICE_DONT_READ (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_READ | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED)
#define EGGSFS_BLOCK_SERVICE_DONT_WRITE (EGGSFS_BLOCK_SERVICE_STALE | EGGSFS_BLOCK_SERVICE_NO_WRITE | EGGSFS_BLOCK_SERVICE_DECOMMISSIONED)
#endif

View File

@@ -18,6 +18,7 @@ struct eggsfs_block_service {
u32 ip2;
u16 port1;
u16 port2;
u8 flags;
};
struct eggsfs_block_socket;

View File

@@ -8,6 +8,7 @@
#include "block.h"
#include "latch.h"
#include "bincode.h"
#include "rs.h"
#define EGGSFS_ROOT_INODE 0x2000000000000000ull
@@ -61,10 +62,6 @@ static inline void eggsfs_span_policies_last(struct eggsfs_span_policies* polici
#define EGGSFS_SPAN_STATUS_FLUSHING 3 // we're flushing this to the block services
#define EGGSFS_SPAN_STATUS_FLUSHING_LAST 4 // we're flushing this to the block services, and it's the last span
#define EGGSFS_MAX_DATA 10
#define EGGSFS_MAX_PARITY 4
#define EGGSFS_MAX_BLOCKS (EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY)
struct eggsfs_transient_span {
struct list_head list;
// Linear list of pages with the body of the span, used when we're still gathering

View File

@@ -58,4 +58,11 @@ typedef long long __v4di __attribute__((vector_size(32)));
})
#define _mm256_gf2p8mul_epi8(a, b) ((__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)(a), (__v32qi)(b)))
// gcc doesn't emit popcnt somehow <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105253>
#define __builtin_popcount(v) ({ \
int c; \
__asm__("popcnt %1, %0" : "=r"(c) : "r"(v)); \
c; \
})
#endif

View File

@@ -3,6 +3,7 @@
extern int eggsfs_debug_output;
#define eggsfs_error_print(fmt, args...) printk(KERN_ERR "eggsfs: %s: " fmt "\n", __func__, ##args)
#define eggsfs_warn_print(fmt, args...) printk(KERN_WARNING "eggsfs: %s: " fmt "\n", __func__, ##args)
#define eggsfs_info_print(fmt, args...) printk(KERN_INFO "eggsfs: %s: " fmt "\n", __func__, ##args)
#define eggsfs_debug_print(fmt, args...) \

View File

@@ -827,7 +827,7 @@ int eggsfs_shard_file_spans(struct eggsfs_fs_info* info, u64 file, u64 offset, u
eggsfs_block_service_get_end(&bs_ctx, bs_flags, end);
eggsfs_bincode_get_finish_list_el(end);
if (likely(bs_ctx.err == 0)) {
eggsfs_file_spans_cb_block(data, j, bs_id.x, ip1.x, port1.x, ip2.x, port2.x, block_id.x, crc.x);
eggsfs_file_spans_cb_block(data, j, bs_id.x, ip1.x, port1.x, ip2.x, port2.x, bs_flags.x, block_id.x, crc.x);
}
blocks_ctx.err = bs_ctx.err;
}

View File

@@ -81,3 +81,8 @@ tmux attach-session -t uovo:1
# ./eggstests -kmod -filter 'mounted fs$' -cfg fsTest.checkThreads=10 -cfg fsTest.numFiles=100 -cfg fsTest.numDirs=10 -short -binaries-dir $(pwd)
# ./eggstests -drop-cached-spans-every 100ms -kmod -filter 'mounted fs$' -cfg fsTest.checkThreads=10 -cfg fsTest.numFiles=100 -cfg fsTest.numDirs=10 -short -binaries-dir $(pwd)
# ./eggstests -kmod -filter 'mounted fs$' -cfg fsTest.checkThreads=100 -cfg fsTest.numFiles=10 -cfg fsTest.numDirs=1 -short -binaries-dir $(pwd)
# sudo sysctl fs.eggsfs.debug=1
# sudo sh -c 'echo eggsfs_fetch_stripe >> /sys/kernel/debug/tracing/set_event'
# sudo cat /sys/kernel/debug/tracing/trace_pipe
# ./eggs/eggstests -kmod -filter 'mounted fs$' -cfg fsTest.checkThreads=1 -cfg fsTest.numFiles=100 -cfg fsTest.numDirs=1 -short -binaries-dir $(pwd)/eggs

176
kmod/rs.c
View File

@@ -1,18 +1,13 @@
#include "rs.h"
#ifdef __KERNEL__
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/fpu/api.h>
#define rs_malloc(sz) kmalloc(sz, GFP_KERNEL)
#define rs_free(p) kfree(p)
#define die(...) BUG()
#include <linux/highmem.h>
#include "log.h"
#endif // __KERNEL__
#define rs_warn(...) eggsfs_error_print(__VA_ARGS__)
#include "intrshims.h"
@@ -50,33 +45,70 @@ int eggsfs_rs_cpu_level = RS_CPU_SCALAR;
rs_compute_parity_gfni(D, P, r, size, data, parity); \
}
#define rs_recover_matmul_scalar_func(D) \
static void rs_recover_matmul_scalar_##D(u64 size, const u8** have, u8* want, const u8* mat) { \
rs_recover_matmul_scalar(D, size, have, want, mat); \
}
#define rs_recover_matmul_avx2_func(D) \
__attribute__((target("avx,avx2"))) \
static void rs_recover_matmul_avx2_##D(u64 size, const u8** have, u8* want, const u8* mat) { \
rs_recover_matmul_avx2(D, size, have, want, mat); \
}
#define rs_recover_matmul_gfni_func(D) \
__attribute__((target("avx,avx2,gfni"))) \
static void rs_recover_matmul_gfni_##D(u64 size, const u8** have, u8* want, const u8* mat) { \
rs_recover_matmul_gfni(D, size, have, want, mat); \
}
#define rs_gen(D, P) \
static struct rs* rs_##D##_##P = NULL; \
static int rs_init_##D##_##P(void) { \
rs_##D##_##P = rs_new_core(eggsfs_mk_parity(D, P)); \
if (rs_##D##_##P == NULL) { \
return -ENOMEM; \
} \
return 0; \
static char rs_##D##_##P_data[RS_SIZE(D, P)]; \
static struct rs* rs_##D##_##P = (struct rs*)rs_##D##_##P_data; \
static void rs_init_##D##_##P(void) { \
rs_new_core(eggsfs_mk_parity(D, P), rs_##D##_##P); \
} \
rs_compute_parity_scalar_func(D, P) \
rs_compute_parity_avx2_func(D, P) \
rs_compute_parity_gfni_func(D, P) \
static void rs_compute_parity_##D##_##P(uint64_t size, const uint8_t** data, uint8_t** parity) { \
static int rs_compute_parity_##D##_##P(uint64_t size, const uint8_t** data, uint8_t** parity) { \
switch (eggsfs_rs_cpu_level) { \
case RS_CPU_SCALAR: \
rs_compute_parity_scalar_##D##_##P(rs_##D##_##P, size, data, parity); \
break; \
return 0; \
case RS_CPU_AVX2: \
rs_compute_parity_avx2_##D##_##P(rs_##D##_##P, size, data, parity); \
break; \
return 0; \
case RS_CPU_GFNI: \
rs_compute_parity_gfni_##D##_##P(rs_##D##_##P, size, data, parity); \
return 0; \
default: \
rs_warn("bad cpu level %d", eggsfs_rs_cpu_level); \
return -EIO; \
} \
} \
rs_recover_matmul_scalar_func(D) \
rs_recover_matmul_avx2_func(D) \
rs_recover_matmul_gfni_func(D)
#define rs_recover_matmul(D) ({ \
void (*fun)(u64 size, const u8** have, u8* want, const u8* mat) = NULL; \
switch (eggsfs_rs_cpu_level) { \
case RS_CPU_SCALAR: \
fun = rs_recover_matmul_scalar_##D; \
break; \
case RS_CPU_AVX2: \
fun = rs_recover_matmul_avx2_##D; \
break; \
case RS_CPU_GFNI: \
fun = rs_recover_matmul_gfni_##D; \
break; \
default: \
die("bad cpu level %d", eggsfs_rs_cpu_level); \
rs_warn("bad cpu level %d", eggsfs_rs_cpu_level); \
break; \
} \
}
fun; \
})
// Right now we don't need anything else, let's not needlessly generate tons of code
rs_gen( 4, 4)
@@ -94,18 +126,104 @@ int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out
return 0;
}
int err = 0;
int err;
if (parity == eggsfs_mk_parity(4, 4)) {
rs_compute_parity_4_4(size, (const u8**)data, (u8**)out);
err = rs_compute_parity_4_4(size, (const u8**)data, (u8**)out);
} else if (parity == eggsfs_mk_parity(10, 4)) {
rs_compute_parity_10_4(size, (const u8**)data, (u8**)out);
err = rs_compute_parity_10_4(size, (const u8**)data, (u8**)out);
} else {
eggsfs_warn_print("cannot compute with RS(%d,%d)", eggsfs_data_blocks(parity), eggsfs_parity_blocks(parity));
rs_warn("cannot compute with RS(%d,%d)", eggsfs_data_blocks(parity), eggsfs_parity_blocks(parity));
err = -EINVAL;
}
return err;
}
// Recover one missing block (want_block, a single-bit mask) from the D blocks
// we already have (have_blocks, a D-bit mask over the B = D+P blocks) using
// Reed-Solomon. `pages` is a B-long array of page lists; the lists for the
// blocks in have_blocks are read, and the recovered data is written into the
// list for want_block (all lists are num_pages long).
// Returns 0 on success, negative errno on failure.
// Does kernel_fpu_begin()/kernel_fpu_end() itself (see rs.h declaration).
int eggsfs_recover(
u8 parity,
u32 have_blocks,
u32 want_block,
u32 num_pages,
struct list_head* pages
) {
int D = eggsfs_data_blocks(parity);
int P = eggsfs_parity_blocks(parity);
int B = eggsfs_blocks(parity);
// Preconditions: at least one data block, some parity, exactly D source
// blocks and exactly one target block.
BUG_ON(D < 1);
BUG_ON(P == 0);
BUG_ON(__builtin_popcount(have_blocks) != D || __builtin_popcount(want_block) != 1);
if (D == 1) { // mirroring, just copy over
u32 i;
// __builtin_ctz picks the index of the single set bit in each mask.
struct list_head* have = &pages[__builtin_ctz(have_blocks)];
struct list_head* want = &pages[__builtin_ctz(want_block)];
// Walk both lists in lockstep, copying page by page; list_rotate_left
// advances each list so list_first_entry yields the next page.
for (i = 0; i < num_pages; i++, list_rotate_left(have), list_rotate_left(want)) {
memcpy(
kmap(list_first_entry(want, struct page, lru)),
kmap(list_first_entry(have, struct page, lru)),
PAGE_SIZE
);
kunmap(list_first_entry(have, struct page, lru));
kunmap(list_first_entry(want, struct page, lru));
}
return 0;
}
// decide which one to do
// Pick the precomputed RS context and the matmul kernel (scalar/AVX2/GFNI,
// chosen by eggsfs_rs_cpu_level via the rs_recover_matmul() macro) for the
// supported parities.
struct rs* rs;
void (*recover_matmul)(u64 size, const u8** have, u8* want, const u8* mat);
if (parity == eggsfs_mk_parity(4, 4)) {
rs = rs_4_4;
recover_matmul = rs_recover_matmul(4);
} else if (parity == eggsfs_mk_parity(10, 4)) {
rs = rs_10_4;
recover_matmul = rs_recover_matmul(10);
} else {
eggsfs_error_print("cannot compute with RS(%d,%d)", D, P);
return -EIO;
}
// rs_recover_matmul() yields NULL on an unknown CPU level.
if (recover_matmul == NULL) {
return -EIO;
}
kernel_fpu_begin();
// compute matrix
// Stack scratch sized for the largest supported D (EGGSFS_MAX_DATA).
u8 mat[RS_RECOVER_MAT_SIZE(EGGSFS_MAX_DATA)];
if (!rs_recover_mat(rs, have_blocks, want_block, mat)) {
kernel_fpu_end();
return -EIO;
}
// compute data
const char* have_bufs[EGGSFS_MAX_DATA];
// NOTE(review): want_buf is only assigned inside the mapping loop; this
// relies on want_block being in-bounds wrt B, which rs_recover_mat has
// already validated above — confirm, otherwise it would be used
// uninitialized.
char* want_buf;
int i, j, b;
for (i = 0; i < num_pages; i++) {
for (b = 0, j = 0; b < B; b++) { // map pages
if ((1u<<b) & have_blocks) {
have_bufs[j] = kmap(list_first_entry(&pages[b], struct page, lru));
j++;
}
if ((1u<<b) & want_block) {
want_buf = kmap(list_first_entry(&pages[b], struct page, lru));
}
}
// want = recover-matrix * have, one PAGE_SIZE slice at a time.
recover_matmul(PAGE_SIZE, (const u8**)have_bufs, want_buf, mat);
for (b = 0; b < B; b++) { // unmap pages, rotate list
if ((1u<<b) & (have_blocks|want_block)) {
kunmap(list_first_entry(&pages[b], struct page, lru));
list_rotate_left(&pages[b]);
}
}
}
kernel_fpu_end();
return 0;
}
int __init eggsfs_rs_init(void) {
if (rs_has_cpu_level_core(RS_CPU_GFNI)) {
eggsfs_info_print("picking GFNI");
@@ -118,18 +236,12 @@ int __init eggsfs_rs_init(void) {
eggsfs_rs_cpu_level = RS_CPU_SCALAR;
}
int err;
err = rs_init_4_4();
if (err < 0) { return err; }
err = rs_init_10_4();
if (err < 0) { rs_free(rs_4_4); return err; }
rs_init_4_4();
rs_init_10_4();
return 0;
}
void __cold eggsfs_rs_exit(void) {
rs_free(rs_4_4);
rs_free(rs_10_4);
}
void __cold eggsfs_rs_exit(void) {}
#include "gf_tables.c"

View File

@@ -1,9 +1,11 @@
#ifndef _EGGSFS_RS_H
#define _EGGSFS_RS_H
#ifdef __KERNEL__
#include <linux/kernel.h>
#endif
#define EGGSFS_MAX_DATA 10
#define EGGSFS_MAX_PARITY 4
#define EGGSFS_MAX_BLOCKS (EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY)
extern int eggsfs_rs_cpu_level;
@@ -29,6 +31,18 @@ static inline u8 eggsfs_mk_parity(u8 data, u8 parity) {
// You _must_ wrap this with kernel_fpu_begin()/kernel_fpu_end()!
int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out);
// This function does kernel_fpu_begin()/kernel_fpu_end() itself, since it already
// works with pages.
int eggsfs_recover(
u8 parity,
u32 have_blocks,
u32 want_block,
u32 num_pages,
// B long array of pages. We'll get the pages from the blocks we have and
// put them in the block we want. The lists should all be of the same length.
struct list_head* pages
);
int __init eggsfs_rs_init(void);
void __cold eggsfs_rs_exit(void);

View File

@@ -1,47 +0,0 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
#define die(...) do { fprintf(stderr, __VA_ARGS__); exit(1); } while(false)
#define rs_malloc malloc
#define rs_free free
#define eggsfs_warn_print(fmt, args...) fprintf(stderr, "%s: " fmt "\n", __func__, ##args)
#define eggsfs_info_print(fmt, args...) fprintf(stderr, "%s: " fmt "\n", __func__, ##args)
#define __init
#define __cold
#include "rs.h"
#include "rs.c"
#define DATA_BLOCKS 10
#define PARITY_BLOCKS 4
#define BLOCKS (10+4)
#define PAGE_SIZE 4096
// Smoke test: allocate BLOCKS page-sized buffers and run parity computation
// for RS(DATA_BLOCKS, PARITY_BLOCKS) at every non-scalar CPU level.
// (NOTE(review): this file is deleted by this commit.)
int main(void) {
eggsfs_rs_init();
// One contiguous allocation, sliced into BLOCKS page-sized buffers.
// NOTE(review): buffers are intentionally left uninitialized — the test
// only checks that the parity computation runs, not its output.
char* blocks = malloc(PAGE_SIZE*BLOCKS);
if (!blocks) { die("couldn't allocate"); }
char* ptrs[BLOCKS];
int i;
for (i = 0; i < BLOCKS; i++) {
ptrs[i] = blocks + PAGE_SIZE*i;
}
// Exercise each CPU level (1..3, i.e. the levels above RS_CPU_SCALAR's 0 —
// TODO confirm the enum values).
for (i = 1; i < 4; i++) {
eggsfs_rs_cpu_level = i;
// First DATA_BLOCKS pointers are data inputs, the rest receive parity.
int res = eggsfs_compute_parity(eggsfs_mk_parity(DATA_BLOCKS, PARITY_BLOCKS), PAGE_SIZE, (const char**)ptrs, ptrs + DATA_BLOCKS);
if (res < 0) {
die("couldn't compute: %d", res);
}
}
return 0;
}

View File

@@ -9,6 +9,7 @@
#include "counter.h"
#include "wq.h"
#include "trace.h"
#include "intrshims.h"
EGGSFS_DEFINE_COUNTER(eggsfs_stat_cached_spans);
@@ -132,7 +133,7 @@ void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 sto
void eggsfs_file_spans_cb_block(
void* data, int block_ix,
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2, u8 flags,
u64 block_id, u32 crc
) {
struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
@@ -147,6 +148,7 @@ void eggsfs_file_spans_cb_block(
block->bs.port1 = port1;
block->bs.ip2 = ip2;
block->bs.port2 = port2;
block->bs.flags = flags;
block->id = block_id;
block->crc = crc;
}
@@ -240,7 +242,6 @@ retry:
// adding it to the LRU.
if (unlikely(iterations == 10)) {
eggsfs_warn_print("we've been fetching the same span for %llu iterations, we're probably stuck on a yet-to-be enabled span we just fetched", iterations);
GET_SPAN_EXIT(ERR_PTR(-EIO));
}
// Try to read the semaphore if it's already there.
@@ -536,13 +537,18 @@ again:
struct eggsfs_fetch_stripe_state {
struct semaphore sema; // used to wait on remaining = 0. could be done faster with a wait_queue, but don't want to worry about smp subtleties.
s64 stripe_seqno; // used to release the stripe when we're done with it
struct eggsfs_block_socket* socks[EGGSFS_MAX_DATA];
struct eggsfs_block_socket* socks[EGGSFS_MAX_BLOCKS];
struct eggsfs_block_span* span; // what we're working on (must not be put until the end of the fetch)
struct list_head blocks_pages[EGGSFS_MAX_DATA]; // these will be filled in by the fetching
u32 block_crcs[EGGSFS_MAX_DATA];
// these will be filled in by the fetching (only D of them will be
// filled by fetching the blocks, the others might be filled in by the RS
// recovery)
struct list_head blocks_pages[EGGSFS_MAX_BLOCKS];
u32 block_crcs[EGGSFS_MAX_BLOCKS];
atomic_t remaining; // how many block fetch requests are still ongoing
atomic_t refcount; // to garbage collect the state
atomic_t err; // the result of the stripe fetching
static_assert(EGGSFS_MAX_BLOCKS <= 16);
u16 blocks; // which blocks we're fetching, bitmap
u8 stripe; // which stripe we're into
u8 prefetching; // whether we're prefetching (if we are we'll have to dispose of the span ourselves)
};
@@ -555,13 +561,14 @@ static void eggsfs_put_fetch_stripe(struct eggsfs_fetch_stripe_state* st) {
trace_eggsfs_fetch_stripe(enode->inode.i_ino, st->span->span.start, st->stripe, st->prefetching, EGGSFS_FETCH_STRIPE_FREE, atomic_read(&st->err));
struct eggsfs_block_span* span = st->span;
int D = eggsfs_data_blocks(span->parity);
int B = eggsfs_blocks(span->parity);
int i;
for (i = 0; i < D; i++) {
if (st->socks[i]) {
eggsfs_put_fetch_block_socket(st->socks[i]);
}
}
for (i = 0; i < D; i++) {
for (i = 0; i < B; i++) {
put_pages_list(&st->blocks_pages[i]);
}
if (st->prefetching) {
@@ -573,19 +580,78 @@ static void eggsfs_put_fetch_stripe(struct eggsfs_fetch_stripe_state* st) {
}
}
// Called once all block fetches for a stripe are done: if some data blocks
// are missing (st->blocks is not the first-D-bits mask), allocate pages for
// them and recover their contents via eggsfs_recover(); then move every data
// page into the span's page xarray. On any failure the error is recorded in
// st->err and the function returns early (pages still on the lists are freed
// later by eggsfs_put_fetch_stripe).
static void eggsfs_fetch_stripe_store_pages(struct eggsfs_fetch_stripe_state* st) {
struct eggsfs_block_span* span = st->span;
int D = eggsfs_data_blocks(span->parity);
int i, j;
// we need to recover data using RS
if (unlikely(st->blocks != (1u<<D)-1)) {
u32 pages = 0;
{ // get the number of pages
struct list_head* tmp;
// eggsfs_info_print("picking %d", __builtin_ctz(st->blocks));
// All per-block lists are the same length, so counting the first
// fetched block's list is enough.
list_for_each(tmp, &st->blocks_pages[__builtin_ctz(st->blocks)]) { // just pick the first one
pages++;
}
}
for (i = 0; i < D; i++) { // fill in missing data blocks
if ((1u<<i) & st->blocks) { continue; } // we have this one already
// eggsfs_info_print("recovering %d, pages=%u", i, pages);
// allocate the pages
struct list_head* i_pages = &st->blocks_pages[i];
for (j = 0; j < pages; j++) {
struct page* page = alloc_page(GFP_KERNEL);
if (!page) {
atomic_set(&st->err, -ENOMEM);
return;
}
list_add_tail(&page->lru, i_pages);
}
// compute
// Recover block i from the blocks we actually fetched (st->blocks,
// which may include parity blocks).
int err = eggsfs_recover(span->parity, st->blocks, 1u<<i, pages, st->blocks_pages);
if (err) {
atomic_set(&st->err, err);
return;
}
}
}
// Page index range of this stripe inside the span's page cache.
u32 start_page = (span->cell_size/PAGE_SIZE)*D*st->stripe;
u32 end_page = (span->cell_size/PAGE_SIZE)*D*((int)st->stripe + 1);
u32 curr_page = start_page;
// move the pages to the span
for (i = 0; i < D; i++) {
struct page* page;
struct page* tmp;
list_for_each_entry_safe(page, tmp, &st->blocks_pages[i], lru) {
list_del(&page->lru);
// eggsfs_info_print("storing page for span %p at %u", span, curr_page);
struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
if (IS_ERR(old_page)) {
atomic_set(&st->err, PTR_ERR(old_page));
eggsfs_debug_print("xa_store failed: %ld", PTR_ERR(old_page));
put_page(page); // we removed it from the list
return;
}
BUG_ON(old_page != NULL); // the latch protects against this
atomic64_inc(&eggsfs_stat_cached_span_pages);
curr_page++;
}
// eggsfs_info_print("i=%d curr_page=%u", i, curr_page);
}
// eggsfs_info_print("curr_page=%u end_page=%u", curr_page, end_page);
// Sanity check: we must have stored exactly one stripe's worth of pages.
BUG_ON(curr_page != end_page);
}
static void eggsfs_fetch_stripe_block_done(struct eggsfs_fetch_block_complete* complete, void* data) {
eggsfs_debug_print("block complete %016llx", complete->block_id);
struct eggsfs_fetch_stripe_state* st = (struct eggsfs_fetch_stripe_state*)data;
struct eggsfs_block_span* span = st->span;
struct eggsfs_inode* enode = span->span.enode;
int D = eggsfs_data_blocks(span->parity);
int i;
for (i = 0; i < D; i++) {
if (st->span->blocks[i].id == complete->block_id) { break; }
}
eggsfs_debug_print("block ix %d", i);
BUG_ON(i == D);
int B = eggsfs_blocks(span->parity);
trace_eggsfs_fetch_stripe(
enode->inode.i_ino,
@@ -596,6 +662,13 @@ static void eggsfs_fetch_stripe_block_done(struct eggsfs_fetch_block_complete* c
complete->err
);
int i;
for (i = 0; i < B; i++) {
if (st->span->blocks[i].id == complete->block_id) { break; }
}
// eggsfs_info_print("block ix %d, B=%d", i, B);
BUG_ON(i == B);
// It failed, mark it
if (complete->err) {
eggsfs_debug_print("block %016llx ix %d failed with %d", complete->block_id, i, complete->err);
@@ -618,29 +691,7 @@ out:
eggsfs_debug_print("remaining=%d", remaining);
if (remaining == 0) {
// we're the last one to finish, we need to store the pages in the span
u32 start_page = (span->cell_size/PAGE_SIZE)*D*st->stripe;
u32 end_page = (span->cell_size/PAGE_SIZE)*D*((int)st->stripe + 1);
u32 curr_page = start_page;
for (i = 0; i < D; i++) {
struct page* page;
struct page* tmp;
list_for_each_entry_safe(page, tmp, &st->blocks_pages[i], lru) {
list_del(&page->lru);
// eggsfs_info_print("storing page for span %p at %u", span, curr_page);
struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
if (IS_ERR(old_page)) {
atomic_set(&st->err, PTR_ERR(old_page));
eggsfs_debug_print("xa_store failed: %ld", PTR_ERR(old_page));
put_page(page); // we removed it from the list
goto out_page_put_err;
}
BUG_ON(old_page != NULL); // the latch protects against this
atomic64_inc(&eggsfs_stat_cached_span_pages);
curr_page++;
}
}
BUG_ON(curr_page != end_page);
out_page_put_err:
eggsfs_fetch_stripe_store_pages(st);
// release the stripe latch, we're done writing
eggsfs_latch_release(&span->stripe_latches[st->stripe], st->stripe_seqno);
// release the span too, if we were prefetching
@@ -666,19 +717,52 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe(
) {
int err;
struct eggsfs_inode* enode = span->span.enode;
struct eggsfs_fetch_stripe_state* st = kmalloc(sizeof(struct eggsfs_fetch_stripe_state), GFP_KERNEL);
if (!st) { return ERR_PTR(-ENOMEM); }
struct eggsfs_fetch_stripe_state* st = NULL;
int i;
trace_eggsfs_fetch_stripe(enode->inode.i_ino, span->span.start, stripe, prefetching, EGGSFS_FETCH_STRIPE_START, 0);
int D = eggsfs_data_blocks(span->parity);
int B = eggsfs_blocks(span->parity);
if (D > EGGSFS_MAX_DATA || B > EGGSFS_MAX_BLOCKS) {
eggsfs_error_print("got out of bounds parity of RS(%d,%d) for span %llu in file %016lx", D, B-D, span->span.start, enode->inode.i_ino);
err = -EIO;
goto out_err;
}
// If the span does not have enough available blocks, we're in trouble
u32 blocks = 0;
#if 1
{
int found_blocks = 0;
for (i = 0; i < B && found_blocks < D; i++) {
if (likely(!(span->blocks[i].bs.flags & EGGSFS_BLOCK_SERVICE_DONT_READ))) {
blocks |= 1u<<i;
found_blocks++;
}
}
if (found_blocks < D) {
eggsfs_warn_print("could only find %d blocks for span %llu in file %016lx", found_blocks, span->span.start, enode->inode.i_ino);
err = -EIO;
goto out_err;
}
}
#else
// Randomly drop some blocks, useful for testing
while (__builtin_popcount(blocks) < D) {
uint32_t r;
get_random_bytes(&r, sizeof(r));
blocks |= 1u << (r%B);
}
#endif
st = kmalloc(sizeof(struct eggsfs_fetch_stripe_state), GFP_KERNEL);
if (!st) { err = -ENOMEM; goto out_err; }
sema_init(&st->sema, 0);
st->stripe_seqno = stripe_seqno;
st->span = span;
int i;
for (i = 0; i < D; i++) {
for (i = 0; i < B; i++) {
INIT_LIST_HEAD(&st->blocks_pages[i]);
}
atomic_set(&st->remaining, D);
@@ -690,10 +774,14 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe(
if (st->prefetching) {
atomic_inc_return(&enode->file.prefetches);
}
st->blocks = blocks;
// get block service sockets
eggsfs_debug_print("getting sockets");
memset(&st->socks[0], 0, sizeof(st->socks));
for (i = 0; i < D; i++) {
u32 block;
for (i = 0, block = blocks; block; block >>=1, i++) {
if ((block&1u) == 0) { continue; }
st->socks[i] = eggsfs_get_fetch_block_socket(&span->blocks[i].bs);
if (IS_ERR(st->socks[i])) {
err = PTR_ERR(st->socks[i]);
@@ -703,8 +791,10 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe(
}
// start fetching blocks
eggsfs_debug_print("fetching blocks");
u32 block_offset = stripe * span->cell_size;
for (i = 0; i < D; i++) {
for (i = 0, block = blocks; block; block >>= 1, i++) {
if ((block&1u) == 0) { continue; }
struct eggsfs_block* block = &span->blocks[i];
atomic_inc(&st->refcount);
struct eggsfs_fetch_block_request* req = eggsfs_fetch_block(
@@ -720,11 +810,12 @@ static struct eggsfs_fetch_stripe_state* eggsfs_start_fetch_stripe(
}
}
eggsfs_debug_print("waiting");
return st;
out_err:
trace_eggsfs_fetch_stripe(enode->inode.i_ino, span->span.start, stripe, prefetching, EGGSFS_FETCH_STRIPE_END, err);
eggsfs_put_fetch_stripe(st); // can't free, there might be queued things already using this
if (st) { eggsfs_put_fetch_stripe(st); }
return ERR_PTR(err);
}
@@ -903,6 +994,7 @@ again:
GET_PAGE_EXIT(page);
}
eggsfs_debug_print("about to start fetch stripe");
struct eggsfs_fetch_stripe_state* st = NULL;
st = eggsfs_start_fetch_stripe(span, stripe, seqno, false);
if (IS_ERR(st)) {
@@ -910,11 +1002,14 @@ again:
st = NULL;
goto out_err;
}
eggsfs_debug_print("started fetch stripe, waiting");
// Wait for all the block requests
err = down_killable(&st->sema);
if (err) { goto out_err; }
eggsfs_debug_print("done");
err = atomic_read(&st->err);
if (err) { goto out_err; }

View File

@@ -77,11 +77,15 @@ void eggsfs_span_put(struct eggsfs_span* span, bool was_read);
// The page_ix is the page number inside the span.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix);
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs);
void eggsfs_file_spans_cb_span(
void* data, u64 offset, u32 size, u32 crc,
u8 storage_class, u8 parity, u8 stripes, u32 cell_size,
const uint32_t* stripes_crcs
);
void eggsfs_file_spans_cb_block(
void* data, int block_ix,
// block service stuff
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2, u8 flags,
// block stuff
u64 block_id, u32 crc
);

View File

@@ -65,6 +65,13 @@ static struct ctl_table eggsfs_cb_sysctls[] = {
.proc_handler = eggsfs_drop_spans_sysctl,
},
#if 0
{
.procname = "refresh_info",
.data = &refresh_info,
}
#endif
EGGSFS_CTL_INT_TIME(dir_refresh_time),
EGGSFS_CTL_ULONG(span_cache_max_size_async),