First version of kernel module

Initial version largely by Pawel, with many changes made since.

Big outstanding issues:

* span cache reclamation (unbounded memory otherwise...)
* bad block service detection and workarounds
* corrupted block detection and workarounds

Co-authored-by: Paweł Dziepak <pawel.dziepak@xtxmarkets.com>
This commit is contained in:
Francesco Mazzoli
2022-12-01 15:09:50 +00:00
parent d94b582a4d
commit 6addbdee6a
96 changed files with 16355 additions and 2926 deletions
+7
View File
@@ -11,5 +11,12 @@
"go/runeggs/runeggs": true,
"go/gcdaemon/gcdaemon": true,
"go/cli/cli": true,
"kmod/*.cmd": true,
"kmod/*.o": true,
"kmod/*.ko": true,
"kmod/*.mod": true,
"kmod/*.mod.c": true,
"kmod/Module.symvers": true,
"kmod/modules.order": true,
}
}
+1
View File
@@ -6,6 +6,7 @@ set -eu -o pipefail
# build C++, all variants
./cpp/build.py alpine
./cpp/build.py alpine-debug
./cpp/build.py release
./cpp/build.py debug
./cpp/build.py sanitized
+2 -2
View File
@@ -59,7 +59,6 @@
#define EGGSFS_META_LOOKUP 0x1
#define EGGSFS_META_STAT_FILE 0x2
#define EGGSFS_META_STAT_TRANSIENT_FILE 0x3
#define EGGSFS_META_STAT_DIRECTORY 0x4
#define EGGSFS_META_READ_DIR 0x5
#define EGGSFS_META_CONSTRUCT_FILE 0x6
@@ -69,10 +68,11 @@
#define EGGSFS_META_SOFT_UNLINK_FILE 0xA
#define EGGSFS_META_FILE_SPANS 0xB
#define EGGSFS_META_SAME_DIRECTORY_RENAME 0xC
#define EGGSFS_META_ADD_INLINE_SPAN 0x10
#define EGGSFS_META_STAT_TRANSIENT_FILE 0x3
#define EGGSFS_META_SET_DIRECTORY_INFO 0xD
#define EGGSFS_META_SNAPSHOT_LOOKUP 0xE
#define EGGSFS_META_EXPIRE_TRANSIENT_FILE 0xF
#define EGGSFS_META_ADD_INLINE_SPAN 0x10
#define EGGSFS_META_VISIT_DIRECTORIES 0x70
#define EGGSFS_META_VISIT_FILES 0x71
#define EGGSFS_META_VISIT_TRANSIENT_FILES 0x72
+7 -7
View File
@@ -16,8 +16,8 @@ if(NOT CMAKE_BUILD_TYPE)
)
endif()
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, got ${CMAKE_BUILD_TYPE}")
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|alpine-debug|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, alpine-debug got ${CMAKE_BUILD_TYPE}")
endif()
set(CMAKE_CXX_STANDARD 20)
@@ -33,13 +33,13 @@ add_compile_options("$<$<CONFIG:valgrind>:-march=haswell;-maes;-mgfni>")
add_compile_options("$<$<NOT:$<CONFIG:valgrind>>:-march=skylake;-mgfni>")
# performance/debug stuff
add_compile_options("$<$<NOT:$<CONFIG:debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug>:-O;-DEGGS_DEBUG>")
add_compile_options("$<$<NOT:$<CONFIG:debug,alpine-debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug,alpine-debug>:-O;-DEGGS_DEBUG>")
# We build the release build statically in Alpine
add_compile_options("$<$<CONFIG:alpine>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine>>:-no-pie>")
add_compile_options("$<$<CONFIG:alpine,alpine-debug>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine,alpine-debug>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine,alpine-debug>>:-no-pie>")
# sanitizer options
set(SANITIZE_OPTIONS "-fsanitize=undefined,address,integer,function;-fno-sanitize-recover=all;-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/ubsan-ignorelist")
+3 -3
View File
@@ -5,7 +5,7 @@ from pathlib import Path
import subprocess
if len(sys.argv) < 2:
print(f'Usage: {sys.argv[0]} release|alpine|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
print(f'Usage: {sys.argv[0]} release|alpine|alpine-debug|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
sys.exit(2)
if len(sys.argv) == 1:
@@ -19,9 +19,9 @@ repo_dir = cpp_dir.parent
build_dir = cpp_dir / 'build' / build_type
build_dir.mkdir(parents=True, exist_ok=True)
if build_type == 'alpine' and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
if build_type in ('alpine', 'alpine-debug') and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
subprocess.run(
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', 'alpine'] + sys.argv[2:],
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', build_type] + sys.argv[2:],
check=True,
)
else:
+1 -1
View File
@@ -264,7 +264,6 @@ struct MakeDirectoryStateMachine {
void createDirectoryInode() {
auto& shardReq = env.needsShard(MAKE_DIRECTORY_CREATE_DIR, state.dirId().shard()).setCreateDirectoryInode();
shardReq.id = state.dirId();
shardReq.info = req.info;
shardReq.ownerId = req.ownerId;
}
@@ -738,6 +737,7 @@ struct SoftUnlinkDirectoryStateMachine {
shardReq.name = req.name;
shardReq.targetId = req.targetId;
shardReq.wasMoved = false;
shardReq.creationTime = req.creationTime;
}
void afterRollback(EggsError err, const ShardRespContainer* resp) {
-20
View File
@@ -3,26 +3,6 @@
// Prints the referenced bytes in Go []byte style by delegating to the shared
// goLangBytesFmt helper. The commented-out block below is the retired inline
// implementation, kept for reference.
std::ostream& operator<<(std::ostream& out, const BincodeBytesRef& x) {
    return goLangBytesFmt(out, x.data(), x.size());
    /*
    out << "b\"";
    uint8_t len = x.size();
    const uint8_t* data = (const uint8_t*)x.data();
    for (int i = 0; i < len; i++) {
        uint8_t ch = data[i];
        if (isprint(ch)) {
            out << ch;
        } else if (ch == 0) {
            out << "\\0";
        } else {
            const char cfill = out.fill();
            out << std::hex << std::setfill('0');
            out << "\\x" << std::setw(2) << (int)ch;
            out << std::setfill(cfill) << std::dec;
        }
    }
    out << "\"";
    return out;
    */
}
std::ostream& operator<<(std::ostream& out, const BincodeBytes& x) {
+24
View File
@@ -4,6 +4,7 @@
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
#include <iomanip>
// Throwing in static initialization is nasty, and there is no useful stacktrace
// Also use direct syscalls to write the error as iostream might not be initialized
@@ -48,4 +49,27 @@ std::ostream& goLangBytesFmt(std::ostream& out, const char* str, size_t len) {
// Stream-manipulator entry point: `out << GoLangBytesFmt(p, n)` prints the
// wrapped bytes via goLangBytesFmt.
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes) {
    return goLangBytesFmt(out, bytes.str, bytes.len);
}
// Writes `data[0..len)` as a double-quoted string in Go's style: printable
// characters verbatim, NUL as \0, and every other byte as \xHH (two lowercase
// hex digits). Restores the stream's fill character and decimal base after
// each escape, and returns the stream for chaining.
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* data, size_t len) {
    out << "\"";
    // size_t index: the original `int i` compared signed against the unsigned
    // `len`, which misbehaves for lengths above INT_MAX.
    for (size_t i = 0; i < len; i++) {
        uint8_t ch = data[i];
        if (isprint(ch)) {
            out << ch;
        } else if (ch == 0) {
            out << "\\0";
        } else {
            const char cfill = out.fill();
            out << std::hex << std::setfill('0');
            out << "\\x" << std::setw(2) << (int)ch;
            out << std::setfill(cfill) << std::dec;
        }
    }
    out << "\"";
    return out;
}
// Stream-manipulator entry point: `out << GoLangQuotedStringFmt(p, n)` prints
// the wrapped bytes as a quoted string via goLangQuotedStringFmt.
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes) {
    return goLangQuotedStringFmt(out, bytes.str, bytes.len);
}
+12 -1
View File
@@ -52,4 +52,15 @@ struct GoLangBytesFmt {
GoLangBytesFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
// Formats `str[0..len)` as a Go-style double-quoted string: printable
// characters verbatim, NUL as \0, other bytes as \xHH.
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* str, size_t len);

// Manipulator wrapper so the quoted formatting composes with operator<<.
// Holds a borrowed, non-owning view of the bytes -- the pointed-to buffer
// must outlive the wrapper.
struct GoLangQuotedStringFmt {
    const char* str;
    size_t len;
    GoLangQuotedStringFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes);
+987 -987
View File
File diff suppressed because it is too large Load Diff
+546 -546
View File
File diff suppressed because it is too large Load Diff
+2 -2
View File
@@ -1,6 +1,6 @@
add_library(crc32c crc32c.h crc32c.cpp iscsi.hpp)
add_library(crc32c crc32c.h crc32c.c iscsi.h)
add_executable(crc32c-tables tables.cpp iscsi.hpp)
add_executable(crc32c-tables tables.cpp iscsi.h)
add_executable(crc32c-tests tests.cpp)
target_link_libraries(crc32c-tests PRIVATE crc32c)
+63 -23
View File
@@ -1,11 +1,29 @@
#ifndef __KERNEL__
#include "crc32c.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#define kernel_static
typedef uint8_t u8;
typedef uint32_t u32;
#else
#define kernel_static static
#endif
#ifdef __clang__
__attribute__((no_sanitize("integer")))
static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_blocks) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32_fusion_kernel(u32 acc_a, const char* buf, size_t n_blocks) {
size_t stride = n_blocks * 24 + 8;
// Four chunks:
// Chunk A: 0 through stride
@@ -18,8 +36,8 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
__m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
__m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
__m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
uint32_t acc_b = 0;
uint32_t acc_c = 0;
u32 acc_b = 0;
u32 acc_c = 0;
// Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
// k1 == magic(4*128+32-1)
// k2 == magic(4*128-32-1)
@@ -100,16 +118,16 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
stack_a = ~stack_a;
stack_b = ~stack_b;
stack_c = ~stack_c;
uint32_t magic_a = ((uint32_t)0x80000000) >> (bits_a & 31); bits_a >>= 5;
uint32_t magic_b = ((uint32_t)0x80000000) >> (bits_b & 31); bits_b >>= 5;
uint32_t magic_c = ((uint32_t)0x80000000) >> (bits_c & 31); bits_c >>= 5;
u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
bits_a -= bits_b;
bits_b -= bits_c;
for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
for (;;) {
uint32_t low = stack_a & 1;
u32 low = stack_a & 1;
if (!(stack_a >>= 1)) break;
__m128i x = _mm_cvtsi32_si128(magic_a);
uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
@@ -130,13 +148,18 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
x1 = _mm_xor_si128(x1, x5);
uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
// Apply missing <<32 and fold down to 32-bits.
uint32_t crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
return crc;
}
#ifdef __clang__
__attribute__((no_sanitize("integer")))
uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c(u32 crc, const char* buf, size_t length) {
crc = ~crc; // preset to -1 (distinguish leading zeros)
if (length >= 31) {
size_t n_blocks = (length - 16) / 136;
@@ -149,7 +172,7 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
const char* kernel_start = kernel_end - kernel_length;
length -= kernel_start - buf;
for (; buf != kernel_start; ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
if (n_blocks) {
length -= kernel_length;
@@ -161,17 +184,34 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
}
for (; length; --length, ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
return ~crc; // post-invert -- distinguish scaled multiples
}
#include "iscsi.hpp"
#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
// Straightforward CRC32-C of `buf[0..length)` using the SSE4.2 crc32
// instruction: eight bytes per step, then the tail byte by byte. The
// pre/post bit inversion follows the usual CRC32-C convention (same as
// the comments in crc32c above).
kernel_static u32 crc32c_simple(u32 crc, const char* buf, size_t length) {
    crc = ~crc; // pre-invert: distinguishes inputs with leading zeros
    while (length >= 8) {
        crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
        buf += 8;
        length -= 8;
    }
    while (length != 0) {
        crc = _mm_crc32_u8(crc, *(const u8*)buf);
        ++buf;
        --length;
    }
    return ~crc; // post-invert: distinguishes scaled multiples
}
#include "iscsi.h"
// In the comments below, multiplication is meant to be modulo ISCSI_POLY.
// Stores x^2^0, ..., x^2^31. Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_POWER_TABLE[31] = {
static u32 CRC_POWER_TABLE[31] = {
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a,
@@ -182,7 +222,7 @@ static uint32_t CRC_POWER_TABLE[31] = {
};
// Stores x^-(2^0), ..., x^-(2^31). Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
static u32 CRC_INVERSE_POWER_TABLE[31] = {
0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96,
0xcedac2cc, 0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916,
@@ -193,7 +233,7 @@ static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
};
// Return x^(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p(size_t n, uint32_t k) {
static u32 x2n_mod_p(size_t n, u32 k) {
// CRC_POWER_TABLE holds all powers of two; we can multiply combinations
// of these to achieve any power.
@@ -210,7 +250,7 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
// We walk along the p_is above and keep adding to the result.
//
// Note that 2^2^(31 + k) = 2^2^k TODO clarify
uint32_t p = 1u << 31;
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
// p(x) = p(x) * 2^(k % 31)
@@ -221,8 +261,8 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
}
// Return x^-(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
uint32_t p = 1u << 31;
static u32 x2n_mod_p_inv(size_t n, u32 k) {
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
p = crc32c_mult_mod_p(CRC_INVERSE_POWER_TABLE[k & 0x1F], p);
@@ -231,7 +271,7 @@ static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
return p;
}
uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
kernel_static u32 crc32c_zero_extend(u32 crc, ssize_t zeros) {
if (zeros > 0) {
return ~crc32c_mult_mod_p(x2n_mod_p(zeros, 3), ~crc);
} else {
@@ -239,17 +279,17 @@ uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
}
}
uint32_t crc32c_append(uint32_t crc_a, uint32_t crc_b, size_t len_b) {
// Combines two CRCs: given crc(A), crc(B), and len(B) in bytes, returns
// crc(A || B) without touching the underlying data.
kernel_static u32 crc32c_append(u32 crc_a, u32 crc_b, size_t len_b) {
    // We need to extend crc_a with len_b*8 zeros (len_b is the number
    // of bytes) (without the inversion).
    // This amounts to performing `crc_a * x^(len_b*8)`.
    return crc32c_mult_mod_p(x2n_mod_p(len_b, 3), crc_a) ^ crc_b;
}
uint32_t crc32c_xor(uint32_t crc_a, uint32_t crc_b, size_t len) {
kernel_static u32 crc32c_xor(u32 crc_a, u32 crc_b, size_t len) {
// We need to extend crc_a with the crc of len*8 bits.
// We could do this in 32 steps rather than 32+32 with a dedicated
// table, but probably doesn't matter.
uint32_t crc_0 = ~crc32c_mult_mod_p(~(uint32_t)0, x2n_mod_p(len, 3));
u32 crc_0 = ~crc32c_mult_mod_p(~(u32)0, x2n_mod_p(len, 3));
return crc_a ^ crc_b ^ crc_0;
}
}
+24
View File
@@ -0,0 +1,24 @@
#define ISCSI_POLY 0x82F63B78u

// Returns a(x) multiplied by b(x) modulo the (bit-reflected) iSCSI CRC
// polynomial. For speed, this requires that a(x) not be zero: the loop
// terminates by spotting a's last set coefficient.
static u32 crc32c_mult_mod_p(u32 a, u32 b) {
    u32 prod = 0;
    // `mask` walks a's coefficients from x^0 (bit 31 in the reflected
    // layout) towards x^31, while `b` tracks b(x)*x^n.
    u32 mask = 1u << 31;
    while (true) {
        if (a & mask) {
            // a(x) contains x^n: accumulate b(x)*x^n.
            prod ^= b;
            if ((a & (mask - 1)) == 0) {
                break; // no lower coefficients remain in a(x)
            }
        }
        mask >>= 1;
        // b(x) *= x, reducing modulo ISCSI_POLY when a bit shifts out.
        b = (b >> 1) ^ ((b & 1) ? ISCSI_POLY : 0u);
    }
    return prod;
}
+1
View File
@@ -30,6 +30,7 @@ int main() {
for (int i = 0; i < 1000; i++) {
auto s1 = randString(1 + wyhash64(&rand)%100);
uint32_t crc1 = crc32c(0, (const char*)s1.data(), s1.size());
ASSERT(crc1 == crc32c_append(0, crc1, s1.size()));
auto s2 = randString(1 + wyhash64(&rand)%100);
uint32_t crc2 = crc32c(0, (const char*)s2.data(), s2.size());
std::vector<uint8_t> s = s1;
+1 -1
View File
@@ -1,4 +1,4 @@
add_library(rs rs.h rs.cpp gf.hpp gf_tables.cpp)
add_library(rs rs.h rs.cpp gf_tables.c)
add_executable(rs-tests tests.cpp)
target_link_libraries(rs-tests PRIVATE rs)
-110
View File
@@ -1,110 +0,0 @@
#pragma once
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
// GF(2^8) arithmetic tables, defined in gf_tables.c (generated by gf_tables.py).
extern const uint8_t rs_gf_inv_table[256];
extern const uint8_t rs_gf_log_table[256];
extern const uint8_t rs_gf_exp_table[256];

// Multiplicative inverse via table lookup. gf_inv(0) returns whatever the
// table stores for index 0 (zero has no true inverse).
inline uint8_t gf_inv(uint8_t x) {
    return rs_gf_inv_table[x];
}

// Field multiplication via log/exp tables: exp(log x + log y), with the
// exponent folded back into range (the multiplicative group has order 255).
inline uint8_t gf_mul(uint8_t x, uint8_t y) {
    if (x == 0 || y == 0) {
        return 0;
    }
    int i = rs_gf_log_table[x] + rs_gf_log_table[y];
    return rs_gf_exp_table[i > 254 ? i - 255 : i];
}

// Builds a 32-byte lookup table for multiplying arbitrary bytes by `x`:
// entries 0..15 hold gf_mul(nibble, x), entries 16..31 hold
// gf_mul(nibble << 4, x). A full product is then the XOR of two lookups.
inline void gf_mul_expand_factor(uint8_t x, uint8_t* expanded_x) {
    for (int i = 0; i < 16; i++) {
        expanded_x[i] = gf_mul(i, x);
    }
    for (int i = 0; i < 16; i++) {
        expanded_x[16 + i] = gf_mul(i << 4, x);
    }
}

// Multiplies `x` by the factor captured in `expanded_y`
// (see gf_mul_expand_factor): low-nibble lookup XOR high-nibble lookup.
inline uint8_t gf_mul_expanded(uint8_t x, const uint8_t* expanded_y) {
    return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}

// AVX2 version of gf_mul_expanded: 32 products at once using two byte-shuffle
// table lookups (_mm256_shuffle_epi8) and an XOR.
inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
    __m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
    __m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
    __m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
    __m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
    return _mm256_xor_si256(
        _mm256_shuffle_epi8(expanded_y_lo, x_lo),
        _mm256_shuffle_epi8(expanded_y_hi, x_hi)
    );
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// so that the final operation in rs_recover becomes more natural.
//
// Inverts the n x n GF(2^8) matrix `in_mat` into `out_mat` by Gauss-Jordan
// elimination with row-swap pivoting. NOTE: `in_mat` is destroyed (reduced
// towards the identity). Returns false if the matrix is singular.
__attribute__((noinline))
static bool rs_gf_invert_matrix(uint8_t* in_mat, uint8_t* out_mat, const int n) {
    int i, j, k;
    uint8_t temp;
    // Set out_mat[] to the identity matrix
    memset(out_mat, 0, n*n);
    for (i = 0; i < n; i++) {
        out_mat[i * n + i] = 1;
    }
    // Inverse
    for (i = 0; i < n; i++) {
        // Check for 0 in pivot element
        if (in_mat[i * n + i] == 0) {
            // Find a row with non-zero in current column and swap
            for (j = i + 1; j < n; j++) {
                if (in_mat[j * n + i]) {
                    break;
                }
            }
            if (j == n) { // Couldn't find means it's singular
                return false;
            }
            for (k = 0; k < n; k++) { // Swap rows i,j
                temp = in_mat[i * n + k];
                in_mat[i * n + k] = in_mat[j * n + k];
                in_mat[j * n + k] = temp;
                temp = out_mat[i * n + k];
                out_mat[i * n + k] = out_mat[j * n + k];
                out_mat[j * n + k] = temp;
            }
        }
        temp = gf_inv(in_mat[i * n + i]); // 1/pivot
        for (j = 0; j < n; j++) { // Scale row i by 1/pivot
            in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
            out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
        }
        // Eliminate column i from every other row (addition in GF(2^8) is XOR).
        for (j = 0; j < n; j++) {
            if (j == i) {
                continue;
            }
            temp = in_mat[j * n + i];
            for (k = 0; k < n; k++) {
                out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
                in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
            }
        }
    }
    return true;
}
+5 -3
View File
@@ -1,7 +1,9 @@
// generated with gf_tables.py
#ifndef __KERNEL__
#include <stdint.h>
#endif
extern const uint8_t rs_gf_exp_table[256] = {
const uint8_t rs_gf_exp_table[256] = {
0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
@@ -20,7 +22,7 @@ extern const uint8_t rs_gf_exp_table[256] = {
0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01,
};
extern const uint8_t rs_gf_log_table[256] = {
const uint8_t rs_gf_log_table[256] = {
0x00, 0xff, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
@@ -39,7 +41,7 @@ extern const uint8_t rs_gf_log_table[256] = {
0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
};
extern const uint8_t rs_gf_inv_table[256] = {
const uint8_t rs_gf_inv_table[256] = {
0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7,
0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2,
0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
+76 -337
View File
@@ -6,108 +6,10 @@
#include <array>
#include "rs.h"
#include "gf.hpp"
// Print a printf-style message to stderr, then abort via SIGABRT (so a core
// dump / stacktrace is produced, unlike a plain exit()).
#define die(...) do { fprintf(stderr, __VA_ARGS__); raise(SIGABRT); } while(false)

// malloc() that never returns nullptr: aborts with message `what` on failure.
static void* malloc_or_die(size_t size, const char* what) {
    void* ptr = malloc(size);
    if (ptr == nullptr) {
        // "%s" prevents any '%' inside `what` from being interpreted as a
        // printf format directive (latent format-string bug in `die(what)`).
        die("%s", what);
    }
    return ptr;
}
// Reed-Solomon coding context for one `parity` configuration. Allocated as a
// single malloc block in rs_new; `matrix` and `expanded_matrix` point into
// the tail of that same allocation.
struct rs {
    uint8_t parity;
    // uint8_t[D*B], in column-major.
    uint8_t* matrix;
    // uint8_t[D*P][32], in column-major. These are the lookup tables
    // to perform multiplication quickly.
    uint8_t* expanded_matrix;
};
// Cache of lazily-created rs contexts, indexed by parity byte (filled in by
// rs_get). Static-duration pointers are zero-initialized, so `= {}` replaces
// the former 256-entry explicit nullptr initializer with identical behavior.
static struct rs* rs_cached[256] = {};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
//
// Fills r->matrix (column-major, D rows x B columns) with an identity block
// over the data columns and a scaled Cauchy block over the parity columns.
static void rs_cauchy_matrix(struct rs* r) {
    int D = rs_data_blocks(r->parity);
    int B = rs_blocks(r->parity);
    uint8_t* matrix = r->matrix;
    memset(matrix, 0, D*B);
    // Identity in the d*d upper half
    for (int i = 0; i < D; i++) {
        matrix[D*i + i] = 1;
    }
    // Fill in the rest using cauchy
    for (int col = D; col < B; col++) {
        for (int row = 0; row < D; row++) {
            matrix[col*D + row] = gf_inv(col ^ row);
        }
    }
    // Scale the columns so that entry (row 0, col) becomes 1 in every parity column
    for (int col = D; col < B; col++) {
        uint8_t factor = gf_inv(matrix[col*D]);
        for (int row = 0; row < D; row++) {
            matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
        }
    }
    // Scale the rows so that the first parity column becomes all 1s
    // (hence parity 0 is the plain XOR of the data blocks)
    for (int row = 1; row < D; row++) {
        uint8_t factor = gf_inv(matrix[D*D + row]);
        for (int col = D; col < B; col++) {
            matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
        }
    }
}
// Allocates and initializes the rs context for `parity`: the struct header,
// the D x B coding matrix, and the per-coefficient 32-byte expanded
// multiplication tables, all carved out of one allocation. Aborts on OOM.
static struct rs* rs_new(uint8_t parity) {
    int B = rs_blocks(parity);
    int D = rs_data_blocks(parity);
    int P = rs_parity_blocks(parity);
    // Layout: [struct rs][matrix: B*D bytes][expanded tables: D*P*32 bytes]
    struct rs* r = (struct rs*)malloc_or_die(sizeof(struct rs) + B*D + D*P*32, "rs_new\n");
    r->parity = parity;
    r->matrix = (uint8_t*)(r + 1);
    r->expanded_matrix = r->matrix + B*D;
    rs_cauchy_matrix(r);
    // Precompute the nibble lookup tables for every parity-matrix coefficient.
    for (int p = 0; p < P; p++) {
        for (int d = 0; d < D; d++) {
            gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
        }
    }
    return r;
}
// Frees a context created by rs_new (everything lives in one malloc block).
static void rs_delete(struct rs* r) {
    free(r);
}
// Thin CPUID wrapper: returns {eax, ebx, ecx, edx} for the given
// leaf/subleaf pair.
static std::array<uint32_t, 4> rs_cpuidex(uint32_t function_id, uint32_t subfunction_id) {
    uint32_t a, b, c, d;
    __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
    return {a, b, c, d};
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
#define rs_malloc malloc
#define rs_free free
// See `valgrind.h`
static uint64_t rs_valgrind_client_request(uint64_t defaultResult, uint64_t reqID, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) {
@@ -129,20 +31,48 @@ static bool rs_detect_valgrind() {
return rs_valgrind_client_request(0, 0x1001, 0, 0, 0, 0, 0);
}
bool rs_has_cpu_level(rs_cpu_level level) {
const auto cpuid7 = (rs_cpuidex(0, 0)[0] >= 7) ? rs_cpuidex(7, 0) : std::array<uint32_t, 4>{0, 0, 0, 0};
switch (level) {
case RS_CPU_SCALAR:
return true;
case RS_CPU_AVX2:
return cpuid7[1] & (1<<5);
case RS_CPU_GFNI:
return cpuid7[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
static inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
}
#include "rs_core.c"
uint8_t rs_parity(struct rs* r) {
return r->parity;
}
static struct rs* rs_cached[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
bool rs_has_cpu_level(rs_cpu_level level) {
return rs_has_cpu_level_core(level);
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
__attribute__((constructor))
void rs_detect_cpu_level() {
if (rs_has_cpu_level(RS_CPU_GFNI)) {
@@ -172,124 +102,46 @@ struct rs* rs_get(uint8_t parity) {
}
struct rs* r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
if (__builtin_expect(r == nullptr, 0)) {
r = rs_new(parity);
r = rs_new_core(parity);
if (r == nullptr) {
die("could not allocate RS data");
}
struct rs* expected = nullptr;
if (!__atomic_compare_exchange_n(&rs_cached[parity], &expected, r, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// somebody else got to it first
rs_delete(r);
rs_delete_core(r);
r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
}
}
return r;
}
uint8_t rs_parity(struct rs* r) {
return r->parity;
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_scalar_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_scalar(D, P, r, size, data, parity);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_avx2_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_avx2(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_single(struct rs* r, uint64_t i, const uint8_t** data, uint8_t** parity) {
parity[0][i] = 0;
for (int d = 0; d < D; d++) {
parity[0][i] ^= data[d][i];
}
for (int p = 1; p < P; p++) {
const uint8_t* factor = &r->expanded_matrix[D*32*p];
parity[p][i] = 0;
for (int d = 0; d < D; d++, factor += 32) {
parity[p][i] ^= gf_mul_expanded(data[d][i], factor);
}
}
}
// Scalar fallback: computes parity for each byte position independently.
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_scalar(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
    // parity = r->matrix * data
    for (uint64_t i = 0; i < size; i++) {
        rs_compute_parity_single<D, P>(r, i, data, parity);
    }
}
// AVX2 path: processes 32 byte positions per iteration using the expanded
// nibble tables (gf_mul_expanded_avx2); the sub-32 remainder falls back to
// the scalar single-byte routine.
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_avx2(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
    __m256i low_nibble_mask = broadcast_u8(0x0f);
    size_t avx_leftover = size % 32;
    size_t avx_size = size-avx_leftover;
    for (uint64_t i = 0; i < avx_size; i += 32) {
        {
            // Parity 0 is the plain XOR of all data blocks.
            __m256i parity_0 = _mm256_setzero_si256();
            for (int d = 0; d < D; d++) {
                parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
            }
            _mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
        }
        for (int p = 1; p < P; p++) {
            __m256i parity_p = _mm256_setzero_si256();
            for (int d = 0; d < D; d++) {
                __m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
                __m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]);
                parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask));
            }
            _mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
        }
    }
    // Tail: positions not covered by the 32-wide vector loop.
    for (uint64_t i = avx_size; i < size; i++) {
        rs_compute_parity_single<D, P>(r, i, data, parity);
    }
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_gfni(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
{
__m256i parity_0 = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
}
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
}
for (int p = 1; p < P; p++) {
__m256i parity_p = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]);
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor));
}
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
}
}
for (uint64_t i = avx_size; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_gfni_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_gfni(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_compute_parity_scalar<D, P>(r, size, data, parity);
rs_compute_parity_scalar_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_AVX2:
rs_compute_parity_avx2<D, P>(r, size, data, parity);
rs_compute_parity_avx2_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_GFNI:
rs_compute_parity_gfni<D, P>(r, size, data, parity);
rs_compute_parity_gfni_tmpl<D, P>(r, size, data, parity);
break;
default:
die("bad cpu_level %d\n", l);
@@ -301,101 +153,32 @@ void rs_compute_parity(struct rs* r, uint64_t size, const uint8_t** data, uint8_
rs_compute_parity_funcs[r->parity](r, size, data, parity);
}
template<int D>
static void rs_recover_matmul_single(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul(have_to_want[j], have[j][i]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_scalar_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_scalar(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_single_expanded(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want_expanded) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_avx2_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_avx2(D, size, have, want, mat);
}
// Scalar recovery contraction: want[i] = XOR over j of have_to_want[j] *
// have[j][i] in GF(2^8). Expands each factor into a 32-byte nibble lookup
// table once up front so the per-byte multiply is two lookups and a XOR.
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_scalar(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
    uint8_t have_to_want_expanded[D*32];
    for (int i = 0; i < D; i++) {
        gf_mul_expand_factor(have_to_want[i], &have_to_want_expanded[i*32]);
    }
    for (size_t i = 0; i < size; i++) {
        rs_recover_matmul_single_expanded<D>(i, have, want, have_to_want_expanded);
    }
}
// AVX2 recovery contraction: same computation as the scalar version, 32 bytes
// per step. The D expanded factor tables are built once and reused across the
// whole buffer; the trailing size % 32 bytes use the scalar helper.
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_avx2(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
    __m256i have_to_want_expanded[D];
    for (int i = 0; i < D; i++) {
        gf_mul_expand_factor(have_to_want[i], (uint8_t*)&have_to_want_expanded[i]);
    }
    __m256i low_nibble_mask = broadcast_u8(0x0f);
    size_t avx_leftover = size % 32;
    size_t avx_size = size-avx_leftover;
    for (uint64_t i = 0; i < avx_size; i += 32) {
        __m256i want_i = _mm256_setzero_si256();
        for (int d = 0; d < D; d++) {
            want_i = _mm256_xor_si256(
                want_i,
                gf_mul_expanded_avx2(
                    _mm256_loadu_si256((const __m256i*)(have[d] + i)),
                    have_to_want_expanded[d],
                    low_nibble_mask
                )
            );
        }
        _mm256_storeu_si256((__m256i*)(want + i), want_i);
    }
    for (uint64_t i = avx_size; i < size; i++) {
        rs_recover_matmul_single_expanded<D>(i, have, want, (const uint8_t*)have_to_want_expanded);
    }
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_gfni(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
__m256i have_to_want_avx[D];
for (int i = 0; i < D; i++) {
have_to_want_avx[i] = broadcast_u8(have_to_want[i]);
}
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
__m256i want_i = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
want_i = _mm256_xor_si256(
want_i,
_mm256_gf2p8mul_epi8(
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
have_to_want_avx[d]
)
);
}
_mm256_storeu_si256((__m256i*)(want + i), want_i);
}
for (uint64_t i = avx_size; i < size; i++) {
rs_recover_matmul_single<D>(i, have, want, have_to_want);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_gfni_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_gfni(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_recover_matmul_scalar<D>(size, have, want, mat);
rs_recover_matmul_scalar_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_AVX2:
rs_recover_matmul_avx2<D>(size, have, want, mat);
rs_recover_matmul_avx2_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_GFNI:
rs_recover_matmul_gfni<D>(size, have, want, mat);
rs_recover_matmul_gfni_tmpl<D>(size, have, want, mat);
break;
default:
die("bad cpu_level %d\n", l);
@@ -403,7 +186,6 @@ static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t*
}
static void (*rs_recover_matmul_funcs[16])(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat);
void rs_recover(
struct rs* r,
uint64_t size,
@@ -412,55 +194,12 @@ void rs_recover(
uint8_t want_block,
uint8_t* want
) {
int D = rs_data_blocks(r->parity);
int B = rs_blocks(r->parity);
// Create some space
uint8_t* scratch = (uint8_t*)malloc(D*D + D*D);
uint8_t* mat_1 = scratch;
uint8_t* mat_2 = scratch + D*D;
// Preliminary checks
for (int i = 0; i < D; i++) {
if (have_blocks[i] >= B) {
die("have_blocks[%d]=%d >= %d\n", i, have_blocks[i], B);
rs_recover_core(
r, size, have_blocks, have, want_block, want,
[](int D, uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_funcs[D](size, have, want, mat);
}
if (have_blocks[i] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", i, have_blocks[i], want_block);
}
if (i > 0 && have_blocks[i] <= have_blocks[i-1]) {
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", i, have_blocks[i], i, have_blocks[i-1]);
}
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
uint8_t* data_to_have = mat_1;
for (int i = 0, have_cursor = 0; i < B; i++) {
if (have_cursor >= D || have_blocks[have_cursor] != i) {
continue;
}
memcpy(data_to_have + have_cursor*D, r->matrix + i*D, D);
have_cursor++;
}
// [DxD] matrix going from what we have to the original data blocks
uint8_t* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
}
data_to_have = nullptr;
// [Dx1] matrix going from the data blocks to the block we want
uint8_t* data_to_want = &r->matrix[want_block*D];
// have_to_want = data_to_want * have_to_data
// [Dx1] matrix going from `blocks` to the block we're into
uint8_t* have_to_want = mat_1;
for (int i = 0; i < D; i++) {
have_to_want[i] = 0;
for (int j = 0; j < D; j++) {
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
}
}
// want = have_to_want * have
rs_recover_matmul_funcs[D](size, have, want, have_to_want);
// We're done.
free(scratch);
);
}
__attribute__((constructor))
+451
View File
@@ -0,0 +1,451 @@
#ifndef __KERNEL__
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
#endif
extern const u8 rs_gf_inv_table[256];
extern const u8 rs_gf_log_table[256];
extern const u8 rs_gf_exp_table[256];
// Multiplicative inverse in GF(2^8), via the precomputed inverse table.
// 0 has no inverse; gf_inv(0) returns whatever the table holds at index 0.
static inline u8 gf_inv(u8 x) {
    return rs_gf_inv_table[x];
}
// Multiplication in GF(2^8) via log/exp tables: log(x*y) = log(x) + log(y)
// reduced modulo 255, with 0 handled separately since log(0) is undefined.
static inline u8 gf_mul(u8 x, u8 y) {
    if (x == 0 || y == 0) {
        return 0;
    }
    int log_sum = rs_gf_log_table[x] + rs_gf_log_table[y];
    if (log_sum >= 255) {
        log_sum -= 255;
    }
    return rs_gf_exp_table[log_sum];
}
// Builds the 32-byte lookup table for multiplication by x in GF(2^8):
// bytes [0,16) hold nibble*x for each low-nibble value, bytes [16,32) hold
// (nibble<<4)*x for each high-nibble value. gf_mul_expanded() XORs the two
// partial products together.
static inline void gf_mul_expand_factor(u8 x, u8* expanded_x) {
    int nibble;
    for (nibble = 0; nibble < 16; nibble++) {
        expanded_x[nibble] = gf_mul(nibble, x);
        expanded_x[16 + nibble] = gf_mul(nibble << 4, x);
    }
}
// Multiplies x by the factor whose 32-byte table was built by
// gf_mul_expand_factor(): look up the low-nibble and high-nibble partial
// products and XOR them (multiplication distributes over XOR in GF(2^8)).
static inline u8 gf_mul_expanded(u8 x, const u8* expanded_y) {
    return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}
// 32-lane version of gf_mul_expanded(): the 32-byte table for y is split into
// its low-nibble half (lane 0) and high-nibble half (lane 1), each broadcast
// to both 128-bit lanes, then used as PSHUFB lookup tables indexed by the low
// and high nibbles of every byte of x; the partial products are XORed.
__attribute__((target("avx2")))
static inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
    __m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
    __m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
    __m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
    __m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
    return _mm256_xor_si256(
        _mm256_shuffle_epi8(expanded_y_lo, x_lo),
        _mm256_shuffle_epi8(expanded_y_hi, x_hi)
    );
}
// The parity byte packs the code geometry: low nibble = number of data
// blocks D, high nibble = number of parity blocks P.
static inline u8 rs_data_blocks_core(u8 parity) {
    return parity & 0x0F;
}
static inline u8 rs_parity_blocks_core(u8 parity) {
    return parity >> 4;
}
// Total number of blocks B = D + P.
static inline u8 rs_blocks_core(u8 parity) {
    return rs_data_blocks_core(parity) + rs_parity_blocks_core(parity);
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// to save have the final operation to be more natural in rs_recover.
//
// Inverts the n*n GF(2^8) matrix in_mat into out_mat. Returns false if the
// matrix is singular. NOTE: in_mat is destroyed (reduced to the identity).
static bool rs_gf_invert_matrix(u8* in_mat, u8* out_mat, const int n) {
    int i, j, k;
    u8 temp;
    // Set out_mat[] to the identity matrix
    memset(out_mat, 0, n*n);
    for (i = 0; i < n; i++) {
        out_mat[i * n + i] = 1;
    }
    // Inverse
    for (i = 0; i < n; i++) {
        // Check for 0 in pivot element
        if (in_mat[i * n + i] == 0) {
            // Find a row with non-zero in current column and swap
            for (j = i + 1; j < n; j++) {
                if (in_mat[j * n + i]) {
                    break;
                }
            }
            if (j == n) { // Couldn't find means it's singular
                return false;
            }
            for (k = 0; k < n; k++) { // Swap rows i,j
                temp = in_mat[i * n + k];
                in_mat[i * n + k] = in_mat[j * n + k];
                in_mat[j * n + k] = temp;
                temp = out_mat[i * n + k];
                out_mat[i * n + k] = out_mat[j * n + k];
                out_mat[j * n + k] = temp;
            }
        }
        temp = gf_inv(in_mat[i * n + i]); // 1/pivot
        for (j = 0; j < n; j++) { // Scale row i by 1/pivot
            in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
            out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
        }
        // Eliminate column i from every other row (addition is XOR in GF(2^8)).
        for (j = 0; j < n; j++) {
            if (j == i) {
                continue;
            }
            temp = in_mat[j * n + i];
            for (k = 0; k < n; k++) {
                out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
                in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
            }
        }
    }
    return true;
}
// Reed-Solomon codec state. Allocated as a single chunk by rs_new_core():
// the struct itself, then matrix, then expanded_matrix.
struct rs {
    // Packed geometry: low nibble = D data blocks, high nibble = P parity blocks.
    u8 parity;
    // u8[D*B], in column-major.
    u8* matrix;
    // u8[D*P][32], in column-major. These are the lookup tables
    // to perform multiplication quickly.
    u8* expanded_matrix;
};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
//
// Fills r->matrix (column-major, D rows, B columns): the first D columns are
// the identity (data maps to itself); the remaining P columns are a Cauchy
// block 1/(col^row), rescaled so that its first row and first column are
// all 1s.
static void rs_cauchy_matrix(struct rs* r) {
    int D = rs_data_blocks_core(r->parity);
    int B = rs_blocks_core(r->parity);
    u8* matrix = r->matrix;
    memset(matrix, 0, D*B);
    // Identity in the d*d upper half
    int i;
    for (i = 0; i < D; i++) {
        matrix[D*i + i] = 1;
    }
    // Fill in the rest using cauchy
    int col, row;
    for (col = D; col < B; col++) {
        for (row = 0; row < D; row++) {
            matrix[col*D + row] = gf_inv(col ^ row);
        }
    }
    // Scale the columns (makes row 0 of every parity column equal to 1)
    for (col = D; col < B; col++) {
        u8 factor = gf_inv(matrix[col*D]);
        for (row = 0; row < D; row++) {
            matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
        }
    }
    // Scale the rows (makes the first parity column all 1s)
    for (row = 1; row < D; row++) {
        u8 factor = gf_inv(matrix[D*D + row]);
        for (col = D; col < B; col++) {
            matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
        }
    }
}
// out must be at least size 4: receives EAX, EBX, ECX, EDX in that order
// for the given CPUID leaf/subleaf.
static void rs_cpuidex(u32 function_id, u32 subfunction_id, u32* out) {
    u32 a, b, c, d;
    __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
    out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
// Whether the current CPU supports the given implementation level.
// Feature bits come from CPUID leaf 7 subleaf 0: EBX bit 5 = AVX2,
// ECX bit 8 = GFNI (per the Intel SDM). GFNI is additionally disabled when
// running under valgrind — presumably it is not emulated there (see
// rs_detect_valgrind).
static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
    u32 _0_0[4];
    rs_cpuidex(0, 0, _0_0);
    u32 _7_0[4];
    if (_0_0[0] >= 7) { // EAX of leaf 0 = highest supported leaf
        rs_cpuidex(7, 0, _7_0);
    } else {
        memset(_7_0, 0, sizeof(_7_0));
    }
    switch (level) {
    case RS_CPU_SCALAR:
        return true;
    case RS_CPU_AVX2:
        return _7_0[1] & (1<<5);
    case RS_CPU_GFNI:
        return _7_0[2] & (1<<8) && !rs_detect_valgrind();
    default:
        die("bad CPU level %d\n", level);
    }
}
/*
 * Computes byte i of every parity block (C macro twin of the C++ template):
 * parity[0] is the plain XOR of the data bytes (the first parity column of
 * the Cauchy matrix is all 1s); the remaining parities use the 32-byte
 * nibble lookup tables in r->expanded_matrix.
 */
#define rs_compute_parity_single(D, P, r, i, data, parity) do { \
    parity[0][i] = 0; \
    int d; \
    int p; \
    for (d = 0; d < D; d++) { \
        parity[0][i] ^= data[d][i]; \
    } \
    for (p = 1; p < P; p++) { \
        const u8* factor = &r->expanded_matrix[D*32*p]; \
        parity[p][i] = 0; \
        for (d = 0; d < D; d++, factor += 32) { \
            parity[p][i] ^= gf_mul_expanded(data[d][i], factor); \
        } \
    } \
} while (0)
/* Scalar fallback: parity = r->matrix * data, one byte position at a time. */
#define rs_compute_parity_scalar(D, P, r, size, data, parity) do { \
    /* parity = r->matrix * data */ \
    u64 i; \
    for (i = 0; i < size; i++) { \
        rs_compute_parity_single(D, P, r, i, data, parity); \
    } \
} while (0)
/*
 * AVX2 parity kernel: 32 bytes per iteration via the nibble-table multiply,
 * scalar tail for the last size % 32 bytes.
 *
 * Fix: the loop counter must be u64 — size/avx_size are 64-bit and the
 * sibling rs_compute_parity_gfni macro already uses u64; a u32 counter
 * truncates (and never terminates the tail loop) for buffers >= 4GiB.
 */
#define rs_compute_parity_avx2(D, P, r, size, data, parity) do { \
    __m256i low_nibble_mask = broadcast_u8(0x0f); \
    size_t avx_leftover = size % 32; \
    size_t avx_size = size-avx_leftover; \
    u64 i; \
    int p; \
    int d; \
    for (i = 0; i < avx_size; i += 32) { \
        { \
            __m256i parity_0 = _mm256_setzero_si256(); \
            for (d = 0; d < D; d++) { \
                parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
            } \
            _mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
        } \
        for (p = 1; p < P; p++) { \
            __m256i parity_p = _mm256_setzero_si256(); \
            for (d = 0; d < D; d++) { \
                __m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
                __m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]); \
                parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask)); \
            } \
            _mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
        } \
    } \
    for (i = avx_size; i < size; i++) { \
        rs_compute_parity_single(D, P, r, i, data, parity); \
    } \
} while (0)
/*
 * GFNI parity kernel: like the AVX2 variant, but multiplies with the
 * GF2P8MUL instruction directly on the raw matrix coefficients, so no
 * expanded lookup tables are needed.
 */
#define rs_compute_parity_gfni(D, P, r, size, data, parity) do { \
    size_t avx_leftover = size % 32; \
    size_t avx_size = size-avx_leftover; \
    u64 i; \
    int p; \
    int d; \
    for (i = 0; i < avx_size; i += 32) { \
        { \
            __m256i parity_0 = _mm256_setzero_si256(); \
            for (d = 0; d < D; d++) { \
                parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
            } \
            _mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
        } \
        for (p = 1; p < P; p++) { \
            __m256i parity_p = _mm256_setzero_si256(); \
            for (d = 0; d < D; d++) { \
                __m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
                __m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]); \
                parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor)); \
            } \
            _mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
        } \
    } \
    for (i = avx_size; i < size; i++) { \
        rs_compute_parity_single(D, P, r, i, data, parity); \
    } \
} while (0)
/* want[i] = XOR over j of have_to_want[j] * have[j][i] in GF(2^8). */
#define rs_recover_matmul_single(D, i, have, want, have_to_want) do { \
    want[i] = 0; \
    int j; \
    for (j = 0; j < D; j++) { \
        want[i] ^= gf_mul(have_to_want[j], have[j][i]); \
    } \
} while (0)
/*
 * Same as rs_recover_matmul_single, but using factors pre-expanded into
 * 32-byte nibble lookup tables (see gf_mul_expand_factor).
 */
#define rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded) do { \
    want[i] = 0; \
    int j; \
    for (j = 0; j < D; j++) { \
        want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]); \
    } \
} while(0)
/*
 * Scalar recovery contraction: expands the D factors into nibble lookup
 * tables once, then computes want[i] byte by byte.
 *
 * Fix: the original definition ended with "} while (0) \" — the stray
 * trailing backslash spliced the following line (the #define of
 * rs_recover_matmul_avx2) into this macro's replacement list, corrupting
 * this macro and preventing the avx2 macro from being defined at all.
 */
#define rs_recover_matmul_scalar(D, size, have, want, have_to_want) do { \
    u8 have_to_want_expanded[D*32]; \
    int d; \
    for (d = 0; d < D; d++) { \
        gf_mul_expand_factor(have_to_want[d], &have_to_want_expanded[d*32]); \
    } \
    size_t i; \
    for (i = 0; i < size; i++) { \
        rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded); \
    } \
} while (0)
/*
 * AVX2 recovery contraction: expands the D factors into nibble lookup
 * tables, contracts 32 bytes per iteration, scalar tail for size % 32.
 */
#define rs_recover_matmul_avx2(D, size, have, want, have_to_want) do { \
    __m256i have_to_want_expanded[D]; \
    int d; \
    for (d = 0; d < D; d++) { \
        gf_mul_expand_factor(have_to_want[d], ((u8*)&have_to_want_expanded[d])); \
    } \
    __m256i low_nibble_mask = broadcast_u8(0x0f); \
    size_t avx_leftover = size % 32; \
    size_t avx_size = size-avx_leftover; \
    u64 i; \
    for (i = 0; i < avx_size; i += 32) { \
        __m256i want_i = _mm256_setzero_si256(); \
        for (d = 0; d < D; d++) { \
            want_i = _mm256_xor_si256( \
                want_i, \
                gf_mul_expanded_avx2( \
                    _mm256_loadu_si256((const __m256i*)(have[d] + i)), \
                    have_to_want_expanded[d], \
                    low_nibble_mask \
                ) \
            ); \
        } \
        _mm256_storeu_si256((__m256i*)(want + i), want_i); \
    } \
    for (i = avx_size; i < size; i++) { \
        rs_recover_matmul_single_expanded(D, i, have, want, ((const u8*)have_to_want_expanded)); \
    } \
} while(0)
/*
 * GFNI recovery contraction: broadcasts each raw factor byte and multiplies
 * with GF2P8MUL directly — no expanded tables needed. Scalar tail for the
 * last size % 32 bytes.
 */
#define rs_recover_matmul_gfni(D, size, have, want, have_to_want) do { \
    __m256i have_to_want_avx[D]; \
    int d; \
    for (d = 0; d < D; d++) { \
        have_to_want_avx[d] = broadcast_u8(have_to_want[d]); \
    } \
    size_t avx_leftover = size % 32; \
    size_t avx_size = size-avx_leftover; \
    u64 i; \
    for (i = 0; i < avx_size; i += 32) { \
        __m256i want_i = _mm256_setzero_si256(); \
        for (d = 0; d < D; d++) { \
            want_i = _mm256_xor_si256( \
                want_i, \
                _mm256_gf2p8mul_epi8( \
                    _mm256_loadu_si256((const __m256i*)(have[d] + i)), \
                    have_to_want_avx[d] \
                ) \
            ); \
        } \
        _mm256_storeu_si256((__m256i*)(want + i), want_i); \
    } \
    for (i = avx_size; i < size; i++) { \
        rs_recover_matmul_single(D, i, have, want, have_to_want); \
    } \
} while (0)
// Allocates and initializes a codec for the given packed parity byte
// (low nibble = D data blocks, high nibble = P parity blocks).
// Single allocation laid out as: struct rs, then matrix (u8[B*D]), then
// expanded_matrix (u8[D*P*32], one 32-byte multiply table per parity
// coefficient). Returns NULL on allocation failure.
static struct rs* rs_new_core(u8 parity) {
    int B = rs_blocks_core(parity);
    int D = rs_data_blocks_core(parity);
    int P = rs_parity_blocks_core(parity);
    struct rs* r = (struct rs*)rs_malloc(sizeof(struct rs) + B*D + D*P*32);
    if (r == NULL) { return NULL; }
    r->parity = parity;
    r->matrix = (u8*)(r + 1);
    r->expanded_matrix = r->matrix + B*D;
    rs_cauchy_matrix(r);
    int p;
    int d;
    // Precompute the nibble lookup tables for every parity coefficient.
    for (p = 0; p < P; p++) {
        for (d = 0; d < D; d++) {
            gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
        }
    }
    return r;
}
// Frees a codec created by rs_new_core() (single allocation, so one free).
static void rs_delete_core(struct rs* r) {
    rs_free(r);
}
// Core of block recovery: given D distinct surviving blocks (have_blocks
// strictly increasing, none equal to want_block), derives the [Dx1] GF(2^8)
// coefficient vector that maps them to want_block and applies it through
// recover_func. die()s on malformed arguments or a singular matrix.
static void rs_recover_core(
    struct rs* r,
    u64 size,
    const u8* have_blocks,
    const u8** have,
    u8 want_block,
    u8* want,
    void (*recover_func)(int D, u64 size, const u8** have, u8* want, const u8* mat)
) {
    int D = rs_data_blocks_core(r->parity);
    int B = rs_blocks_core(r->parity);
    // Create some space
    // NOTE(review): scratch is used without a NULL check — confirm rs_malloc
    // cannot fail here (or die() on failure).
    u8* scratch = (u8*)rs_malloc(D*D + D*D);
    u8* mat_1 = scratch;
    u8* mat_2 = scratch + D*D;
    // Preliminary checks: block ids in range, distinct from the wanted one,
    // and strictly sorted (which also guarantees uniqueness).
    int i, j, d, b;
    for (d = 0; d < D; d++) {
        if (have_blocks[d] >= B) {
            die("have_blocks[%d]=%d >= %d\n", d, have_blocks[d], B);
        }
        if (have_blocks[d] == want_block) {
            die("have_blocks[%d]=%d == want_block=%d\n", d, have_blocks[d], want_block);
        }
        if (d > 0 && have_blocks[d] <= have_blocks[d-1]) {
            die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", d, have_blocks[d], d, have_blocks[d-1]);
        }
    }
    // below in the dimensionality annotation we paper over transposes
    // [DxD] matrix going from the data blocks to the blocks we currently have
    u8* data_to_have = mat_1;
    int have_cursor;
    for (b = 0, have_cursor = 0; b < B; b++) {
        if (have_cursor >= D || have_blocks[have_cursor] != b) {
            continue;
        }
        memcpy(data_to_have + have_cursor*D, r->matrix + b*D, D);
        have_cursor++;
    }
    // [DxD] matrix going from what we have to the original data blocks
    u8* have_to_data = mat_2;
    if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
        die("unexpected singular matrix\n");
    }
    data_to_have = NULL; // destroyed by the inversion; don't reuse
    // [Dx1] matrix going from the data blocks to the block we want
    u8* data_to_want = &r->matrix[want_block*D];
    // have_to_want = data_to_want * have_to_data
    // [Dx1] matrix going from `blocks` to the block we're into
    u8* have_to_want = mat_1;
    for (i = 0; i < D; i++) {
        have_to_want[i] = 0;
        for (j = 0; j < D; j++) {
            have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
        }
    }
    // want = have_to_want * have
    recover_func(D, size, have, want, have_to_want);
    // We're done.
    rs_free(scratch);
}
+36 -4
View File
@@ -156,6 +156,8 @@
//
// TODO fill in results
static constexpr uint64_t EGGSFS_PAGE_SIZE = 4096;
static auto wrappedSnapshot(rocksdb::DB* db) {
auto deleter = [db](const rocksdb::Snapshot* snapshot) {
db->ReleaseSnapshot(snapshot);
@@ -726,7 +728,11 @@ struct ShardDBImpl {
beginKey().setOffset(req.byteOffset);
{
std::unique_ptr<rocksdb::Iterator> it(_db->NewIterator(options, _spansCf));
for (it->SeekForPrev(beginKey.toSlice()); it->Valid(); it->Next()) {
for (
it->SeekForPrev(beginKey.toSlice());
it->Valid() && (req.limit == 0 || resp.spans.els.size() < req.limit);
it->Next()
) {
auto key = ExternalValue<SpanKey>::FromSlice(it->key());
if (key().fileId() != req.fileId) {
break;
@@ -1258,6 +1264,11 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "inline span has bad storage class %s", req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
uint32_t expectedCrc = crc32c(0, req.body.data(), req.body.size());
expectedCrc = crc32c_zero_extend(expectedCrc, req.size - req.body.size());
@@ -1293,10 +1304,14 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "bad storage class %s for blocks span", (int)req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0 || req.cellSize%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s or cellSize=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, req.cellSize, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
if (!_checkSpanBody(req)) {
return EggsError::BAD_SPAN_BODY;
}
// start filling in entry
entry.fileId = req.fileId;
entry.byteOffset = req.byteOffset;
@@ -1888,6 +1903,7 @@ struct ShardDBImpl {
return EggsError::MISMATCHING_TARGET;
}
if (edgeBody().creationTime() != creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edgeBody().creationTime(), creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edgeBody().targetIdWithLocked().extra()) { // locked
@@ -1992,6 +2008,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edge().locked()) {
@@ -2046,6 +2063,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (!edge().locked()) {
@@ -2732,7 +2750,13 @@ struct ShardDBImpl {
const auto& cache = _blockServicesCache.at(blockServiceId.u64);
auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf));
return proof.proof == expectedProof;
bool good = proof.proof == expectedProof;
if (!good) {
LOG_DEBUG(_env, "bad block write proof, expected %s, got %s", BincodeFixedBytes<8>(expectedProof), proof);
}
return good;
}
std::array<uint8_t, 8> _blockEraseCertificate(uint32_t blockSize, const BlockBody block, const AES128Key& secretKey) {
@@ -3335,8 +3359,16 @@ DirectoryInfo defaultDirectoryInfo() {
hddBlocks.storageClass = storageClassByName("HDD");
addSegment(BLOCK_POLICY_TAG, blockPolicy);
// Span policy: up to 10MiB: RS(4,4), up to 100MiB (max span size): RS(10,4)
// Span policy:
// * up to 64KiB: RS(1,4). This mirroring span simplifies things in the kernel (so that we
// we never have cell sizes that are not multiple of span sizes). We still set things
// up so that we can lose 4 copies and still be fine.
// * up to 10MiB: RS(4,4).
// * up to 100MiB (max span size): RS(10,4).
SpanPolicy spanPolicy;
auto& tinySpans = spanPolicy.entries.els.emplace_back();
tinySpans.maxSize = 1 << 16;
tinySpans.parity = Parity(1, 4);
auto& smallSpans = spanPolicy.entries.els.emplace_back();
smallSpans.maxSize = 10 << 20;
smallSpans.parity = Parity(4, 4);
+436 -65
View File
@@ -310,28 +310,381 @@ func generateGo(errors []string, shardReqResps []reqRespType, cdcReqResps []reqR
return out.Bytes()
}
func generateC(errors []string, shardReqResps []reqRespType, cdcReqResps []reqRespType, extras []reflect.Type) []byte {
out := new(bytes.Buffer)
// generateKmodMsgKind emits one `#define EGGSFS_<what>_<NAME> 0x<kind>` per
// request/response pair, for the kernel module headers.
func generateKmodMsgKind(w io.Writer, what string, reqResps []reqRespType) {
	for _, reqResp := range reqResps {
		fmt.Fprintf(w, "#define EGGSFS_%s_%s 0x%X\n", what, reqRespEnum(reqResp), reqResp.kind)
	}
	fmt.Fprintf(w, "\n")
}
fmt.Fprintln(out, "// Automatically generated with go run bincodegen.")
fmt.Fprintln(out, "// Run `go generate ./...` from the go/ directory to regenerate it.")
// kmodFieldName converts a Go-style camelCase/CamelCase field name into the
// snake_case spelling used in the generated kernel headers.
func kmodFieldName(s string) string {
	snakeRe := regexp.MustCompile(`(.)([A-Z])`)
	snake := snakeRe.ReplaceAllString(s, "${1}_${2}")
	return strings.ToLower(snake)
}
func kmodStructName(typ reflect.Type) string {
re := regexp.MustCompile(`(.)([A-Z])`)
s := strings.ToLower(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
return fmt.Sprintf("eggsfs_%s", s)
}
func kmodStaticSizeName(typ reflect.Type) string {
re := regexp.MustCompile(`(.)([A-Z])`)
s := strings.ToUpper(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
return fmt.Sprintf("EGGSFS_%s_SIZE", s)
}
func kmodMaxSizeName(typ reflect.Type) string {
re := regexp.MustCompile(`(.)([A-Z])`)
s := strings.ToUpper(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
return fmt.Sprintf("EGGSFS_%s_MAX_SIZE", s)
}
// generateKmodStructOpaque emits a forward declaration of the opaque marker
// struct eggsfs_<type>_<suffix> (used as the start/end cursor types by the
// generated getters/putters). Panics if typ is not a struct.
func generateKmodStructOpaque(w io.Writer, typ reflect.Type, suffix string) {
	if typ.Kind() != reflect.Struct {
		panic(fmt.Errorf("unexpected non-struct type %v", typ))
	}
	fmt.Fprintf(w, "struct %s_%s;\n", kmodStructName(typ), suffix)
}
// generateKmodStruct emits a one-field C struct eggsfs_<type>_<fldName>
// whose single member is the given field declaration. Panics if typ is not
// a struct.
func generateKmodStruct(w io.Writer, typ reflect.Type, fldName string, fld string) {
	if typ.Kind() != reflect.Struct {
		panic(fmt.Errorf("unexpected non-struct type %v", typ))
	}
	fmt.Fprintf(w, "struct %s_%s { %s; };\n", kmodStructName(typ), fldName, fld)
}
// generateKmodSize emits a size #define for typ: EGGSFS_<T>_SIZE when every
// field has a fixed wire size, otherwise EGGSFS_<T>_MAX_SIZE when an upper
// bound is known (nothing if neither). Computed sizes are recorded in
// staticSizes/maxSizes so later types can embed earlier ones. A running size
// of -1 means "unknown/unbounded" and is sticky.
//
// Fix: the max-size lookup for nested struct fields read staticSizes with a
// kmodMaxSizeName key; MAX_SIZE entries are only ever stored in maxSizes, so
// the lookup always missed and any struct embedding a bounded variable-size
// struct was wrongly treated as unbounded.
func generateKmodSize(staticSizes map[string]int, maxSizes map[string]int, w io.Writer, typ reflect.Type) {
	if typ.Kind() != reflect.Struct {
		panic(fmt.Errorf("unexpected non-struct type %v", typ))
	}
	// Adds `update` to *size, with -1 acting as a sticky "unknown" marker
	// on either side.
	updateSize := func(size *int, update int) {
		if update < 0 {
			*size = -1
		} else if *size >= 0 {
			*size += update
		}
	}
	staticSize := 0
	maxSize := 0
	for i := 0; i < typ.NumField(); i++ {
		fld := typ.Field(i)
		fldK := fld.Type.Kind()
		if fldK == reflect.Struct {
			fldStaticSize, found := staticSizes[kmodStaticSizeName(fld.Type)]
			if found {
				updateSize(&staticSize, fldStaticSize)
				updateSize(&maxSize, fldStaticSize)
			} else {
				updateSize(&staticSize, -1)
				// MAX_SIZE entries live in maxSizes, not staticSizes.
				fldMaxSize, found := maxSizes[kmodMaxSizeName(fld.Type)]
				if found {
					updateSize(&maxSize, fldMaxSize)
				} else {
					updateSize(&maxSize, -1)
				}
			}
		} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 || fldK == reflect.Array {
			updateSize(&staticSize, int(fld.Type.Size()))
			updateSize(&maxSize, int(fld.Type.Size()))
		} else if fldK == reflect.Slice || fldK == reflect.String {
			updateSize(&staticSize, -1)
			elem := sliceTypeElem(fld.Type)
			if elem.Kind() == reflect.Uint8 {
				updateSize(&maxSize, 256) // len + body
			} else {
				updateSize(&maxSize, -1)
			}
		} else {
			panic(fmt.Errorf("unexpected kind %v", fldK))
		}
	}
	if staticSize >= 0 {
		sizeName := kmodStaticSizeName(typ)
		staticSizes[sizeName] = staticSize
		fmt.Fprintf(w, "#define %s %v\n", sizeName, staticSize)
	} else if maxSize >= 0 {
		sizeName := kmodMaxSizeName(typ)
		maxSizes[sizeName] = maxSize
		fmt.Fprintf(w, "#define %s %v\n", sizeName, maxSize)
	}
}
// kmodType maps a Go field type to the C type name used in the generated
// kernel headers: scalars map to u8/u16/u32/u64/bool, byte slices and
// strings map to eggsfs_bytes*, and other slices map to the C type of their
// element (not a pointer to it).
// NOTE(review): the struct case emits "<name>*" without a leading "struct"
// keyword, unlike the "struct %s_..." forms emitted elsewhere — confirm this
// branch is never hit by callers, or that downstream adds the keyword.
func kmodType(t reflect.Type) string {
	switch t.Kind() {
	case reflect.Uint8:
		return "u8"
	case reflect.Uint16:
		return "u16"
	case reflect.Uint32:
		return "u32"
	case reflect.Uint64:
		return "u64"
	case reflect.Bool:
		return "bool"
	case reflect.Struct:
		return fmt.Sprintf("%s*", kmodStructName(t))
	case reflect.Slice, reflect.String:
		elem := sliceTypeElem(t)
		if elem.Kind() == reflect.Uint8 {
			return "eggsfs_bytes*"
		} else {
			return kmodType(elem)
		}
	default:
		panic(fmt.Sprintf("unsupported type with kind %v", t.Kind()))
	}
}
// generateKmodGet emits the C-side decoder for typ: one
// eggsfs_<type>_get_<field> macro plus inline helper per field, reading from
// an eggsfs_bincode_get_ctx. Opaque <type>_start/<type>_end cursor structs
// are threaded from each getter to the next so that fields can only be
// decoded in declaration order (a type error otherwise). 4- and 8-byte
// arrays are decoded as big-endian u32/u64; other scalars little-endian.
// Decoding errors set ctx->err and later getters become no-ops.
// (The staticSizes parameter is currently unused in this function.)
func generateKmodGet(staticSizes map[string]int, h io.Writer, typ reflect.Type) {
	generateKmodStructOpaque(h, typ, "start")
	fmt.Fprintf(h, "#define %s_get_start(ctx, start) struct %s_start* start = NULL\n\n", kmodStructName(typ), kmodStructName(typ))
	prevType := fmt.Sprintf("struct %s_start*", kmodStructName(typ))
	for i := 0; i < typ.NumField(); i++ {
		fld := typ.Field(i)
		fldK := fld.Type.Kind()
		fldTyp := fld.Type
		endianness := "le"
		if fldK == reflect.Array {
			// Fixed-size arrays are wire-encoded as big-endian integers.
			sz := fld.Type.Size()
			endianness = "be"
			if sz == 4 {
				fldK = reflect.Uint32
				fldTyp = reflect.TypeOf(uint32(0))
			} else if sz == 8 {
				fldK = reflect.Uint64
				fldTyp = reflect.TypeOf(uint64(0))
			} else {
				panic(fmt.Errorf("bad array size %v", fld.Type.Size()))
			}
		}
		fldName := kmodFieldName(fld.Name)
		fldKTyp := kmodType(fldTyp)
		if fldK == reflect.Struct {
			// Nested struct: hand the cursor over to the nested type's getters.
			fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
			fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }; \\\n", prevType)
			fmt.Fprintf(h, " struct %s_start* next = NULL\n\n", kmodStructName(fldTyp))
			prevType = fmt.Sprintf("struct %s_end*", kmodStructName(fldTyp))
		} else if fldK == reflect.Slice || fldK == reflect.String {
			elemTyp := sliceTypeElem(fldTyp)
			if elemTyp.Kind() == reflect.Uint8 {
				// Byte string: u8 length prefix, then that many bytes
				// (borrowed, not copied, from the receive buffer).
				generateKmodStruct(h, typ, fldName, "struct eggsfs_bincode_bytes str")
				nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
				fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
				fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
				fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < 1)) { \n")
				fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
				fmt.Fprintf(h, " } else { \n")
				fmt.Fprintf(h, " next->str.len = *(u8*)(ctx->buf); \n")
				fmt.Fprintf(h, " ctx->buf++; \n")
				fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < next->str.len)) { \n")
				fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
				fmt.Fprintf(h, " } else { \n")
				fmt.Fprintf(h, " next->str.buf = ctx->buf; \n")
				fmt.Fprintf(h, " ctx->buf += next->str.len; \n")
				fmt.Fprintf(h, " } \n")
				fmt.Fprintf(h, " } \n")
				fmt.Fprintf(h, " }\n")
				fmt.Fprintf(h, "}\n")
				fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
				prevType = nextType
			} else {
				// List: u16 little-endian element count; elements are decoded
				// by the caller's loop using the element type's getters.
				generateKmodStruct(h, typ, fldName, "u16 len")
				nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
				fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
				fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
				fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < 2)) { \n")
				fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
				fmt.Fprintf(h, " } else { \n")
				fmt.Fprintf(h, " next->len = get_unaligned_le16(ctx->buf); \n")
				fmt.Fprintf(h, " ctx->buf += 2; \n")
				fmt.Fprintf(h, " } \n")
				fmt.Fprintf(h, " } else { \n")
				fmt.Fprintf(h, " next->len = 0; \n") // so that loops can work even in the case of errors
				fmt.Fprintf(h, " } \n")
				fmt.Fprintf(h, "} \n")
				fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
				prevType = nextType
			}
		} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 {
			// Fixed-width scalar: bounds-check, then unaligned load with the
			// endianness chosen above.
			generateKmodStruct(h, typ, fldName, fmt.Sprintf("%s x", fldKTyp))
			nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
			fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
			fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
			fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < %d)) { \n", fldTyp.Size())
			fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
			fmt.Fprintf(h, " } else { \n")
			switch fldK {
			case reflect.Bool:
				fmt.Fprintf(h, " next->x = *(bool*)(ctx->buf); \n")
			case reflect.Uint8:
				fmt.Fprintf(h, " next->x = *(u8*)(ctx->buf); \n")
			case reflect.Uint16:
				fmt.Fprintf(h, " next->x = get_unaligned_%s16(ctx->buf); \n", endianness)
			case reflect.Uint32:
				fmt.Fprintf(h, " next->x = get_unaligned_%s32(ctx->buf); \n", endianness)
			case reflect.Uint64:
				fmt.Fprintf(h, " next->x = get_unaligned_%s64(ctx->buf); \n", endianness)
			}
			fmt.Fprintf(h, " ctx->buf += %d; \n", fldTyp.Size())
			fmt.Fprintf(h, " } \n")
			fmt.Fprintf(h, " }\n")
			fmt.Fprintf(h, "}\n")
			fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
			fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
			fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
			prevType = nextType
		} else {
			panic(fmt.Errorf("unexpected kind %v", fldK))
		}
	}
	// End marker + finish helper: decoding must have consumed the whole buffer.
	generateKmodStructOpaque(h, typ, "end")
	fmt.Fprintf(h, "#define %s_get_end(ctx, prev, next) \\\n", kmodStructName(typ))
	fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }\\\n", prevType)
	fmt.Fprintf(h, " struct %s_end* next = NULL\n\n", kmodStructName(typ))
	fmt.Fprintf(h, "static inline void %s_get_finish(struct eggsfs_bincode_get_ctx* ctx, struct %s_end* end) {\n", kmodStructName(typ), kmodStructName(typ))
	fmt.Fprintf(h, " if (unlikely(ctx->buf != ctx->end)) { \n")
	fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
	fmt.Fprintf(h, " }\n")
	fmt.Fprintf(h, "}\n\n")
}
// generateKmodPut emits, for the Go struct type `typ`, the C "put"
// (serialization) helpers used by the kernel module to encode a bincode
// message into a buffer.
//
// The emitted API mirrors the "get" side: a `<name>_put_start` macro opens
// the sequence, one `<name>_put_<field>` macro per struct field appends that
// field at the cursor, and `<name>_put_end` closes it. Each step threads a
// typed marker pointer (`struct <name>_<field>*`) from one macro to the
// next, so skipping or reordering a field is a compile-time error in the
// generated C code. Bounds are enforced with BUG_ON rather than a runtime
// error path, unlike the get side which sets ctx->err.
//
// staticSizes is unused in this function; it appears to be kept so the
// signature matches the sibling generators (generateKmodSize,
// generateKmodGet) — NOTE(review): confirm before removing.
func generateKmodPut(staticSizes map[string]int, h io.Writer, typ reflect.Type) {
	fmt.Fprintf(h, "#define %s_put_start(ctx, start) struct %s_start* start = NULL\n\n", kmodStructName(typ), kmodStructName(typ))
	// prevType tracks the marker type the *previous* field's macro produced;
	// it is interpolated into the next helper's signature to chain the fields.
	prevType := fmt.Sprintf("struct %s_start*", kmodStructName(typ))
	for i := 0; i < typ.NumField(); i++ {
		fld := typ.Field(i)
		fldK := fld.Type.Kind()
		fldTyp := fld.Type
		endianness := "le" // bincode integers are little-endian by default
		if fldK == reflect.Array {
			// Fixed-size arrays are re-encoded as big-endian integers of the
			// same width; only 4- and 8-byte arrays are supported.
			sz := fld.Type.Size()
			endianness = "be"
			if sz == 4 {
				fldK = reflect.Uint32
				fldTyp = reflect.TypeOf(uint32(0))
			} else if sz == 8 {
				fldK = reflect.Uint64
				fldTyp = reflect.TypeOf(uint64(0))
			} else {
				panic(fmt.Errorf("bad array size %v", fld.Type.Size()))
			}
		}
		fldName := kmodFieldName(fld.Name)
		fldKTyp := kmodType(fldTyp)
		if fldK == reflect.Struct {
			// Nested struct fields are currently not emitted on the put path
			// (the get-path equivalent is kept below for reference). Note
			// that prevType is deliberately left unchanged here.
			// fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
			// fmt.Fprintf(h, "  { %s* __dummy __attribute__((unused)) = &(prev); }; \\\n", prevType)
			// fmt.Fprintf(h, "  struct %s_start* next = NULL\n\n", kmodStructName(fldTyp))
			// prevType = fmt.Sprintf("struct %s_end*", kmodStructName(fldTyp))
		} else if fldK == reflect.Slice || fldK == reflect.String {
			elemTyp := sliceTypeElem(fldTyp)
			nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
			if elemTyp.Kind() == reflect.Uint8 {
				// []byte / string: wire format is a u8 length prefix followed
				// by the raw bytes (hence the 0..255 length check).
				fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, const char* str, int str_len) {\n", kmodStructName(typ), fldName, prevType, nextType)
				fmt.Fprintf(h, " next = NULL;\n")
				fmt.Fprintf(h, " BUG_ON(str_len < 0 || str_len > 255);\n")
				fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < (1 + str_len));\n")
				fmt.Fprintf(h, " *(u8*)(ctx->cursor) = str_len;\n")
				fmt.Fprintf(h, " memcpy(ctx->cursor + 1, str, str_len);\n")
				fmt.Fprintf(h, " ctx->cursor += 1 + str_len;\n")
				fmt.Fprintf(h, "}\n")
				fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, str, str_len) \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), str, str_len)\n\n", kmodStructName(typ), fldName)
			} else {
				// Other slices: only the u16 element count is written here;
				// the caller is expected to emit each element afterwards.
				fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, int len) {\n", kmodStructName(typ), fldName, prevType, nextType)
				fmt.Fprintf(h, " next = NULL;\n")
				fmt.Fprintf(h, " BUG_ON(len < 0 || len >= 1<<16);\n")
				fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < 2);\n")
				fmt.Fprintf(h, " put_unaligned_le16(len, ctx->cursor);\n")
				fmt.Fprintf(h, " ctx->cursor += 2;\n")
				fmt.Fprintf(h, "}\n")
				fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, len) \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
				fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), len)\n\n", kmodStructName(typ), fldName)
			}
			prevType = nextType
		} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 {
			// Scalar fields: single fixed-width store at the cursor.
			// endianness is "be" only for the array case rewritten above.
			nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
			fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, %s x) {\n", kmodStructName(typ), fldName, prevType, nextType, fldKTyp)
			fmt.Fprintf(h, " next = NULL;\n")
			fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < %d);\n", fldTyp.Size())
			switch fldK {
			case reflect.Bool:
				fmt.Fprintf(h, " *(bool*)(ctx->cursor) = x;\n")
			case reflect.Uint8:
				fmt.Fprintf(h, " *(u8*)(ctx->cursor) = x;\n")
			case reflect.Uint16:
				fmt.Fprintf(h, " put_unaligned_%s16(x, ctx->cursor);\n", endianness)
			case reflect.Uint32:
				fmt.Fprintf(h, " put_unaligned_%s32(x, ctx->cursor);\n", endianness)
			case reflect.Uint64:
				fmt.Fprintf(h, " put_unaligned_%s64(x, ctx->cursor);\n", endianness)
			}
			fmt.Fprintf(h, " ctx->cursor += %d;\n", fldTyp.Size())
			fmt.Fprintf(h, "}\n")
			fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, x) \\\n", kmodStructName(typ), fldName)
			fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
			fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), x)\n\n", kmodStructName(typ), fldName)
			prevType = nextType
		} else {
			panic(fmt.Errorf("unexpected kind %v", fldK))
		}
	}
	// put_end consumes the final marker (the __dummy assignment type-checks
	// that every field was emitted) and defines an unused end marker.
	fmt.Fprintf(h, "#define %s_put_end(ctx, prev, next) \\\n", kmodStructName(typ))
	fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }\\\n", prevType)
	fmt.Fprintf(h, " struct %s_end* next __attribute__((unused)) = NULL\n\n", kmodStructName(typ))
}
func generateKmod(errors []string, shardReqResps []reqRespType, cdcReqResps []reqRespType, shuckleReqResps []reqRespType, blocksReqResps []reqRespType, extras []reflect.Type) []byte {
hOut := new(bytes.Buffer)
fmt.Fprintln(hOut, "// Automatically generated with go run bincodegen.")
fmt.Fprintln(hOut, "// Run `go generate ./...` from the go/ directory to regenerate it.")
fmt.Fprintln(hOut)
for i, err := range errors {
fmt.Fprintf(out, "#define EGGSFS_ERR_%s %d\n", err, errCodeOffset+i)
fmt.Fprintf(hOut, "#define EGGSFS_ERR_%s %d\n", err, errCodeOffset+i)
}
fmt.Fprintf(out, "\n")
fmt.Fprintf(hOut, "\n")
for _, reqResp := range shardReqResps {
fmt.Fprintf(out, "#define EGGSFS_META_%s 0x%X\n", reqRespEnum(reqResp), reqResp.kind)
generateKmodMsgKind(hOut, "SHARD", shardReqResps)
generateKmodMsgKind(hOut, "CDC", cdcReqResps)
generateKmodMsgKind(hOut, "SHUCKLE", shuckleReqResps)
generateKmodMsgKind(hOut, "BLOCKS", blocksReqResps)
fmt.Fprintln(hOut)
staticSizes := make(map[string]int)
maxSizes := make(map[string]int)
for _, typ := range extras {
generateKmodSize(staticSizes, maxSizes, hOut, typ)
generateKmodGet(staticSizes, hOut, typ)
generateKmodPut(staticSizes, hOut, typ)
}
fmt.Fprintf(out, "\n")
for _, reqResp := range cdcReqResps {
fmt.Fprintf(out, "#define EGGSFS_CDC_%s 0x%X\n", reqRespEnum(reqResp), reqResp.kind)
generateReqResps := func(reqResps []reqRespType) {
for _, reqResp := range reqResps {
generateKmodSize(staticSizes, maxSizes, hOut, reqResp.req)
generateKmodGet(staticSizes, hOut, reqResp.req)
generateKmodPut(staticSizes, hOut, reqResp.req)
generateKmodSize(staticSizes, maxSizes, hOut, reqResp.resp)
generateKmodGet(staticSizes, hOut, reqResp.resp)
generateKmodPut(staticSizes, hOut, reqResp.resp)
}
}
fmt.Fprintf(out, "\n")
return out.Bytes()
generateReqResps(shardReqResps)
generateReqResps(cdcReqResps)
generateReqResps(shuckleReqResps)
generateReqResps(blocksReqResps)
return hOut.Bytes()
}
func cppType(t reflect.Type) string {
@@ -549,6 +902,8 @@ func generateCppSingle(hpp io.Writer, cpp io.Writer, t reflect.Type) {
fld := t.Field(i)
if cppType(fld.Type) == "uint8_t" {
fmt.Fprintf(cpp, " << \"%s=\" << (int)x.%s", fld.Name, cppFieldName(fld))
} else if fld.Type.Kind() == reflect.String {
fmt.Fprintf(cpp, " << \"%s=\" << GoLangQuotedStringFmt(x.%s.data(), x.%s.size())", fld.Name, cppFieldName(fld), cppFieldName(fld))
} else {
fmt.Fprintf(cpp, " << \"%s=\" << x.%s", fld.Name, cppFieldName(fld))
}
@@ -899,7 +1254,7 @@ func main() {
"BLOCK_TOO_BIG",
}
shardReqResps := []reqRespType{
kernelShardReqResps := []reqRespType{
{
0x01,
reflect.TypeOf(msgs.LookupReq{}),
@@ -910,11 +1265,6 @@ func main() {
reflect.TypeOf(msgs.StatFileReq{}),
reflect.TypeOf(msgs.StatFileResp{}),
},
{
0x03,
reflect.TypeOf(msgs.StatTransientFileReq{}),
reflect.TypeOf(msgs.StatTransientFileResp{}),
},
{
0x04,
reflect.TypeOf(msgs.StatDirectoryReq{}),
@@ -961,25 +1311,33 @@ func main() {
reflect.TypeOf(msgs.SameDirectoryRenameResp{}),
},
{
0x0D,
reflect.TypeOf(msgs.SetDirectoryInfoReq{}),
reflect.TypeOf(msgs.SetDirectoryInfoResp{}),
0x10,
reflect.TypeOf(msgs.AddInlineSpanReq{}),
reflect.TypeOf(msgs.AddInlineSpanResp{}),
},
{
0x0E,
reflect.TypeOf(msgs.SnapshotLookupReq{}),
reflect.TypeOf(msgs.SnapshotLookupResp{}),
},
}
shardReqResps := append(kernelShardReqResps, []reqRespType{
{
0x03,
reflect.TypeOf(msgs.StatTransientFileReq{}),
reflect.TypeOf(msgs.StatTransientFileResp{}),
},
{
0x0D,
reflect.TypeOf(msgs.SetDirectoryInfoReq{}),
reflect.TypeOf(msgs.SetDirectoryInfoResp{}),
},
{
0x0F,
reflect.TypeOf(msgs.ExpireTransientFileReq{}),
reflect.TypeOf(msgs.ExpireTransientFileResp{}),
},
{
0x10,
reflect.TypeOf(msgs.AddInlineSpanReq{}),
reflect.TypeOf(msgs.AddInlineSpanResp{}),
},
// PRIVATE OPERATIONS -- These are safe operations, but we don't want the FS client itself
// to perform them. TODO make privileged?
{
@@ -1078,9 +1436,9 @@ func main() {
reflect.TypeOf(msgs.MakeFileTransientReq{}),
reflect.TypeOf(msgs.MakeFileTransientResp{}),
},
}
}...)
cdcReqResps := []reqRespType{
kernelCdcReqResps := []reqRespType{
{
0x01,
reflect.TypeOf(msgs.MakeDirectoryReq{}),
@@ -1101,6 +1459,9 @@ func main() {
reflect.TypeOf(msgs.RenameDirectoryReq{}),
reflect.TypeOf(msgs.RenameDirectoryResp{}),
},
}
cdcReqResps := append(kernelCdcReqResps, []reqRespType{
{
0x05,
reflect.TypeOf(msgs.HardUnlinkDirectoryReq{}),
@@ -1111,9 +1472,22 @@ func main() {
reflect.TypeOf(msgs.CrossShardHardUnlinkFileReq{}),
reflect.TypeOf(msgs.CrossShardHardUnlinkFileResp{}),
},
}...)
kernelShuckleReqResps := []reqRespType{
{
0x03,
reflect.TypeOf(msgs.ShardsReq{}),
reflect.TypeOf(msgs.ShardsResp{}),
},
{
0x07,
reflect.TypeOf(msgs.CdcReq{}),
reflect.TypeOf(msgs.CdcResp{}),
},
}
shuckleReqResps := []reqRespType{
shuckleReqResps := append(kernelShuckleReqResps, []reqRespType{
{
0x01,
reflect.TypeOf(msgs.BlockServicesForShardReq{}),
@@ -1124,11 +1498,6 @@ func main() {
reflect.TypeOf(msgs.RegisterBlockServicesReq{}),
reflect.TypeOf(msgs.RegisterBlockServicesResp{}),
},
{
0x03,
reflect.TypeOf(msgs.ShardsReq{}),
reflect.TypeOf(msgs.ShardsResp{}),
},
{
0x04,
reflect.TypeOf(msgs.RegisterShardReq{}),
@@ -1144,19 +1513,9 @@ func main() {
reflect.TypeOf(msgs.RegisterCdcReq{}),
reflect.TypeOf(msgs.RegisterCdcResp{}),
},
{
0x07,
reflect.TypeOf(msgs.CdcReq{}),
reflect.TypeOf(msgs.CdcResp{}),
},
}
}...)
blocksReqResps := []reqRespType{
{
0x01,
reflect.TypeOf(msgs.EraseBlockReq{}),
reflect.TypeOf(msgs.EraseBlockResp{}),
},
kernelBlocksReqResps := []reqRespType{
{
0x02,
reflect.TypeOf(msgs.FetchBlockReq{}),
@@ -1174,32 +1533,43 @@ func main() {
},
}
extras := []reflect.Type{
reflect.TypeOf(msgs.TransientFile{}),
reflect.TypeOf(msgs.FetchedBlock{}),
blocksReqResps := append(kernelBlocksReqResps, []reqRespType{
{
0x01,
reflect.TypeOf(msgs.EraseBlockReq{}),
reflect.TypeOf(msgs.EraseBlockResp{}),
},
}...)
kernelExtras := []reflect.Type{
reflect.TypeOf(msgs.DirectoryInfoEntry{}),
reflect.TypeOf(msgs.DirectoryInfo{}),
reflect.TypeOf(msgs.CurrentEdge{}),
reflect.TypeOf(msgs.Edge{}),
reflect.TypeOf(msgs.BlockInfo{}),
reflect.TypeOf(msgs.BlockProof{}),
reflect.TypeOf(msgs.BlockService{}),
reflect.TypeOf(msgs.ShardInfo{}),
reflect.TypeOf(msgs.BlockPolicyEntry{}),
reflect.TypeOf(msgs.SpanPolicyEntry{}),
reflect.TypeOf(msgs.StripePolicy{}),
reflect.TypeOf(msgs.SnapshotLookupEdge{}),
reflect.TypeOf(msgs.FetchedBlock{}),
reflect.TypeOf(msgs.FetchedSpanHeader{}),
reflect.TypeOf(msgs.FetchedInlineSpan{}),
reflect.TypeOf(msgs.FetchedBlocksSpan{}),
reflect.TypeOf(msgs.BlockProof{}),
reflect.TypeOf(msgs.DirectoryInfoEntry{}),
reflect.TypeOf(msgs.DirectoryInfo{}),
reflect.TypeOf(msgs.BlockService{}),
}
extras := append(kernelExtras, []reflect.Type{
reflect.TypeOf(msgs.TransientFile{}),
reflect.TypeOf(msgs.Edge{}),
reflect.TypeOf(msgs.FullReadDirCursor{}),
reflect.TypeOf(msgs.EntryNewBlockInfo{}),
reflect.TypeOf(msgs.SnapshotLookupEdge{}),
reflect.TypeOf(msgs.BlockServiceInfo{}),
reflect.TypeOf(msgs.ShardInfo{}),
reflect.TypeOf(msgs.RegisterShardInfo{}),
reflect.TypeOf(msgs.SpanPolicyEntry{}),
reflect.TypeOf(msgs.SpanPolicy{}),
reflect.TypeOf(msgs.BlockPolicyEntry{}),
reflect.TypeOf(msgs.BlockPolicy{}),
reflect.TypeOf(msgs.SnapshotPolicy{}),
reflect.TypeOf(msgs.StripePolicy{}),
}
}...)
goCode := generateGo(errors, shardReqResps, cdcReqResps, shuckleReqResps, blocksReqResps, extras)
goOutFileName := fmt.Sprintf("%s/msgs_bincode.go", cwd)
@@ -1210,13 +1580,14 @@ func main() {
defer goOutFile.Close()
goOutFile.Write(goCode)
cOutFilename := fmt.Sprintf("%s/../../c/eggs_msgs.h", cwd)
cOutFile, err := os.Create(cOutFilename)
kmodHOutFilename := fmt.Sprintf("%s/../../kmod/bincodegen.h", cwd)
kmodHOutFile, err := os.Create(kmodHOutFilename)
if err != nil {
panic(err)
}
defer cOutFile.Close()
cOutFile.Write(generateC(errors, shardReqResps, cdcReqResps, extras))
defer kmodHOutFile.Close()
kmodHBytes := generateKmod(errors, kernelShardReqResps, kernelCdcReqResps, kernelShuckleReqResps, kernelBlocksReqResps, kernelExtras)
kmodHOutFile.Write(kmodHBytes)
hppOutFilename := fmt.Sprintf("%s/../../cpp/core/MsgsGen.hpp", cwd)
hppOutFile, err := os.Create(hppOutFilename)
+9 -18
View File
@@ -149,13 +149,8 @@ func registerPeriodically(
}
func checkEraseCertificate(log *lib.Logger, blockServiceId msgs.BlockServiceId, cipher cipher.Block, req *msgs.EraseBlockReq) msgs.ErrCode {
// compute mac
w := bytes.NewBuffer([]byte{})
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
w.Write([]byte{'e'})
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
expectedMac := lib.CBCMAC(cipher, w.Bytes())
if expectedMac != req.Certificate {
expectedMac, good := lib.CheckBlockEraseCertificate(blockServiceId, cipher, req)
if !good {
log.RaiseAlert(fmt.Errorf("bad MAC, got %v, expected %v", req.Certificate, expectedMac))
return msgs.BAD_CERTIFICATE
}
@@ -221,17 +216,10 @@ func sendFetchBlock(log *lib.Logger, blockServiceId msgs.BlockServiceId, basePat
}
func checkWriteCertificate(log *lib.Logger, cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) msgs.ErrCode {
// Compute mac
w := bytes.NewBuffer([]byte{})
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
w.Write([]byte{'w'})
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
binary.Write(w, binary.LittleEndian, uint32(req.Crc))
binary.Write(w, binary.LittleEndian, uint32(req.Size))
expectedMac := lib.CBCMAC(cipher, w.Bytes())
if expectedMac != req.Certificate {
log.Debug("mac computed for %v %v %v %v", uint64(blockServiceId), uint64(req.BlockId), uint32(req.Crc), uint32(req.Size))
log.RaiseAlert(fmt.Errorf("bad MAC for %v, got %v, expected %v", w.Bytes(), req.Certificate, expectedMac))
expectedMac, good := lib.CheckBlockWriteCertificate(cipher, blockServiceId, req)
if !good {
log.Debug("mac computed for %v %v %v %v", blockServiceId, req.BlockId, req.Crc, req.Size)
log.RaiseAlert(fmt.Errorf("bad MAC, got %v, expected %v", req.Certificate, expectedMac))
return msgs.BAD_CERTIFICATE
}
return 0
@@ -349,6 +337,7 @@ NextRequest:
continue
}
log.Debug("servicing request of type %T from %v", req, conn.RemoteAddr())
log.Trace("req %+v", req)
switch whichReq := req.(type) {
case *msgs.EraseBlockReq:
if err := checkEraseCertificate(log, blockServiceId, blockService.cipher, whichReq); err != 0 {
@@ -647,6 +636,7 @@ func main() {
defer func() { handleRecover(log, terminateChan, recover()) }()
for {
conn, err := listener1.Accept()
log.Trace("new conn %+v", conn)
if err != nil {
terminateChan <- err
return
@@ -662,6 +652,7 @@ func main() {
defer func() { handleRecover(log, terminateChan, recover()) }()
for {
conn, err := listener2.Accept()
log.Trace("new conn %+v", conn)
if err != nil {
terminateChan <- err
return
+41
View File
@@ -1,15 +1,18 @@
package main
import (
"crypto/aes"
"encoding/binary"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"xtx/eggsfs/crc32c"
"xtx/eggsfs/lib"
"xtx/eggsfs/msgs"
)
@@ -341,6 +344,44 @@ func main() {
run: cpOutofRun,
}
blockReqCmd := flag.NewFlagSet("write-block-req", flag.ExitOnError)
blockReqBlockId := blockReqCmd.Uint64("b", 0, "Block id")
blockReqBlockService := blockReqCmd.Uint64("bs", 0, "Block service")
blockReqFile := blockReqCmd.String("file", "", "")
blockReqRun := func() {
resp, err := lib.ShuckleRequest(log, *shuckleAddress, &msgs.AllBlockServicesReq{})
if err != nil {
panic(err)
}
blockServices := resp.(*msgs.AllBlockServicesResp)
var blockServiceInfo msgs.BlockServiceInfo
for _, bsInfo := range blockServices.BlockServices {
if bsInfo.Id == msgs.BlockServiceId(*blockReqBlockService) {
blockServiceInfo = bsInfo
break
}
}
cipher, err := aes.NewCipher(blockServiceInfo.SecretKey[:])
if err != nil {
panic(err)
}
fileContents, err := ioutil.ReadFile(*blockReqFile)
if err != nil {
panic(err)
}
req := msgs.WriteBlockReq{
BlockId: msgs.BlockId(*blockReqBlockId),
Crc: msgs.Crc(crc32c.Sum(0, fileContents)),
Size: uint32(len(fileContents)),
}
req.Certificate = lib.BlockWriteCertificate(cipher, blockServiceInfo.Id, &req)
log.Info("request: %+v", req)
}
commands["write-block-req"] = commandSpec{
flags: blockReqCmd,
run: blockReqRun,
}
flag.Parse()
if flag.NArg() < 1 {
+64 -47
View File
@@ -153,6 +153,50 @@ type eggsNode struct {
id msgs.InodeId
}
// getattr fills `out` with the attributes of inode `id`.
//
// Directories: a StatDirectoryReq is issued but its response is not used to
// populate `out` beyond existence/permission checking — size and times stay
// at their zero values. NOTE(review): confirm this is intentional and not a
// missing assignment from StatDirectoryResp.
//
// Files: size and mtime are served from fileStatCache when present; on a
// cache miss a StatFileReq is issued and the result cached (no eviction or
// expiry is visible here). The cached mtime is an offset added to
// msgs.EGGS_EPOCH and split into the sec/nsec pair FUSE expects (the /1e9
// split implies a nanosecond-resolution value — TODO confirm). Ctime is
// reported equal to mtime since only one timestamp is available.
//
// Returns 0 on success or the errno from the shard request.
func getattr(id msgs.InodeId, out *fuse.Attr) syscall.Errno {
	log.Debug("getattr inode=%v", id)
	out.Ino = uint64(id)
	out.Mode = inodeTypeToMode(id.Type())
	if id.Type() == msgs.DIRECTORY {
		resp := msgs.StatDirectoryResp{}
		if err := shardRequest(id.Shard(), &msgs.StatDirectoryReq{Id: id}, &resp); err != 0 {
			return err
		}
	} else {
		// Fast path: read-lock only; upgrade to a write lock just to fill
		// the cache on a miss.
		fileStatCacheMu.RLock()
		cached, found := fileStatCache[id]
		fileStatCacheMu.RUnlock()
		if !found {
			resp := msgs.StatFileResp{}
			if err := shardRequest(id.Shard(), &msgs.StatFileReq{Id: id}, &resp); err != 0 {
				return err
			}
			cached.mtime = resp.Mtime
			cached.size = resp.Size
			fileStatCacheMu.Lock()
			fileStatCache[id] = cached
			fileStatCacheMu.Unlock()
		}
		log.Debug("getattr size=%v", cached.size)
		out.Size = cached.size
		mtime := msgs.EGGS_EPOCH + uint64(cached.mtime)
		mtimesec := mtime / 1000000000
		mtimens := uint32(mtime % 1000000000)
		out.Ctime = mtimesec
		out.Ctimensec = mtimens
		out.Mtime = mtimesec
		out.Mtimensec = mtimens
	}
	return 0
}
// Getattr implements the go-fuse getattr operation for this node by
// delegating to the package-level getattr helper (shared with Lookup, which
// uses it to fill entry attributes).
func (n *eggsNode) Getattr(ctx context.Context, f fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
	return getattr(n.id, &out.Attr)
}
func (n *eggsNode) Lookup(
ctx context.Context, name string, out *fuse.EntryOut,
) (*fs.Inode, syscall.Errno) {
@@ -172,6 +216,9 @@ func (n *eggsNode) Lookup(
default:
panic(fmt.Errorf("bad type %v", resp.TargetId.Type()))
}
if err := getattr(resp.TargetId, &out.Attr); err != 0 {
return nil, err
}
return n.NewInode(ctx, &eggsNode{id: resp.TargetId}, fs.StableAttr{Ino: uint64(resp.TargetId), Mode: mode}), 0
}
@@ -316,61 +363,29 @@ func (n *eggsNode) Mkdir(
return n.NewInode(ctx, &eggsNode{id: resp.Id}, fs.StableAttr{Ino: uint64(resp.Id), Mode: syscall.S_IFDIR}), 0
}
func (n *eggsNode) Getattr(ctx context.Context, f fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
log.Debug("getattr inode=%v", n.id)
out.Ino = uint64(n.id)
out.Mode = inodeTypeToMode(n.id.Type())
if n.id.Type() == msgs.DIRECTORY {
resp := msgs.StatDirectoryResp{}
if err := shardRequest(n.id.Shard(), &msgs.StatDirectoryReq{Id: n.id}, &resp); err != 0 {
return err
}
} else {
fileStatCacheMu.RLock()
cached, found := fileStatCache[n.id]
fileStatCacheMu.RUnlock()
if !found {
resp := msgs.StatFileResp{}
if err := shardRequest(n.id.Shard(), &msgs.StatFileReq{Id: n.id}, &resp); err != 0 {
return err
}
cached.mtime = resp.Mtime
cached.size = resp.Size
fileStatCacheMu.Lock()
fileStatCache[n.id] = cached
fileStatCacheMu.Unlock()
}
out.Size = cached.size
mtime := msgs.EGGS_EPOCH + uint64(cached.mtime)
mtimesec := mtime / 1000000000
mtimens := uint32(mtime % 1000000000)
out.Ctime = mtimesec
out.Ctimensec = mtimens
out.Mtime = mtimesec
out.Mtimensec = mtimens
}
return 0
}
func (f *transientFile) writeSpan() syscall.Errno {
if f.data == nil {
return 0 // happens when dup is called on file
}
spanSize := uint32(len(*f.data))
maxSize := int(f.spanPolicy.Entries[len(f.spanPolicy.Entries)-1].MaxSize)
boundary := maxSize
if len(*f.data) < maxSize {
boundary = len(*f.data)
}
spanData := (*f.data)[:boundary]
leftover := append([]byte{}, (*f.data)[boundary:]...)
spanSize := uint32(len(spanData))
if spanSize == 0 {
return 0
}
var err error
*f.data, err = client.CreateSpan(log, []msgs.BlockServiceId{}, f.spanPolicy, f.blockPolicy, f.stripePolicy, f.id, f.cookie, f.written, spanSize, *f.data)
*f.data, err = client.CreateSpan(log, []msgs.BlockServiceId{}, f.spanPolicy, f.blockPolicy, f.stripePolicy, f.id, f.cookie, f.written, spanSize, spanData)
if err != nil {
f.valid = false
return eggsErrToErrno(err)
}
f.written += uint64(spanSize)
*f.data = (*f.data)[:0]
*f.data = append((*f.data)[:0], leftover...)
return 0
}
@@ -514,8 +529,8 @@ type openFile struct {
id msgs.InodeId
size uint64 // total size of file
spanReader lib.TaintableReadCloser // the span we're currently reading from
readOffset int64 // the offset we're currently reading at, in the span reader above
spanReader lib.PuttableReadCloser // the span we're currently reading from
readOffset int64 // the offset we're currently reading at, in the span reader above
// We store the last successful span resp, to avoid re-issuing it if we can
spans *msgs.FileSpansResp
@@ -619,7 +634,6 @@ func (of *openFile) readInternal(dest []byte, off int64) (int64, syscall.Errno)
if off != of.readOffset {
log.Debug("mismatching offset (%v vs %v), will reset span", off, of.readOffset)
if of.spanReader != nil {
of.spanReader.Taint()
of.spanReader.Close()
of.spanReader = nil
}
@@ -650,7 +664,6 @@ func (of *openFile) readInternal(dest []byte, off int64) (int64, syscall.Errno)
func (of *openFile) Flush(ctx context.Context) syscall.Errno {
if of.spanReader != nil {
of.spanReader.Taint()
of.spanReader.Close()
}
return 0
@@ -865,7 +878,11 @@ func main() {
root := eggsNode{
id: msgs.ROOT_DIR_INODE_ID,
}
server, err := fs.Mount(mountPoint, &root, &fs.Options{Logger: logger})
fuseOptions := &fs.Options{
Logger: logger,
}
// fuseOptions.Debug = *trace
server, err := fs.Mount(mountPoint, &root, fuseOptions)
if err != nil {
fmt.Fprintf(os.Stderr, "Could not mount: %v", err)
os.Exit(1)
+18 -4
View File
@@ -34,6 +34,7 @@ func main() {
shuckleHttpPort := flag.Uint("shuckle-http-port", 10000, "")
startingPort := flag.Uint("start-port", 10002, "The services will be assigned port in this order, CDC, shard_000, ..., shard_255, bs_0, ..., bs_n. If 0, ports will be chosen randomly.")
repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).")
binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.")
flag.Parse()
noRunawayArgs()
@@ -92,10 +93,23 @@ func main() {
}
log := lib.NewLogger(level, logOut)
fmt.Printf("building shard/cdc/blockservice/shuckle\n")
cppExes := managedprocess.BuildCppExes(log, *repoDir, *buildType)
goExes := managedprocess.BuildGoExes(log, *repoDir)
var cppExes *managedprocess.CppExes
var goExes *managedprocess.GoExes
if *binariesDir != "" {
cppExes = &managedprocess.CppExes{
ShardExe: path.Join(*binariesDir, "eggsshard"),
CDCExe: path.Join(*binariesDir, "eggscdc"),
}
goExes = &managedprocess.GoExes{
ShuckleExe: path.Join(*binariesDir, "eggsshuckle"),
BlocksExe: path.Join(*binariesDir, "eggsblocks"),
FuseExe: path.Join(*binariesDir, "eggsfuse"),
}
} else {
fmt.Printf("building shard/cdc/blockservice/shuckle\n")
cppExes = managedprocess.BuildCppExes(log, *repoDir, *buildType)
goExes = managedprocess.BuildGoExes(log, *repoDir)
}
terminateChan := make(chan any, 1)
+34 -29
View File
@@ -164,35 +164,40 @@ func handleRequest(log *lib.Logger, s *state, conn *net.TCPConn) {
conn.SetLinger(0) // poor man error handling for now
defer conn.Close()
req, err := lib.ReadShuckleRequest(log, conn)
if err != nil {
log.RaiseAlert(fmt.Errorf("could not decode request: %w", err))
return
}
log.Debug("handling request %T %+v", req, req)
var resp msgs.ShuckleResponse
for {
req, err := lib.ReadShuckleRequest(log, conn)
if err == io.EOF {
return
}
if err != nil {
log.RaiseAlert(fmt.Errorf("could not decode request: %w", err))
return
}
log.Debug("handling request %T %+v", req, req)
var resp msgs.ShuckleResponse
switch whichReq := req.(type) {
case *msgs.BlockServicesForShardReq:
resp = handleBlockServicesForShard(log, s, conn, whichReq)
case *msgs.RegisterBlockServicesReq:
resp = handleRegisterBlockServices(log, s, conn, whichReq)
case *msgs.ShardsReq:
resp = handleShards(log, s, conn, whichReq)
case *msgs.RegisterShardReq:
resp = handleRegisterShard(log, s, conn, whichReq)
case *msgs.AllBlockServicesReq:
resp = handleAllBlockServicesReq(log, s, conn, whichReq)
case *msgs.CdcReq:
resp = handleCdcReq(log, s, conn, whichReq)
case *msgs.RegisterCdcReq:
resp = handleRegisterCdcReq(log, s, conn, whichReq)
default:
log.RaiseAlert(fmt.Errorf("bad req type %T", req))
}
log.Debug("sending back response %T", resp)
if err := lib.WriteShuckleResponse(log, conn, resp); err != nil {
log.RaiseAlert(fmt.Errorf("could not send response: %w", err))
switch whichReq := req.(type) {
case *msgs.BlockServicesForShardReq:
resp = handleBlockServicesForShard(log, s, conn, whichReq)
case *msgs.RegisterBlockServicesReq:
resp = handleRegisterBlockServices(log, s, conn, whichReq)
case *msgs.ShardsReq:
resp = handleShards(log, s, conn, whichReq)
case *msgs.RegisterShardReq:
resp = handleRegisterShard(log, s, conn, whichReq)
case *msgs.AllBlockServicesReq:
resp = handleAllBlockServicesReq(log, s, conn, whichReq)
case *msgs.CdcReq:
resp = handleCdcReq(log, s, conn, whichReq)
case *msgs.RegisterCdcReq:
resp = handleRegisterCdcReq(log, s, conn, whichReq)
default:
log.RaiseAlert(fmt.Errorf("bad req type %T", req))
}
log.Debug("sending back response %T", resp)
if err := lib.WriteShuckleResponse(log, conn, resp); err != nil {
log.RaiseAlert(fmt.Errorf("could not send response: %w", err))
}
}
}
@@ -436,7 +441,7 @@ func handleIndex(ll *lib.Logger, state *state, w http.ResponseWriter, r *http.Re
for _, bs := range state.blockServices {
data.BlockServices = append(data.BlockServices, indexBlockService{
Id: bs.Id,
Addr1: fmt.Sprintf("%v:%v", net.IP(bs.Ip1[:]), bs.Port2),
Addr1: fmt.Sprintf("%v:%v", net.IP(bs.Ip1[:]), bs.Port1),
Addr2: fmt.Sprintf("%v:%v", net.IP(bs.Ip2[:]), bs.Port2),
StorageClass: bs.StorageClass,
FailureDomain: string(bs.FailureDomain[:bytes.Index(bs.FailureDomain[:], []byte{0})]),
+236 -61
View File
@@ -5,12 +5,13 @@ import (
"bytes"
"fmt"
"io"
"io/ioutil"
"math"
"math/rand"
"os"
"path"
"runtime/debug"
"sort"
"strconv"
"strings"
"xtx/eggsfs/lib"
"xtx/eggsfs/msgs"
"xtx/eggsfs/wyhash"
@@ -25,6 +26,7 @@ type fsTestOpts struct {
inlineFileProb float64
maxFileSize int
spanSize int
checkThreads int
}
type fsTestHarness[Id comparable] interface {
@@ -35,14 +37,14 @@ type fsTestHarness[Id comparable] interface {
checkFileData(log *lib.Logger, id Id, size uint64, dataSeed uint64)
// files, directories
readDirectory(log *lib.Logger, dir Id) ([]string, []string)
removeFile(log *lib.Logger, dir Id, name string)
removeDirectory(log *lib.Logger, dir Id, name string)
}
type apiFsTestHarness struct {
client *lib.Client
dirInfoCache *lib.DirInfoCache
// buffer we use both to create and to read files.
fileContentsBuf []byte
readBufPool *lib.ReadSpanBufPool
readBufPool *lib.ReadSpanBufPool
}
func (c *apiFsTestHarness) createDirectory(log *lib.Logger, owner msgs.InodeId, name string) (id msgs.InodeId, creationTime msgs.EggsTime) {
@@ -106,7 +108,7 @@ func (c *apiFsTestHarness) rename(
func (c *apiFsTestHarness) createFile(
log *lib.Logger, owner msgs.InodeId, spanSize uint32, name string, size uint64, dataSeed uint64,
) (msgs.InodeId, msgs.EggsTime) {
return createFile(log, c.client, c.dirInfoCache, owner, spanSize, name, size, dataSeed, &c.fileContentsBuf)
return createFile(log, c.client, c.dirInfoCache, owner, spanSize, name, size, dataSeed, c.readBufPool)
}
func (c *apiFsTestHarness) readDirectory(log *lib.Logger, dir msgs.InodeId) (files []string, dirs []string) {
@@ -121,21 +123,21 @@ func (c *apiFsTestHarness) readDirectory(log *lib.Logger, dir msgs.InodeId) (fil
return files, dirs
}
func checkFileData(actualData []byte, expectedData []byte) {
func checkFileData(id any, from int, to int, actualData []byte, expectedData []byte) {
if !bytes.Equal(actualData, expectedData) {
dir, err := os.MkdirTemp("", "eggs-fstest-files.")
if err != nil {
panic(fmt.Errorf("mismatching data, could not create temp directory"))
panic(fmt.Errorf("mismatching data (%v,%v) for file %v, could not create temp directory", from, to, id))
}
expectedPath := path.Join(dir, "expected")
actualPath := path.Join(dir, "actual")
if err := os.WriteFile(expectedPath, expectedData, 0644); err != nil {
panic(fmt.Errorf("mismatching data, could not create data file"))
panic(fmt.Errorf("mismatching data (%v,%v), could not create data file", from, to))
}
if err := os.WriteFile(actualPath, actualData, 0644); err != nil {
panic(fmt.Errorf("mismatching data, could not create data file"))
panic(fmt.Errorf("mismatching data (%v,%v), could not create data file", from, to))
}
panic(fmt.Errorf("mismatching data, expected data is in %v, found data is in %v", expectedPath, actualPath))
panic(fmt.Errorf("mismatching data (%v,%v), expected data is in %v, found data is in %v", from, to, expectedPath, actualPath))
}
}
@@ -156,19 +158,39 @@ func ensureLen(buf []byte, l int) []byte {
}
func (c *apiFsTestHarness) checkFileData(log *lib.Logger, id msgs.InodeId, size uint64, dataSeed uint64) {
fileData := readFile(log, c.readBufPool, c.client, id, &c.fileContentsBuf)
fullSize := int(size)
c.fileContentsBuf = ensureLen(c.fileContentsBuf, fullSize)
expectedData := c.fileContentsBuf[:int(size)]
wyhash.New(dataSeed).Read(expectedData[:int(size)])
checkFileData(fileData, expectedData)
actualData := c.readBufPool.Get(int(size))
defer c.readBufPool.Put(actualData)
*actualData = readFile(log, c.readBufPool, c.client, id, *actualData)
expectedData := c.readBufPool.Get(int(size))
defer c.readBufPool.Put(expectedData)
wyhash.New(dataSeed).Read(*expectedData)
checkFileData(id, 0, int(size), *actualData, *expectedData)
}
func (c *apiFsTestHarness) removeFile(log *lib.Logger, ownerId msgs.InodeId, name string) {
lookupResp := msgs.LookupResp{}
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.LookupReq{DirId: ownerId, Name: name}, &lookupResp); err != nil {
panic(err)
}
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.SoftUnlinkFileReq{OwnerId: ownerId, FileId: lookupResp.TargetId, Name: name, CreationTime: lookupResp.CreationTime}, &msgs.SoftUnlinkFileResp{}); err != nil {
panic(err)
}
}
func (c *apiFsTestHarness) removeDirectory(log *lib.Logger, ownerId msgs.InodeId, name string) {
lookupResp := msgs.LookupResp{}
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.LookupReq{DirId: ownerId, Name: name}, &lookupResp); err != nil {
panic(err)
}
if err := c.client.CDCRequest(log, &msgs.SoftUnlinkDirectoryReq{OwnerId: ownerId, TargetId: lookupResp.TargetId, Name: name, CreationTime: lookupResp.CreationTime}, &msgs.SoftUnlinkDirectoryResp{}); err != nil {
panic(err)
}
}
var _ = (fsTestHarness[msgs.InodeId])((*apiFsTestHarness)(nil))
type posixFsTestHarness struct {
expectedDataBuf []byte
actualDataBuf []byte
bufPool *lib.ReadSpanBufPool
}
func (*posixFsTestHarness) createDirectory(log *lib.Logger, owner string, name string) (fullPath string, creationTime msgs.EggsTime) {
@@ -203,11 +225,34 @@ func (*posixFsTestHarness) rename(
func (c *posixFsTestHarness) createFile(
log *lib.Logger, dirFullPath string, spanSize uint32, name string, size uint64, dataSeed uint64,
) (fileFullPath string, t msgs.EggsTime) {
c.actualDataBuf = ensureLen(c.actualDataBuf, int(size))
wyhash.New(dataSeed).Read(c.actualDataBuf[:size])
actualDataBuf := c.bufPool.Get(int(size))
defer c.bufPool.Put(actualDataBuf)
rand := wyhash.New(dataSeed)
rand.Read(*actualDataBuf)
fileFullPath = path.Join(dirFullPath, name)
log.LogStack(1, lib.DEBUG, "posix create file %v", fileFullPath)
if err := os.WriteFile(fileFullPath, c.actualDataBuf, 0644); err != nil {
f, err := os.Create(fileFullPath)
if err != nil {
panic(err)
}
log.LogStack(1, lib.DEBUG, "posix create file %v (%v size)", fileFullPath, size)
if size > 0 {
// write in randomly sized chunks
chunks := int(rand.Uint32()%10) + 1
offsets := make([]int, chunks+1)
offsets[0] = 0
for i := 1; i < chunks; i++ {
offsets[i] = int(rand.Uint64() % size)
}
offsets[chunks] = int(size)
sort.Ints(offsets)
for i := 0; i < chunks; i++ {
log.Trace("writing from %v to %v", offsets[i], offsets[i+1])
if _, err := f.Write((*actualDataBuf)[offsets[i]:offsets[i+1]]); err != nil {
panic(err)
}
}
}
if err := f.Close(); err != nil {
panic(err)
}
return fileFullPath, 0
@@ -215,12 +260,12 @@ func (c *posixFsTestHarness) createFile(
func (c *posixFsTestHarness) readDirectory(log *lib.Logger, dirFullPath string) (files []string, dirs []string) {
log.LogStack(1, lib.DEBUG, "posix readdir for %v", dirFullPath)
fileInfo, err := ioutil.ReadDir(dirFullPath)
fileInfo, err := os.ReadDir(dirFullPath)
if err != nil {
panic(err)
}
for _, fi := range fileInfo {
if fi.Mode().IsDir() {
if fi.IsDir() {
dirs = append(dirs, fi.Name())
} else {
files = append(files, fi.Name())
@@ -230,21 +275,21 @@ func (c *posixFsTestHarness) readDirectory(log *lib.Logger, dirFullPath string)
}
func (c *posixFsTestHarness) checkFileData(log *lib.Logger, fullFilePath string, size uint64, dataSeed uint64) {
log.Debug("checking data for file %v", fullFilePath)
fullSize := int(size)
c.expectedDataBuf = ensureLen(c.expectedDataBuf, fullSize)
wyhash.New(dataSeed).Read(c.expectedDataBuf[:int(size)])
c.actualDataBuf = ensureLen(c.actualDataBuf, fullSize)
expectedData := c.bufPool.Get(fullSize)
defer c.bufPool.Put(expectedData)
rand := wyhash.New(dataSeed)
rand.Read(*expectedData)
actualData := c.bufPool.Get(fullSize)
defer c.bufPool.Put(actualData)
f, err := os.Open(fullFilePath)
if err != nil {
panic(err)
}
defer f.Close()
// First we check the whole thing
if _, err := io.ReadFull(f, c.actualDataBuf); err != nil {
panic(err)
}
checkFileData(c.actualDataBuf, c.expectedDataBuf)
// then we start doing random reads around
log.Debug("checking for file %v of expected len %v", fullFilePath, fullSize)
// First do some random reads, hopefully stimulating span caches in some interesting way
if fullSize > 1 {
for i := 0; i < 10; i++ {
offset := int(rand.Uint64() % uint64(fullSize-1))
@@ -253,14 +298,31 @@ func (c *posixFsTestHarness) checkFileData(log *lib.Logger, fullFilePath string,
if _, err := f.Seek(int64(offset), 0); err != nil {
panic(err)
}
expectedPartialData := c.expectedDataBuf[offset : offset+size]
actualPartialData := c.actualDataBuf[offset : offset+size]
expectedPartialData := (*expectedData)[offset : offset+size]
actualPartialData := (*actualData)[offset : offset+size]
if _, err := io.ReadFull(f, actualPartialData); err != nil {
panic(err)
}
checkFileData(actualPartialData, expectedPartialData)
checkFileData(fullFilePath, offset, offset+size, actualPartialData, expectedPartialData)
}
}
// Then we check the whole thing
if _, err := f.Seek(0, 0); err != nil {
panic(err)
}
_, err = io.ReadFull(f, *actualData)
if err != nil {
panic(err)
}
checkFileData(fullFilePath, 0, fullSize, *actualData, *expectedData)
}
// removeFile deletes the file `name` under directory `ownerId`.
// Previously the os.Remove error was silently ignored; every other harness
// operation treats failures as fatal, so panic for consistency.
func (c *posixFsTestHarness) removeFile(log *lib.Logger, ownerId string, name string) {
	if err := os.Remove(path.Join(ownerId, name)); err != nil {
		panic(err)
	}
}
// removeDirectory deletes the (empty) directory `name` under `ownerId`.
// Previously the os.Remove error was silently ignored; every other harness
// operation treats failures as fatal, so panic for consistency.
func (c *posixFsTestHarness) removeDirectory(log *lib.Logger, ownerId string, name string) {
	if err := os.Remove(path.Join(ownerId, name)); err != nil {
		panic(err)
	}
}
var _ = (fsTestHarness[string])((*posixFsTestHarness)(nil))
@@ -461,12 +523,12 @@ func (d *fsTestDir[Id]) check(log *lib.Logger, harness fsTestHarness[Id]) {
panic(fmt.Errorf("bad number of edges -- got %v + %v, expected %v + %v", len(files), len(dirs), len(d.children.files), len(d.children.files)))
}
for _, fileName := range files {
log.Debug("checking file %v", fileName)
name, err := strconv.Atoi(fileName)
if err != nil {
panic(err)
}
file, present := d.children.files[name]
log.Debug("checking file %v (size %v)", fileName, file.body.size)
if !present {
panic(fmt.Errorf("file %v not found", name))
}
@@ -491,6 +553,27 @@ func (d *fsTestDir[Id]) check(log *lib.Logger, harness fsTestHarness[Id]) {
}
}
// clean removes everything under this directory depth-first: all files
// first, then each subdirectory (recursively emptied before removal).
func (d *fsTestDir[Id]) clean(log *lib.Logger, harness fsTestHarness[Id]) {
	files, dirs := harness.readDirectory(log, d.id)
	for _, fileName := range files {
		log.Debug("removing file %v", fileName)
		harness.removeFile(log, d.id, fileName)
	}
	for _, dirName := range dirs {
		log.Debug("cleaning dir %v", dirName)
		// Directory names are the integer keys of our expected-state map.
		ix, err := strconv.Atoi(dirName)
		if err != nil {
			panic(err)
		}
		child, ok := d.children.directories[ix]
		if !ok {
			panic(fmt.Errorf("directory %v not found", ix))
		}
		// Empty the subdirectory before unlinking it.
		child.body.clean(log, harness)
		harness.removeDirectory(log, d.id, dirName)
	}
}
// Just the first block service id we can find
func findBlockServiceToPurge(log *lib.Logger, client *lib.Client) msgs.BlockServiceId {
filesReq := msgs.VisitFilesReq{}
@@ -519,15 +602,15 @@ func findBlockServiceToPurge(log *lib.Logger, client *lib.Client) msgs.BlockServ
func fsTestInternal[Id comparable](
log *lib.Logger,
state *fsTestState[Id],
shuckleAddress string,
opts *fsTestOpts,
counters *lib.ClientCounters,
harness fsTestHarness[Id],
rootId Id,
) {
state := fsTestState[Id]{
totalDirs: 1, // root dir
rootDir: *newFsTestDir(rootId),
if opts.checkThreads == 0 {
panic(fmt.Errorf("must specify at least one check thread"))
}
branching := int(math.Log(float64(opts.numDirs)) / math.Log(float64(opts.depth)))
rand := wyhash.New(42)
@@ -582,30 +665,110 @@ func fsTestInternal[Id comparable](
state.makeFile(log, harness, opts, rand, dir, state.totalDirs+state.totalFiles)
}
}
}
func fsTestInternalCheck[Id comparable](
log *lib.Logger,
state *fsTestState[Id],
shuckleAddress string,
opts *fsTestOpts,
counters *lib.ClientCounters,
harness fsTestHarness[Id],
rootId Id,
) {
// finally, check that our view of the world is the real view of the world
log.Info("checking directories/files")
errsChans := make([](chan any), opts.checkThreads)
for i := 0; i < opts.checkThreads; i++ {
errChan := make(chan any)
errsChans[i] = errChan
go func() {
defer func() {
err := recover()
if err != nil {
log.Info("stacktrace for %v:", err)
for _, line := range strings.Split(string(debug.Stack()), "\n") {
log.Info(line)
}
}
errChan <- err
}()
state.rootDir.check(log, harness)
}()
}
for i := 0; i < opts.checkThreads; i++ {
err := <-errsChans[i]
if err != nil {
panic(fmt.Errorf("checking thread %v failed: %v", i, err))
}
}
state.rootDir.check(log, harness)
// Now, try to migrate away from one block service, to stimulate that code path
// in tests somewhere.
{
client, err := lib.NewClient(log, shuckleAddress, nil, counters, nil)
if err != nil {
panic(err)
}
defer client.Close()
blockServiceToPurge := findBlockServiceToPurge(log, client)
log.Info("will migrate block service %v", blockServiceToPurge)
migrateStats := lib.MigrateStats{}
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, blockServiceToPurge)
if err != nil {
panic(fmt.Errorf("could not migrate: %w", err))
}
if migrateStats.MigratedBlocks == 0 {
panic(fmt.Errorf("migrate didn't migrate any blocks"))
/*
{
client, err := lib.NewClient(log, shuckleAddress, nil, counters, nil)
if err != nil {
panic(err)
}
defer client.Close()
blockServiceToPurge := findBlockServiceToPurge(log, client)
log.Info("will migrate block service %v", blockServiceToPurge)
migrateStats := lib.MigrateStats{}
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, blockServiceToPurge)
if err != nil {
panic(fmt.Errorf("could not migrate: %w", err))
}
if migrateStats.MigratedBlocks == 0 {
panic(fmt.Errorf("migrate didn't migrate any blocks"))
}
}
// And check the state again, don't bother with multiple threads thoush
state.rootDir.check(log, harness)
*/
// Now, remove everything -- the cleanup would do this anyway, but we want to stimulate
// the removal paths in the filesystem tests.
state.rootDir.clean(log, harness)
}
// replaceMountPoint rewrites fp, assumed to live under mountPoint, so that
// it refers to the same relative path under mountPointFuse instead.
func replaceMountPoint(mountPoint string, mountPointFuse string, fp string) string {
	relative := fp[len(mountPoint):]
	return mountPointFuse + relative
}
func convertDirToFuse(
dir *fsTestDir[string],
mountPoint string,
mountPointFuse string,
) *fsTestDir[string] {
newDir := fsTestDir[string]{
id: replaceMountPoint(mountPoint, mountPointFuse, dir.id),
children: fsTestChildren[string]{
files: make(map[int]fsTestChild[fsTestFile[string]]),
directories: make(map[int]fsTestChild[fsTestDir[string]]),
},
}
// And check the state again
state.rootDir.check(log, harness)
for i, file := range dir.children.files {
newFile := file
newFile.body.id = replaceMountPoint(mountPoint, mountPointFuse, newFile.body.id)
newDir.children.files[i] = newFile
}
for i, childDir := range dir.children.directories {
newChildDir := childDir
newChildDir.body = *convertDirToFuse(&childDir.body, mountPoint, mountPointFuse)
newDir.children.directories[i] = newChildDir
}
return &newDir
}
// convertToFuse returns a copy of the expected filesystem state whose paths
// are rooted at mountPointFuse instead of mountPoint. The input state is
// not modified.
func convertToFuse(
	state *fsTestState[string],
	mountPoint string,
	mountPointFuse string,
) *fsTestState[string] {
	converted := *state
	// Rewrite the whole directory tree recursively.
	converted.rootDir = *convertDirToFuse(&state.rootDir, mountPoint, mountPointFuse)
	return &converted
}
func fsTest(
@@ -626,9 +789,21 @@ func fsTest(
dirInfoCache: lib.NewDirInfoCache(),
readBufPool: lib.NewReadSpanBufPool(),
}
fsTestInternal[msgs.InodeId](log, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
state := fsTestState[msgs.InodeId]{
totalDirs: 1, // root dir
rootDir: *newFsTestDir(msgs.ROOT_DIR_INODE_ID),
}
fsTestInternal[msgs.InodeId](log, &state, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
fsTestInternalCheck[msgs.InodeId](log, &state, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
} else {
harness := &posixFsTestHarness{}
fsTestInternal[string](log, shuckleAddress, opts, counters, harness, realFs)
harness := &posixFsTestHarness{
bufPool: lib.NewReadSpanBufPool(),
}
state := fsTestState[string]{
totalDirs: 1, // root dir
rootDir: *newFsTestDir(realFs),
}
fsTestInternal[string](log, &state, shuckleAddress, opts, counters, harness, realFs)
fsTestInternalCheck[string](log, &state, shuckleAddress, opts, counters, harness, realFs)
}
}
+191 -20
View File
@@ -4,17 +4,20 @@ import (
"flag"
"fmt"
"os"
"os/exec"
"path"
"regexp"
"runtime"
"runtime/debug"
"runtime/pprof"
"strconv"
"strings"
"sync"
"time"
"xtx/eggsfs/lib"
"xtx/eggsfs/managedprocess"
"xtx/eggsfs/msgs"
"xtx/eggsfs/rs"
"golang.org/x/exp/constraints"
)
@@ -123,12 +126,65 @@ func runTest(
}
}
func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fuseMountPoint string, short bool, filter *regexp.Regexp) {
defer func() { handleRecover(log, terminateChan, recover()) }()
// getKmodDirRefreshTime reads the kernel module's directory refresh time,
// in milliseconds, from its procfs knob. Panics on any read/parse failure.
func getKmodDirRefreshTime() uint64 {
	raw, err := os.ReadFile("/proc/sys/fs/eggsfs/dir_refresh_time_ms")
	if err != nil {
		panic(err)
	}
	// The knob is a single decimal number, possibly newline-terminated.
	text := strings.TrimSpace(string(raw))
	ms, err := strconv.ParseUint(text, 10, 64)
	if err != nil {
		panic(err)
	}
	return ms
}
// We want to immediately clean up everything when we run the GC manually
// setKmodDirRefreshTime writes ms to the kernel module's directory refresh
// time procfs knob. The knob is root-owned, so the write goes through sudo.
// Panics on failure. (Fixes a copy-pasted error message that previously
// claimed a mount failure.)
func setKmodDirRefreshTime(ms uint64) {
	out, err := exec.Command("sudo", "sh", "-c", fmt.Sprintf("echo %v > /proc/sys/fs/eggsfs/dir_refresh_time_ms", ms)).CombinedOutput()
	if err != nil {
		panic(fmt.Errorf("could not set dir refresh time (%w): %s", err, out))
	}
}
// cfgOverrides collects key=value test-configuration overrides passed on
// the command line via repeated -cfg flags.
type cfgOverrides map[string]string

// Set implements flag.Value: it parses one "key=value" override.
// Splitting on the first '=' only (SplitN) lets values themselves contain
// '=' characters, which plain Split rejected.
func (i *cfgOverrides) Set(value string) error {
	parts := strings.SplitN(value, "=", 2)
	if len(parts) != 2 {
		return fmt.Errorf("invalid cfg override %q", value)
	}
	(*i)[parts[0]] = parts[1]
	return nil
}
// String implements flag.Value by rendering the overrides as a
// comma-separated list of key=value pairs (in map iteration order).
func (i *cfgOverrides) String() string {
	rendered := make([]string, 0, len(*i))
	for key, value := range *i {
		rendered = append(rendered, fmt.Sprintf("%s=%s", key, value))
	}
	return strings.Join(rendered, ",")
}
// int returns the override named k parsed as an int, or def when the key
// is absent. Panics if the value is present but not a valid integer.
func (i *cfgOverrides) int(k string, def int) int {
	raw, ok := (*i)[k]
	if !ok {
		return def
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		panic(err)
	}
	return parsed
}
func runTests(terminateChan chan any, log *lib.Logger, overrides *cfgOverrides, shuckleAddress string, mountPoint string, fuseMountPoint string, kmod bool, short bool, filter *regexp.Regexp) {
defer func() { handleRecover(log, terminateChan, recover()) }()
client, err := lib.NewClient(log, shuckleAddress, nil, nil, nil)
defer client.Close()
defaultSpanPolicy := &msgs.SpanPolicy{}
defaultStripePolicy := &msgs.StripePolicy{}
{
client, err := lib.NewClient(log, shuckleAddress, nil, nil, nil)
// We want to immediately clean up everything when we run the GC manually
if err != nil {
panic(err)
}
@@ -138,6 +194,44 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, snapshotPolicy); err != nil {
panic(err)
}
dirInfoCache := lib.NewDirInfoCache()
_, err = client.ResolveDirectoryInfoEntry(log, dirInfoCache, msgs.ROOT_DIR_INODE_ID, defaultSpanPolicy)
if err != nil {
panic(err)
}
// We also want to stimulate many spans while not writing huge files.
// This is the same as the default but the second and third policy are
/// 500KiB and 1MiB
spanPolicy := &msgs.SpanPolicy{
Entries: []msgs.SpanPolicyEntry{
{
MaxSize: 1 << 16,
Parity: rs.MkParity(1, 4),
},
{
MaxSize: 500 << 10,
Parity: rs.MkParity(4, 4),
},
{
MaxSize: 1 << 20,
Parity: rs.MkParity(10, 4),
},
},
}
// Also reduce stripe size to get many stripes
_, err = client.ResolveDirectoryInfoEntry(log, dirInfoCache, msgs.ROOT_DIR_INODE_ID, defaultStripePolicy)
if err != nil {
panic(err)
}
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, spanPolicy); err != nil {
panic(err)
}
stripePolicy := &msgs.StripePolicy{
TargetStripeSize: 4096 * 5,
}
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, stripePolicy); err != nil {
panic(err)
}
}
fileHistoryOpts := fileHistoryTestOpts{
@@ -162,21 +256,25 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
)
fsTestOpts := fsTestOpts{
numDirs: 1 * 1000, // we need at least 256 directories, to have at least one dir per shard
numFiles: 20 * 1000, // around 20 files per dir
depth: 4,
// 0.03 * 20*1000 * 5MiB = ~3GiB of file data
emptyFileProb: 0.8,
inlineFileProb: 0.17,
maxFileSize: 10 << 20, // 10MiB
spanSize: 1 << 20, // 1MiB
depth: 4,
maxFileSize: overrides.int("fsTest.maxFileSize", 10<<20), // 10MiB
spanSize: 1 << 20, // 1MiB
checkThreads: overrides.int("fsTest.checkThreads", 5),
}
// 0.03 * 20*1000 * 5MiB = ~3GiB of file data
if short {
fsTestOpts.numDirs = 200
fsTestOpts.numFiles = 10 * 200
fsTestOpts.numDirs = overrides.int("fsTest.numDirs", 200)
fsTestOpts.numFiles = overrides.int("fsTest.numFiles", 10*200)
fsTestOpts.emptyFileProb = 0.1
fsTestOpts.inlineFileProb = 0.3
} else {
fsTestOpts.numDirs = overrides.int("fsTest.numDirs", 1*1000) // we need at least 256 directories, to have at least one dir per shard
fsTestOpts.numFiles = overrides.int("fsTest.numFiles", 20*1000) // around 20 files per dir
fsTestOpts.emptyFileProb = 0.8
fsTestOpts.inlineFileProb = 0.17
}
runTest(
log,
shuckleAddress,
@@ -188,17 +286,65 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
},
)
if kmod {
originalDirRefreshTime := getKmodDirRefreshTime()
defer setKmodDirRefreshTime(originalDirRefreshTime)
setKmodDirRefreshTime(0)
}
if kmod {
preadddirOpts := preadddirOpts{
numDirs: 1000,
filesPerDir: 100,
loops: 1000,
threads: 10,
}
runTest(
log,
shuckleAddress,
filter,
"parallel readdir",
fmt.Sprintf("%v dirs, %v files per dir, %v loops, %v threads", preadddirOpts.numDirs, preadddirOpts.filesPerDir, preadddirOpts.loops, preadddirOpts.threads),
func(counters *lib.ClientCounters) {
preaddirTest(log, mountPoint, &preadddirOpts)
},
)
}
runTest(
log,
shuckleAddress,
filter,
"fuse fs",
"mounted fs",
fmt.Sprintf("%v dirs, %v files, %v depth", fsTestOpts.numDirs, fsTestOpts.numFiles, fsTestOpts.depth),
func(counters *lib.ClientCounters) {
fsTest(log, shuckleAddress, &fsTestOpts, counters, fuseMountPoint)
fsTest(log, shuckleAddress, &fsTestOpts, counters, mountPoint)
},
)
if kmod {
setKmodDirRefreshTime(1000) // 1 sec
runTest(
log,
shuckleAddress,
filter,
"mounted fs (1s dir refresh)",
fmt.Sprintf("%v dirs, %v files, %v depth", fsTestOpts.numDirs, fsTestOpts.numFiles, fsTestOpts.depth),
func(counters *lib.ClientCounters) {
fsTest(log, shuckleAddress, &fsTestOpts, counters, mountPoint)
},
)
}
// Restore default values for policies, otherwise things will be unduly slow below
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, defaultSpanPolicy); err != nil {
panic(err)
}
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, defaultStripePolicy); err != nil {
panic(err)
}
largeFileOpts := largeFileTestOpts{
fileSize: 1 << 30, // 1GiB
}
@@ -209,7 +355,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
"large file",
fmt.Sprintf("%vGB", float64(largeFileOpts.fileSize)/1e9),
func(counters *lib.ClientCounters) {
largeFileTest(log, &largeFileOpts, fuseMountPoint)
largeFileTest(log, &largeFileOpts, mountPoint)
},
)
@@ -225,7 +371,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
"rsync large files",
fmt.Sprintf("%v files, %v dirs, %vMB file size", rsyncOpts.numFiles, rsyncOpts.numDirs, float64(rsyncOpts.maxFileSize)/1e6),
func(counters *lib.ClientCounters) {
rsyncTest(log, &rsyncOpts, fuseMountPoint)
rsyncTest(log, &rsyncOpts, mountPoint)
},
)
@@ -241,7 +387,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
"rsync small files",
fmt.Sprintf("%v files, %v dirs, %vMB file size", rsyncOpts.numFiles, rsyncOpts.numDirs, float64(rsyncOpts.maxFileSize)/1e6),
func(counters *lib.ClientCounters) {
rsyncTest(log, &rsyncOpts, fuseMountPoint)
rsyncTest(log, &rsyncOpts, mountPoint)
},
)
@@ -256,6 +402,8 @@ func noRunawayArgs() {
}
func main() {
overrides := make(cfgOverrides)
buildType := flag.String("build-type", "alpine", "C++ build type, one of alpine/release/debug/sanitized/valgrind")
verbose := flag.Bool("verbose", false, "")
trace := flag.Bool("trace", false, "")
@@ -269,6 +417,8 @@ func main() {
short := flag.Bool("short", false, "Run a shorter version of the tests (useful with packet drop flags)")
repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).")
binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.")
kmod := flag.Bool("kmod", false, "Whether to mount with the kernel module, rather than FUSE. Note that the tests will not attempt to run the kernel module and load it, they'll just mount with 'mount -t eggsfs'.")
flag.Var(&overrides, "cfg", "Config overrides")
flag.Parse()
noRunawayArgs()
@@ -438,11 +588,32 @@ func main() {
Profile: *profile,
})
var mountPoint string
if *kmod {
mountPoint = path.Join(*dataDir, "kmod", "mnt")
err := os.MkdirAll(mountPoint, 0777)
if err != nil {
panic(err)
}
out, err := exec.Command("sudo", "mount", "-t", "eggsfs", fmt.Sprintf("127.0.0.1:%v", shucklePort), mountPoint).CombinedOutput()
if err != nil {
panic(fmt.Errorf("could not mount filesystem (%w): %s", err, out))
}
defer func() {
out, err := exec.Command("sudo", "umount", mountPoint).CombinedOutput()
if err != nil {
fmt.Printf("could not umount fs (%v): %s", err, out)
}
}()
} else {
mountPoint = fuseMountPoint
}
fmt.Printf("operational 🤖\n")
// start tests
go func() {
runTests(terminateChan, log, shuckleAddress, fuseMountPoint, *short, filterRe)
runTests(terminateChan, log, &overrides, shuckleAddress, mountPoint, fuseMountPoint, *kmod, *short, filterRe)
}()
// wait for things to finish
+104
View File
@@ -0,0 +1,104 @@
package main
import (
	"fmt"
	"os"
	"path"
	"strconv"
	"strings"

	"xtx/eggsfs/lib"
)
// preadddirOpts configures the parallel-readdir stress test.
type preadddirOpts struct {
	// Number of (empty) files created in every directory.
	filesPerDir int
	// NOTE(review): not referenced by the visible helpers — confirm whether
	// the checkers are meant to loop this many times.
	loops int
	// Number of concurrent checker goroutines.
	threads int
	// Number of top-level directories to create and then scan.
	numDirs int
}
// preaddirCheck scans every test directory once and verifies that all
// expected files (named 0..filesPerDir-1) appear in the listing. Missing
// files are logged and returned as descriptions rather than failing fast,
// so gaps from all checker threads can be reported together.
//
// Changes vs the original: the inner loops no longer shadow the outer
// index `i`, and the parsed file index is bounds-checked so an unexpected
// entry produces a descriptive panic instead of an opaque
// index-out-of-range runtime panic.
func preaddirCheck(
	log *lib.Logger,
	thread int,
	mountPoint string,
	opts *preadddirOpts,
) []string {
	errors := []string{}
	for dirIx := 0; dirIx < opts.numDirs; dirIx++ {
		dpath := path.Join(mountPoint, strconv.Itoa(dirIx))
		entries, err := os.ReadDir(dpath)
		if err != nil {
			panic(err)
		}
		// Mark which of the expected file names we actually saw.
		found := make([]bool, opts.filesPerDir)
		for _, entry := range entries {
			fileIx, err := strconv.Atoi(entry.Name())
			if err != nil {
				panic(err)
			}
			if fileIx < 0 || fileIx >= opts.filesPerDir {
				panic(fmt.Errorf("[%v] unexpected entry %v in %v", thread, entry.Name(), dpath))
			}
			found[fileIx] = true
		}
		for fileIx, present := range found {
			if !present {
				err := fmt.Sprintf("[%v] %v/%v", thread, dpath, fileIx)
				log.Error(err)
				errors = append(errors, err)
			}
		}
	}
	return errors
}
// preaddirTest populates opts.numDirs directories, each containing
// opts.filesPerDir empty files, then runs opts.threads concurrent readdir
// checkers over them and panics if any checker missed a file or failed.
//
// Changes vs the original: the final failure now panics with an error
// (fmt.Errorf) instead of a bare string, and the per-thread reports are
// flattened with a slice + strings.Join instead of quadratic string
// concatenation.
func preaddirTest(
	log *lib.Logger,
	mountPoint string,
	opts *preadddirOpts,
) {
	// Build the tree: mountPoint/<dir>/<file>, names are decimal indices.
	for dirIx := 0; dirIx < opts.numDirs; dirIx++ {
		dpath := path.Join(mountPoint, strconv.Itoa(dirIx))
		log.Debug("creating directory %v", dpath)
		if err := os.Mkdir(dpath, 0777); err != nil {
			panic(err)
		}
		for fileIx := 0; fileIx < opts.filesPerDir; fileIx++ {
			fpath := path.Join(dpath, strconv.Itoa(fileIx))
			log.Debug("creating file %v", fpath)
			f, err := os.Create(fpath)
			if err != nil {
				panic(err)
			}
			if err := f.Close(); err != nil {
				panic(err)
			}
		}
	}
	// Each checker sends its recover() value (nil on success) on its own
	// channel and records the missing entries it observed.
	errsChans := make([](chan any), opts.threads)
	errors := make([][]string, opts.threads)
	for i := 0; i < opts.threads; i++ {
		errsChans[i] = make(chan any)
		go func(thread int) {
			defer func() { errsChans[thread] <- recover() }()
			errors[thread] = preaddirCheck(log, thread, mountPoint, opts)
		}(i)
	}
	log.Info("waiting for threads")
	for i := 0; i < opts.threads; i++ {
		log.Debug("reading from %v", i)
		if err := <-errsChans[i]; err != nil {
			panic(fmt.Errorf("checking thread %v failed: %v", i, err))
		}
	}
	log.Info("checking errors")
	// Flatten every thread's report; any missing entry fails the test.
	missing := []string{}
	for _, threadErrors := range errors {
		missing = append(missing, threadErrors...)
	}
	if len(missing) > 0 {
		panic(fmt.Errorf("missing %s", strings.Join(missing, ", ")))
	}
}
+25 -16
View File
@@ -1,8 +1,8 @@
package main
import (
"bytes"
"fmt"
"io"
"xtx/eggsfs/crc32c"
"xtx/eggsfs/lib"
"xtx/eggsfs/msgs"
@@ -65,7 +65,7 @@ func createFile(
name string,
size uint64,
dataSeed uint64,
buf *[]byte,
bufPool *lib.ReadSpanBufPool,
) (id msgs.InodeId, creationTime msgs.EggsTime) {
// construct
constructReq := msgs.ConstructFileReq{
@@ -94,13 +94,11 @@ func createFile(
if uint32(size-offset) < thisSpanSize {
thisSpanSize = uint32(size - offset)
}
if len(*buf) < int(thisSpanSize) {
*buf = append(*buf, make([]byte, int(thisSpanSize)-len(*buf))...)
}
spanBuf := (*buf)[:thisSpanSize]
rand.Read(spanBuf)
spanBuf := bufPool.Get(int(thisSpanSize))
rand.Read(*spanBuf)
var err error
*buf, err = client.CreateSpan(log, []msgs.BlockServiceId{}, &spanPolicy, &blockPolicies, &stripePolicy, constructResp.Id, constructResp.Cookie, offset, uint32(thisSpanSize), spanBuf)
*spanBuf, err = client.CreateSpan(log, []msgs.BlockServiceId{}, &spanPolicy, &blockPolicies, &stripePolicy, constructResp.Id, constructResp.Cookie, offset, uint32(thisSpanSize), *spanBuf)
bufPool.Put(spanBuf)
if err != nil {
panic(err)
}
@@ -118,11 +116,11 @@ func createFile(
return constructResp.Id, linkResp.CreationTime
}
func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client, id msgs.InodeId, buf *[]byte) []byte {
func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client, id msgs.InodeId, buf []byte) []byte {
statResp := msgs.StatFileResp{}
shardReq(log, client, id.Shard(), &msgs.StatFileReq{Id: id}, &statResp)
if len(*buf) < int(statResp.Size) {
*buf = append(*buf, make([]byte, int(statResp.Size)-len(*buf))...)
if len(buf) < int(statResp.Size) {
buf = append(buf, make([]byte, int(statResp.Size)-len(buf))...)
}
spansReq := msgs.FileSpansReq{
FileId: id,
@@ -138,16 +136,27 @@ func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client,
if err != nil {
panic(err)
}
read, err := bytes.NewBuffer((*buf)[readSoFar:readSoFar]).ReadFrom(spanR)
spanR.Close()
spanRead := 0
for {
r, err := spanR.Read(buf[readSoFar+spanRead:])
if err == io.EOF {
break
}
if err != nil {
spanR.Close()
panic(err)
}
spanRead += r
}
spanR.Close() // TODO replace with Put?
if err != nil {
panic(err)
}
spanCrc := msgs.Crc(crc32c.Sum(0, (*buf)[readSoFar:readSoFar+int(read)]))
spanCrc := msgs.Crc(crc32c.Sum(0, buf[readSoFar:readSoFar+spanRead]))
if spanCrc != span.Header.Crc {
panic(fmt.Errorf("expected crc %v for span, but got %v", span.Header.Crc, spanCrc))
}
readSoFar += int(read)
readSoFar += spanRead
}
if spansResp.NextOffset == 0 {
break
@@ -157,7 +166,7 @@ func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client,
if readSoFar != int(statResp.Size) {
panic(fmt.Errorf("readSoFar=%v != statResp.Size=%v", readSoFar, statResp.Size))
}
return (*buf)[:readSoFar]
return buf[:readSoFar]
}
func readDir(log *lib.Logger, client *lib.Client, dir msgs.InodeId) []edge {
+33
View File
@@ -0,0 +1,33 @@
package lib
import (
"bytes"
"crypto/cipher"
"encoding/binary"
"xtx/eggsfs/msgs"
)
// BlockWriteCertificate computes the 8-byte certificate a block service
// expects with a write request: a CBC-MAC over the block service id, the
// tag byte 'w', the block id, the CRC, and the size, all little-endian.
func BlockWriteCertificate(cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) [8]byte {
	var payload bytes.Buffer
	binary.Write(&payload, binary.LittleEndian, uint64(blockServiceId))
	payload.WriteByte('w')
	binary.Write(&payload, binary.LittleEndian, uint64(req.BlockId))
	binary.Write(&payload, binary.LittleEndian, uint32(req.Crc))
	binary.Write(&payload, binary.LittleEndian, uint32(req.Size))
	return CBCMAC(cipher, payload.Bytes())
}
// CheckBlockWriteCertificate recomputes the write certificate for req and
// compares it to the one the client supplied. It returns the expected MAC
// (useful for error reporting) together with whether it matched.
func CheckBlockWriteCertificate(cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) ([8]byte, bool) {
	expectedMac := BlockWriteCertificate(cipher, blockServiceId, req)
	return expectedMac, expectedMac == req.Certificate
}
// CheckBlockEraseCertificate verifies the certificate on an erase request:
// a CBC-MAC over the block service id, the tag byte 'e', and the block id,
// all little-endian. It returns the expected MAC and whether it matched
// req.Certificate.
// NOTE(review): the (blockServiceId, cipher) parameter order is swapped
// relative to CheckBlockWriteCertificate — consider unifying the two.
func CheckBlockEraseCertificate(blockServiceId msgs.BlockServiceId, cipher cipher.Block, req *msgs.EraseBlockReq) ([8]byte, bool) {
	// compute mac
	w := bytes.NewBuffer([]byte{})
	binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
	w.Write([]byte{'e'})
	binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
	expectedMac := CBCMAC(cipher, w.Bytes())
	return expectedMac, expectedMac == req.Certificate
}
+2
View File
@@ -135,6 +135,7 @@ func ReadBlocksResponse(
}
func WriteBlocksResponse(log *Logger, w io.Writer, resp msgs.BlocksResponse) error {
log.Trace("writing response %T %+v", resp, resp)
if err := binary.Write(w, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
return err
}
@@ -148,6 +149,7 @@ func WriteBlocksResponse(log *Logger, w io.Writer, resp msgs.BlocksResponse) err
}
func WriteBlocksResponseError(log *Logger, w io.Writer, err msgs.ErrCode) error {
log.Debug("writing blocks error %v", err)
if err := binary.Write(w, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
return err
}
+18 -32
View File
@@ -530,18 +530,22 @@ func (client *Client) ResolvePath(log *Logger, path string) (msgs.InodeId, error
return id, nil
}
type Taintable interface {
Taint()
// Some connection we can reuse
type Puttable interface {
// Note: this can only be called if the current request (usually for blocks)
// has been _fully consumed without errors_. If it hasn't, then you should
// just close the connection.
Put()
}
type TaintableReadCloser interface {
type PuttableReadCloser interface {
io.ReadCloser
Taintable
Puttable
}
type TaintableCloser interface {
type PuttableCloser interface {
io.Closer
Taintable
Puttable
}
type BlocksConn interface {
@@ -549,51 +553,34 @@ type BlocksConn interface {
io.Reader
io.ReaderFrom
io.Closer
Taintable
Puttable
}
type trackedBlocksConn struct {
blockService msgs.BlockServiceId
conn *net.TCPConn
tainted bool
factory *blocksConnFactory
}
func (c *trackedBlocksConn) Read(p []byte) (int, error) {
read, err := c.conn.Read(p)
if err != nil {
c.tainted = true
}
return read, err
return c.conn.Read(p)
}
func (c *trackedBlocksConn) Write(p []byte) (int, error) {
written, err := c.conn.Write(p)
if err != nil {
c.tainted = true
}
return written, err
return c.conn.Write(p)
}
func (c *trackedBlocksConn) ReadFrom(r io.Reader) (int64, error) {
n, err := c.conn.ReadFrom(r)
if err != nil {
c.tainted = true
}
return n, err
return c.conn.ReadFrom(r)
}
func (c *trackedBlocksConn) Close() error {
if c.tainted {
return c.conn.Close()
} else {
c.factory.put(c.blockService, time.Now(), c.conn)
return nil
}
return c.conn.Close()
}
func (c *trackedBlocksConn) Taint() {
c.tainted = true
func (c *trackedBlocksConn) Put() {
c.factory.put(c.blockService, time.Now(), c.conn)
c.conn = nil
}
// The first ip1/port1 cannot be zeroed, the second one can. One of them
@@ -614,7 +601,6 @@ func (c *Client) GetBlocksConn(log *Logger, blockServiceId msgs.BlockServiceId,
return &trackedBlocksConn{
blockService: blockServiceId,
conn: conn.(*net.TCPConn),
tainted: false,
factory: &c.blocksConns,
}, nil
}
+3 -3
View File
@@ -17,10 +17,10 @@ func ReadShuckleRequest(
) (msgs.ShuckleRequest, error) {
var protocol uint32
if err := binary.Read(r, binary.LittleEndian, &protocol); err != nil {
return nil, fmt.Errorf("could not read protocol: %w", err)
return nil, err
}
if protocol != msgs.SHUCKLE_REQ_PROTOCOL_VERSION {
return nil, fmt.Errorf("bad shuckle protocol, expected %v, got %v", msgs.SHUCKLE_REQ_PROTOCOL_VERSION, protocol)
return nil, fmt.Errorf("bad shuckle protocol, expected %08x, got %08x", msgs.SHUCKLE_REQ_PROTOCOL_VERSION, protocol)
}
var len uint32
if err := binary.Read(r, binary.LittleEndian, &len); err != nil {
@@ -28,7 +28,7 @@ func ReadShuckleRequest(
}
data := make([]byte, len)
if _, err := io.ReadFull(r, data); err != nil {
return nil, err
return nil, fmt.Errorf("could not read response body: %w", err)
}
kind := msgs.ShuckleMessageKind(data[0])
var req msgs.ShuckleRequest
+37 -33
View File
@@ -74,6 +74,8 @@ func ensureLen(buf []byte, l int) []byte {
return buf
}
const EGGSFS_PAGE_SIZE int = 4096
func prepareSpanInitiateReq(
blacklist []msgs.BlockServiceId,
spanPolicies *msgs.SpanPolicy,
@@ -104,6 +106,8 @@ func prepareSpanInitiateReq(
B := spanPolicy.Parity.Blocks()
blockSize := (len(data) + D - 1) / D
cellSize := (blockSize + S - 1) / S
// Round up cell to page size
cellSize = EGGSFS_PAGE_SIZE * ((cellSize + EGGSFS_PAGE_SIZE - 1) / EGGSFS_PAGE_SIZE)
blockSize = cellSize * S
storageClass := blockPolicies.Pick(uint32(blockSize)).StorageClass
@@ -341,7 +345,7 @@ func (c *Client) CreateFile(
type mirroredSpanReader struct {
cursor int
block int
blockConn TaintableReadCloser
blockConn PuttableReadCloser
cellBuf *[]byte
cellCrcs []msgs.Crc // starting from the _next_ stripe crc
}
@@ -350,15 +354,15 @@ func (r *mirroredSpanReader) Close() error {
return r.blockConn.Close()
}
func (r *mirroredSpanReader) Taint() {
r.blockConn.Taint()
func (r *mirroredSpanReader) Put() {
r.blockConn.Put()
}
type rsNormalSpanReader struct {
bufPool *ReadSpanBufPool
cursor int
haveBlocks []uint8 // which blocks are we fetching. most of the times it'll just be the data blocks
blockConns []TaintableReadCloser
blockConns []PuttableReadCloser
blocksRunningCrcs []msgs.Crc
stripeBuf *[]byte
stripeCrcs []msgs.Crc // starting from the _next_ stripe CRC.
@@ -366,9 +370,9 @@ type rsNormalSpanReader struct {
}
func (sr *rsNormalSpanReader) Close() error {
sr.bufPool.put(sr.stripeBuf)
sr.bufPool.Put(sr.stripeBuf)
for _, b := range sr.parityBuffers {
sr.bufPool.put(b)
sr.bufPool.Put(b)
}
var lastErr error
for _, c := range sr.blockConns {
@@ -380,9 +384,9 @@ func (sr *rsNormalSpanReader) Close() error {
return lastErr
}
func (sr *rsNormalSpanReader) Taint() {
func (sr *rsNormalSpanReader) Put() {
for _, c := range sr.blockConns {
c.Taint()
c.Put()
}
}
@@ -398,12 +402,12 @@ type rsCorruptedSpanReader struct {
func (r *rsCorruptedSpanReader) Close() error {
for _, b := range r.dataBlocks {
r.bufPool.put(b)
r.bufPool.Put(b)
}
return nil
}
func (*rsCorruptedSpanReader) Taint() {}
func (*rsCorruptedSpanReader) Put() {}
type spanReader struct {
bufPool *ReadSpanBufPool
@@ -415,16 +419,16 @@ type spanReader struct {
stripes uint8
cellSize uint32
blocksCrcs []msgs.Crc
blockConn func(block int, offset uint32, size uint32) (TaintableReadCloser, error)
r TaintableCloser
blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error)
r PuttableCloser
}
func (sr *spanReader) Close() error {
return sr.r.Close()
}
func (sr *spanReader) Taint() {
sr.r.Taint()
func (sr *spanReader) Put() {
sr.r.Put()
}
func (sr *spanReader) repairCorruptedStripe(
@@ -435,7 +439,7 @@ func (sr *spanReader) repairCorruptedStripe(
parityData []*[]byte,
haveBlocks []uint8, // the blocks we have been using so far
haveBlocksCrc []msgs.Crc, // the CRCs so far for the blocks that we have
haveBlocksConns []TaintableReadCloser, // the connections to the blocks we already have
haveBlocksConns []PuttableReadCloser, // the connections to the blocks we already have
) (*rsCorruptedSpanReader, error) {
D := sr.parity.DataBlocks()
B := sr.parity.Blocks()
@@ -448,13 +452,13 @@ func (sr *spanReader) repairCorruptedStripe(
for i := D; i < B; i++ {
b := blocksData[i]
if b != nil {
bufPool.put(b)
bufPool.Put(b)
}
}
}()
// make space for full sized remaining data blocks
for b := 0; b < D; b++ {
blocksData[b] = bufPool.get(remainingBlockSize)
blocksData[b] = bufPool.Get(remainingBlockSize)
}
// load current stripe data into the blocks buffers
{
@@ -463,7 +467,7 @@ func (sr *spanReader) repairCorruptedStripe(
if int(b) < D {
copy(*blocksData[int(b)], (*stripeData)[int(b)*int(sr.cellSize):])
} else {
blocksData[b] = bufPool.get(remainingBlockSize)
blocksData[b] = bufPool.Get(remainingBlockSize)
copy(*blocksData[int(b)], *parityData[parityIx])
parityIx++
}
@@ -499,7 +503,7 @@ func (sr *spanReader) repairCorruptedStripe(
}
// Find and download blocks to recover
var tmpBuf *[]byte
defer bufPool.put(tmpBuf)
defer bufPool.Put(tmpBuf)
for b := 0; b < B && numGoodBlocks < D; b++ {
if badBlocks[b] || goodBlocks[b] { // we know this is bad
continue
@@ -516,7 +520,7 @@ func (sr *spanReader) repairCorruptedStripe(
continue
}
if tmpBuf == nil {
tmpBuf = bufPool.get(int(blockSize))
tmpBuf = bufPool.Get(int(blockSize))
}
if _, err := io.ReadFull(conn, *tmpBuf); err != nil {
conn.Close()
@@ -528,7 +532,7 @@ func (sr *spanReader) repairCorruptedStripe(
crc := crc32c.Sum(0, *tmpBuf)
if crc == uint32(sr.blocksCrcs[b]) {
// this is a good one
blocksData[b] = bufPool.get(remainingBlockSize)
blocksData[b] = bufPool.Get(remainingBlockSize)
copy(*blocksData[b], (*tmpBuf)[blockStart:])
goodBlocks[b] = true
numGoodBlocks++
@@ -822,8 +826,8 @@ func readSpanFromBlocks(
//
// We currently make the assumption that the connections that are available
// at the beginning will be available throughout the duration of span reading.
blockConn func(block int, offset uint32, size uint32) (TaintableReadCloser, error),
) (TaintableReadCloser, error) {
blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error),
) (PuttableReadCloser, error) {
D := parity.DataBlocks()
B := parity.Blocks()
sr := spanReader{
@@ -839,7 +843,7 @@ func readSpanFromBlocks(
if D == 1 {
mr := &mirroredSpanReader{
cursor: 0,
cellBuf: sr.bufPool.get(int(cellSize)),
cellBuf: sr.bufPool.Get(int(cellSize)),
cellCrcs: stripesCrc,
}
for b := 0; b < B; b++ {
@@ -861,7 +865,7 @@ func readSpanFromBlocks(
return nil, err
}
} else {
conns := make([]TaintableReadCloser, 0)
conns := make([]PuttableReadCloser, 0)
haveBlocks := make([]uint8, 0)
parityBuffers := []*[]byte{}
for i := 0; i < B; i++ {
@@ -875,7 +879,7 @@ func readSpanFromBlocks(
conns = append(conns, conn)
haveBlocks = append(haveBlocks, uint8(i))
if i >= D {
parityBuffers = append(parityBuffers, sr.bufPool.get(int(cellSize)))
parityBuffers = append(parityBuffers, sr.bufPool.Get(int(cellSize)))
}
if len(haveBlocks) == D {
break
@@ -884,7 +888,7 @@ func readSpanFromBlocks(
if len(haveBlocks) != D {
return nil, fmt.Errorf("couldn't get enough block connections (need at least %v, got %v)", D, len(conns))
}
stripeBuf := sr.bufPool.get(int(cellSize) * D)
stripeBuf := sr.bufPool.Get(int(cellSize) * D)
rsr := &rsNormalSpanReader{
bufPool: sr.bufPool,
cursor: 0,
@@ -931,7 +935,7 @@ func (*inlineSpanReader) Close() error {
return nil
}
func (*inlineSpanReader) Taint() {}
func (*inlineSpanReader) Put() {}
type ReadSpanBufPool struct {
pool sync.Pool
@@ -951,7 +955,7 @@ func NewReadSpanBufPool() *ReadSpanBufPool {
// This does _not_ zero the memory in the bufs -- i.e. there might
// be garbage in it.
func (pool *ReadSpanBufPool) get(l int) *[]byte {
func (pool *ReadSpanBufPool) Get(l int) *[]byte {
buf := pool.pool.Get().(*[]byte)
if cap(*buf) >= l {
*buf = (*buf)[:l]
@@ -962,7 +966,7 @@ func (pool *ReadSpanBufPool) get(l int) *[]byte {
return buf
}
func (pool *ReadSpanBufPool) put(buf *[]byte) {
func (pool *ReadSpanBufPool) Put(buf *[]byte) {
if buf != nil {
pool.pool.Put(buf)
}
@@ -974,8 +978,7 @@ func (c *Client) ReadSpan(
blacklist []msgs.BlockServiceId,
blockServices []msgs.BlockService,
fetchedSpan *msgs.FetchedSpan,
) (TaintableReadCloser, error) {
log.DebugStack(1, "starting to read span")
) (PuttableReadCloser, error) {
if fetchedSpan.Header.StorageClass == msgs.INLINE_STORAGE {
data := fetchedSpan.Body.(*msgs.FetchedInlineSpan).Body
dataCrc := msgs.Crc(crc32c.Sum(0, data))
@@ -991,11 +994,12 @@ func (c *Client) ReadSpan(
}
body := fetchedSpan.Body.(*msgs.FetchedBlocksSpan)
log.DebugStack(1, "span parity %v", body.Parity)
blocksCrcs := make([]msgs.Crc, body.Parity.Blocks())
for i := range blocksCrcs {
blocksCrcs[i] = body.Blocks[i].Crc
}
blockConn := func(blockIx int, offset uint32, size uint32) (TaintableReadCloser, error) {
blockConn := func(blockIx int, offset uint32, size uint32) (PuttableReadCloser, error) {
log.DebugStack(1, "requested connection for block ix %v, offset %v, size %v", blockIx, offset, size)
block := body.Blocks[blockIx]
blockService := blockServices[block.BlockServiceIx]
+4 -2
View File
@@ -37,7 +37,9 @@ func (c *mockedBlockConn) Close() error {
return nil
}
func (*mockedBlockConn) Taint() {}
func (c *mockedBlockConn) Put() {
c.Close()
}
func testSpan(
t *testing.T,
@@ -142,7 +144,7 @@ func testSpan(
openBlockConns := int64(0)
spanReader, err := readSpanFromBlocks(
bufPool, sizeWithZeros, req.Crc, req.Parity, req.Stripes, req.CellSize, blocksCrcs, stripesCrcs,
func(i int, offset uint32, size uint32) (TaintableReadCloser, error) {
func(i int, offset uint32, size uint32) (PuttableReadCloser, error) {
for _, b := range badConnections {
if int(b) == i {
return nil, nil
+6 -1
View File
@@ -112,6 +112,10 @@ func (id *InodeId) UnmarshalJSON(b []byte) error {
if err := json.Unmarshal(b, &ids); err != nil {
return err
}
if ids == "ROOT" {
*id = ROOT_DIR_INODE_ID
return nil
}
idu, err := strconv.ParseUint(ids, 0, 63)
if err != nil {
return err
@@ -506,6 +510,8 @@ type SoftUnlinkFileResp struct{}
type FileSpansReq struct {
FileId InodeId
ByteOffset uint64
// if 0, no limit.
Limit uint32
}
type FetchedBlock struct {
@@ -838,7 +844,6 @@ type ExpireTransientFileResp struct{}
type MakeDirectoryReq struct {
OwnerId InodeId
Name string
Info DirectoryInfo
}
type MakeDirectoryResp struct {
+543 -543
View File
File diff suppressed because it is too large Load Diff
+10
View File
@@ -0,0 +1,10 @@
*.qcow2
compile_commands.json
*.o
*.d
*.cmd
Module.symvers
modules.order
*.ko
*.mod
*.mod.c
+8
View File
@@ -0,0 +1,8 @@
To get a `compile_commands.json`:
* Call the kmod source dir (where this file is located) `$KMOD_DIR`
* Download some version of the linux source (I used `linux-5.4.237` to be in line with Ubuntu 20.04, whose stock kernel is 5.4), say `$LINUX_DIR`
* `cd $LINUX_DIR && make LLVM=1 defconfig && make LLVM=1 -j`
* `cd $KMOD_DIR && make KDIR=$LINUX_DIR LLVM=1 kmod`
* `$LINUX_DIR/scripts/gen_compile_commands.py -d $KMOD_DIR` to generate `compile_commands.json`
* Replace all occurrences of `"directory": $KMOD_DIR` with `"directory": $LINUX_DIR` in the generated `$KMOD_DIR/compile_commands.json`
+43
View File
@@ -0,0 +1,43 @@
# Out-of-tree kbuild Makefile for the eggsfs kernel module.
# KDIR points at a configured kernel source tree; override on the command
# line, e.g. `make KDIR=$LINUX_DIR LLVM=1 kmod`.
KDIR ?= /data/home/fmazzol/src/eggsfs/kmod/linux-5.4.237/
LLVM ?= 0

obj-m += eggsfs.o

# Each object must be listed exactly once: kbuild links every entry, so a
# duplicate (file.o used to appear twice) produces multiple-definition
# errors at module link time.
eggsfs-objs += \
	dir.o \
	err.o \
	file.o \
	inode.o \
	kmod.o \
	metadata.o \
	namei.o \
	net.o \
	skb.o \
	super.o \
	sysctl.o \
	sysfs.o \
	trace.o \
	shuckle.o \
	block.o \
	rs.o \
	crc.o \
	span.o

EXTRA_CFLAGS = -I$(src) -g -DDEBUG -fdiagnostics-color=always -Wno-declaration-after-statement

# Sparse flags (used by the C=1 builds below).
export CF = -Wbitwise -Wcontext -Wcast_truncate -Wsparse-all -Wno-shadow -Wnosizeof-void -DCONFIG_SPARSE_RCU_POINTER

# C=1 builds with sparse. Use $(MAKE) so make flags (-j, -n, ...) propagate
# to the sub-make.
kmod:
	$(MAKE) -C $(KDIR) M=$(PWD) C=1 modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
	rm -f bincode_tests

# Userspace round-trip tests for the bincode serializers, built with
# sanitizers (see bincode_tests.c).
bincode_tests: bincode_tests.c bincodegen.h bincode.h
	clang -Wall -g -O2 -fsanitize=undefined,address,integer,function -fno-sanitize-recover=all bincode_tests.c -o bincode_tests

writefile: writefile.c
	gcc -Wall -g -O2 writefile.c -o writefile
+143
View File
@@ -0,0 +1,143 @@
#ifndef _EGGSFS_BINCODE_H
#define _EGGSFS_BINCODE_H
#ifndef EGGSFS_BINCODE_TESTS
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/bug.h>
#include <asm/unaligned.h>
#endif
struct eggsfs_bincode_put_ctx { char* start; char* cursor; char* end; };
struct eggsfs_bincode_get_ctx { char* buf; char* end; u16 err; };
struct eggsfs_bincode_bytes { char* buf; u8 len; };
#define eggsfs_bincode_get_finish_list_el(end) ((void)(end))
#include "bincodegen.h"
// Serializes `x` little-endian into the put context and advances the cursor.
// Running out of buffer is treated as a programmer error (BUG_ON): callers
// are expected to size buffers up front (e.g. with the generated *_MAX_SIZE
// constants), so there is no runtime error path here.
inline static void eggsfs_bincode_put_u64(struct eggsfs_bincode_put_ctx* ctx, u64 x) {
    BUG_ON(ctx->end - ctx->cursor < sizeof(x));
    put_unaligned_le64(x, ctx->cursor);
    ctx->cursor += sizeof(x);
}
// Serializes `x` little-endian into the put context and advances the cursor.
// See eggsfs_bincode_put_u64 for the BUG_ON rationale.
inline static void eggsfs_bincode_put_u32(struct eggsfs_bincode_put_ctx* ctx, u32 x) {
    BUG_ON(ctx->end - ctx->cursor < sizeof(x));
    put_unaligned_le32(x, ctx->cursor);
    ctx->cursor += sizeof(x);
}
// Serializes `x` little-endian into the put context and advances the cursor.
// See eggsfs_bincode_put_u64 for the BUG_ON rationale.
inline static void eggsfs_bincode_put_u16(struct eggsfs_bincode_put_ctx* ctx, u16 x) {
    BUG_ON(ctx->end - ctx->cursor < sizeof(x));
    put_unaligned_le16(x, ctx->cursor);
    ctx->cursor += sizeof(x);
}
// Serializes a single byte into the put context and advances the cursor.
// See eggsfs_bincode_put_u64 for the BUG_ON rationale.
inline static void eggsfs_bincode_put_u8(struct eggsfs_bincode_put_ctx* ctx, u8 x) {
    BUG_ON(ctx->end - ctx->cursor < sizeof(x));
    *(u8*)(ctx->cursor) = x;
    ctx->cursor += sizeof(x);
}
inline static u64 eggsfs_bincode_get_u64(struct eggsfs_bincode_get_ctx* ctx) {
if (likely(ctx->err == 0)) {
if (unlikely(ctx->end - ctx->buf < sizeof(u64))) {
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
return 0;
} else {
u64 x = get_unaligned_le64(ctx->buf);
ctx->buf += sizeof(x);
return x;
}
} else {
return 0;
}
}
inline static u32 eggsfs_bincode_get_u32(struct eggsfs_bincode_get_ctx* ctx) {
if (likely(ctx->err == 0)) {
if (unlikely(ctx->end - ctx->buf < sizeof(u32))) {
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
return 0;
} else {
u32 x = get_unaligned_le32(ctx->buf);
ctx->buf += sizeof(x);
return x;
}
} else {
return 0;
}
}
inline static u16 eggsfs_bincode_get_u16(struct eggsfs_bincode_get_ctx* ctx) {
if (likely(ctx->err == 0)) {
if (unlikely(ctx->end - ctx->buf < sizeof(u16))) {
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
return 0;
} else {
u16 x = get_unaligned_le16(ctx->buf);
ctx->buf += sizeof(x);
return x;
}
} else {
return 0;
}
}
inline static u8 eggsfs_bincode_get_u8(struct eggsfs_bincode_get_ctx* ctx) {
if (likely(ctx->err == 0)) {
if (unlikely(ctx->end - ctx->buf < sizeof(u8))) {
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
return 0;
} else {
u8 x = *(u8*)(ctx->buf);
ctx->buf += sizeof(x);
return x;
}
} else {
return 0;
}
}
// >>> format(struct.unpack('<I', b'SHA\0')[0], 'x')
// '414853'
static const u32 EGGSFS_SHARD_REQ_PROTOCOL_VERSION = 0x414853;
// >>> format(struct.unpack('<I', b'SHA\1')[0], 'x')
// '1414853'
static const u32 EGGSFS_SHARD_RESP_PROTOCOL_VERSION = 0x1414853;
// >>> format(struct.unpack('<I', b'CDC\0')[0], 'x')
// '434443'
static const u32 EGGSFS_CDC_REQ_PROTOCOL_VERSION = 0x434443;
// >>> format(struct.unpack('<I', b'CDC\1')[0], 'x')
// '1434443'
static const u32 EGGSFS_CDC_RESP_PROTOCOL_VERSION = 0x1434443;
// >>> format(struct.unpack('<I', b'SHU\0')[0], 'x')
// '554853'
static const u32 EGGSFS_SHUCKLE_REQ_PROTOCOL_VERSION = 0x554853;
// >>> format(struct.unpack('<I', b'SHU\1')[0], 'x')
// '1554853'
static const u32 EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION = 0x1554853;
// >>> format(struct.unpack('<I', b'BLO\0')[0], 'x')
// '4f4c42'
static const u32 EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION = 0x4f4c42;
// >>> format(struct.unpack('<I', b'BLO\1')[0], 'x')
// '14f4c42'
static const u32 EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION = 0x14f4c42;
static const u8 SPAN_POLICY_TAG = 2;
static const u8 BLOCK_POLICY_TAG = 3;
static const u8 STRIPE_POLICY_TAG = 4;
static const u8 EGGSFS_EMPTY_STORAGE = 0;
static const u8 EGGSFS_INLINE_STORAGE = 1;
static const u8 EGGSFS_HDD_STORAGE = 2;
static const u8 EGGSFS_FLASH_STORAGE = 3;
#define EGGSFS_GET_EXTRA(x) (((x) & (1ull<<63)) >> 63)
#define EGGSFS_GET_EXTRA_ID(x) ((x) & ~(1ull<<63))
#endif
+163
View File
@@ -0,0 +1,163 @@
// Userspace shims for the kernel facilities bincode.h relies on, so the
// serializers can be exercised with plain clang + sanitizers (see the
// `bincode_tests` target in the Makefile).
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>

// Kernel-style fixed-width aliases.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

// Largest UDP payload we expect: 1500-byte MTU minus IP (20) and UDP (8) headers.
#define UDP_MTU 1472

// Userspace BUG_ON: print the failed condition and abort the test binary.
#define BUG_ON(x) \
    if (unlikely(x)) { \
        fprintf(stderr, "bug: " #x); \
        fprintf(stderr, "\n"); \
        exit(1); \
    }

// NOTE(review): the *_le_* helpers are plain memcpy with no byte swap, i.e.
// they assume a little-endian host -- TODO confirm that is acceptable for
// these tests. The *_be_* ones use glibc-specific __bswap_64/__bswap_32
// (from <byteswap.h>, pulled in transitively).
static inline u64 get_unaligned_le64(const void* p) {
    u64 x;
    memcpy(&x, p, sizeof(x));
    return x;
}
static inline u32 get_unaligned_le32(const void* p) {
    u32 x;
    memcpy(&x, p, sizeof(x));
    return x;
}
static inline u16 get_unaligned_le16(const void* p) {
    u16 x;
    memcpy(&x, p, sizeof(x));
    return x;
}
static inline u64 get_unaligned_be64(const void* p) {
    u64 x;
    memcpy(&x, p, sizeof(x));
    return __bswap_64(x);
}
static inline u32 get_unaligned_be32(const void* p) {
    u32 x;
    memcpy(&x, p, sizeof(x));
    return __bswap_32(x);
}
static inline void put_unaligned_le64(u64 x, void* p) {
    memcpy(p, &x, sizeof(x));
}
static inline void put_unaligned_le32(u32 x, void* p) {
    memcpy(p, &x, sizeof(x));
}
static inline void put_unaligned_le16(u16 x, void* p) {
    memcpy(p, &x, sizeof(x));
}
static inline void put_unaligned_be64(u64 x, void* p) {
    x = __bswap_64(x);
    memcpy(p, &x, sizeof(x));
}
static inline void put_unaligned_be32(u32 x, void* p) {
    x = __bswap_32(x);
    memcpy(p, &x, sizeof(x));
}

// Makes bincode.h use the shims above instead of kernel headers.
#define EGGSFS_BINCODE_TESTS
#include "bincode.h"
// Round-trip tests for the generated bincode encoders/decoders: encode a
// message, decode it back, and BUG_ON any mismatch. The bare identifiers
// (start, dir_id, name, ...) are declared by the generated put/get macros.
int main(void) {
    // 1. Simple fixed-size message: LookupReq.
    {
        char lookup_req[EGGSFS_LOOKUP_REQ_MAX_SIZE];
        struct eggsfs_bincode_put_ctx put_ctx = {
            .start = lookup_req,
            .cursor = lookup_req,
            .end = lookup_req + sizeof(lookup_req),
        };
        {
            eggsfs_lookup_req_put_start(&put_ctx, start);
            eggsfs_lookup_req_put_dir_id(&put_ctx, start, dir_id, 123);
            eggsfs_lookup_req_put_name(&put_ctx, dir_id, name, "foo", 3);
            // was `&ctx`, which is not declared in this scope
            eggsfs_lookup_req_put_end(&put_ctx, name, end);
        }
        struct eggsfs_bincode_get_ctx get_ctx = {
            .buf = put_ctx.start,
            .end = put_ctx.cursor,
            .err = 0,
        };
        {
            eggsfs_lookup_req_get_start(&get_ctx, start);
            eggsfs_lookup_req_get_dir_id(&get_ctx, start, dir_id);
            eggsfs_lookup_req_get_name(&get_ctx, dir_id, name);
            eggsfs_lookup_req_get_end(&get_ctx, name, end);
            eggsfs_lookup_req_get_finish(&get_ctx, end);
            BUG_ON(get_ctx.err != 0);
            BUG_ON(dir_id.x != 123);
            BUG_ON(name.str.len != 3);
            BUG_ON(strncmp("foo", name.str.buf, 3) != 0);
        }
    }
    // 2. Message containing a list: ReadDirResp with three edges.
    {
        char read_dir_resp[UDP_MTU];
        struct eggsfs_bincode_put_ctx put_ctx = {
            .start = read_dir_resp,
            .cursor = read_dir_resp,
            .end = read_dir_resp + sizeof(read_dir_resp),
        };
        {
            eggsfs_read_dir_resp_put_start(&put_ctx, start);
            eggsfs_read_dir_resp_put_next_hash(&put_ctx, start, next_hash, 123);
            eggsfs_read_dir_resp_put_results(&put_ctx, next_hash, results, 3);
            for (int i = 0; i < 3; i++) {
                eggsfs_current_edge_put_start(&put_ctx, edge_start);
                eggsfs_current_edge_put_target_id(&put_ctx, edge_start, target_id, i);
                eggsfs_current_edge_put_name_hash(&put_ctx, target_id, name_hash, i + 42);
                // The buffer must fit digit + NUL: with char[1] snprintf
                // writes only the terminator and the stored name is "\0"
                // rather than the digit. We still store exactly 1 byte.
                char name_str[2];
                snprintf(name_str, sizeof(name_str), "%d", i);
                eggsfs_current_edge_put_name(&put_ctx, name_hash, name, name_str, 1);
                eggsfs_current_edge_put_creation_time(&put_ctx, name, creation_time, i + 1000);
                eggsfs_current_edge_put_end(&put_ctx, creation_time, end);
            }
            // was `&ctx`, which is not declared in this scope
            eggsfs_read_dir_resp_put_end(&put_ctx, results, end);
        }
        struct eggsfs_bincode_get_ctx get_ctx = {
            .buf = put_ctx.start,
            .end = put_ctx.cursor,
            .err = 0,
        };
        {
            eggsfs_read_dir_resp_get_start(&get_ctx, start);
            eggsfs_read_dir_resp_get_next_hash(&get_ctx, start, next_hash);
            eggsfs_read_dir_resp_get_results(&get_ctx, next_hash, results);
            for (int i = 0; i < results.len; i++) {
                eggsfs_current_edge_get_start(&get_ctx, edge_start);
                eggsfs_current_edge_get_target_id(&get_ctx, edge_start, target_id);
                eggsfs_current_edge_get_name_hash(&get_ctx, target_id, name_hash);
                eggsfs_current_edge_get_name(&get_ctx, name_hash, name);
                eggsfs_current_edge_get_creation_time(&get_ctx, name, creation_time);
                // was `&put_ctx`: this is the decode path, it must use get_ctx
                eggsfs_current_edge_get_end(&get_ctx, creation_time, end);
                eggsfs_bincode_get_finish_list_el(end);
                if (get_ctx.err == 0) {
                    BUG_ON(target_id.x != i);
                    BUG_ON(name_hash.x != i + 42);
                    BUG_ON(name.str.len != 1);
                    char name_str[2];
                    snprintf(name_str, sizeof(name_str), "%d", i);
                    BUG_ON(strncmp(name.str.buf, name_str, 1) != 0);
                }
            }
            eggsfs_read_dir_resp_get_end(&get_ctx, results, end);
            eggsfs_read_dir_resp_get_finish(&get_ctx, end);
            BUG_ON(get_ctx.err != 0);
            BUG_ON(next_hash.x != 123);
        }
    }
    return 0;
}
+5142
View File
File diff suppressed because it is too large Load Diff
View File
+765
View File
@@ -0,0 +1,765 @@
// Plan for socket reclamation:
// * Have a hashmap or anyway some sort of mapping from port/ip to socket
// * Every n seconds in get_socket function purge inactive ones
//
// Plan for writing. At any given point, we are:
// * Writing the current data blocks and the previous parity blocks.
// * Computing the current parity blocks.
// We can do this by just firing all the requests, start computing the
// parity blocks, and then waiting on all the responses.
// This should ideally keep us writing close to 100% of the time, with
// a small tail of writing the parity blocks for the last span.
#include <linux/inet.h>
#include <net/tcp.h>
#include <net/sock.h>
#include "block.h"
#include "common.h"
#include "err.h"
#include "skb.h"
#include "super.h"
#include "crc.h"
#include "trace.h"
#define EGGSFS_BLOCKS_REQ_HEADER_SIZE (4 + 8 + 1) // protocol + block service id + kind
// A TCP connection to a block service, carrying a FIFO queue of in-flight
// requests that the receive callback drains in order.
struct eggsfs_block_socket {
    struct socket* sock;
    atomic_t sock_err; // sticky socket-wide error, 0 while the socket is healthy
    struct list_head requests; // in-flight requests, consumed FIFO by the receive path
    spinlock_t requests_lock; // protects `requests`; taken _bh (receive runs in softirq)
    struct sockaddr_in addr; // address we connected to (also used for log messages)
    // saved callbacks
    void (*saved_state_change)(struct sock *sk);
    void (*saved_data_ready)(struct sock *sk);
    void (*saved_write_space)(struct sock *sk);
    // only used for block writes
    struct work_struct write_work;
};
static u64 WHICH_BLOCK_IP = 0;
static struct kmem_cache* eggsfs_fetch_block_request_cachep;
static struct kmem_cache* eggsfs_write_block_request_cachep;
// Placeholder for detecting a connection that the peer closed before all
// pending fetches completed. Currently a no-op; callers hold sk_callback_lock.
static void eggsfs_fetch_block_state_check(struct sock* sk) {
    // TODO check if connection has been closed before its time and error out
    // eggsfs_debug_print("socket state check triggered: %d", sk->sk_state);
}
// sk_state_change callback installed on fetch sockets: runs the (currently
// no-op) state check under the callback lock.
static void eggsfs_fetch_block_state_change(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    eggsfs_fetch_block_state_check(sk);
    read_unlock_bh(&sk->sk_callback_lock);
}
// Returns a negative result if the socket has to be considered corrupted.
// If a "recoverable" error occurs (e.g. the server just replies with an error)
// `req->err` will be filled in and receiving can continue. If an unrecoverable
// error occurs `req->err` will be filled in also, but it's expected that the
// caller stops operating on this socket.
// Consumes bytes from `skb` (starting at `offset`, at most `len0`) for one
// fetch request: first the response header, then (if no error response) the
// block payload, which is CRC'd and copied into the request's page list.
// Returns the number of bytes consumed, or a negative errno if the socket
// must be considered corrupted (see comment above). Partial reads are fine:
// progress is tracked in req->header_read / req->bytes_left / req->page_offset
// so the next skb resumes where this one stopped.
static int eggsfs_fetch_block_receive_single_req(
    struct eggsfs_fetch_block_request* req,
    struct sk_buff* skb,
    unsigned int offset, size_t len0
) {
    size_t len = len0;
    // Copies up to `count` bytes out of the skb into `buf`, advancing the
    // local offset/len and the persistent header progress counter.
    // Evaluates to the number of bytes actually copied.
#define HEADER_COPY(buf, count) ({ \
    int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
    offset += read; \
    len -= read; \
    req->header_read += read; \
    read; \
})
    // Header
    {
        int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->header_read;
        int read = HEADER_COPY(req->header.buf + req->header_read, header_left);
        header_left -= read;
        // header incomplete: wait for the next skb
        if (header_left > 0) { return len0-len; }
    }
    // Protocol check
    if (unlikely(le32_to_cpu(req->header.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
        eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->header.data.protocol);
        return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
    }
    // Error check: kind == 0 means an error response with a 2-byte error code
    // following the header.
    if (unlikely(req->header.data.kind == 0)) {
        int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->header_read;
        int read = HEADER_COPY(req->header.buf + req->header_read, error_left);
        error_left -= read;
        if (error_left > 0) { return len0-len; }
        eggsfs_info_print("failed=%u", req->header.data.error);
        // we got an error "nicely", the socket can carry on living.
        req->err = eggsfs_error_to_linux(le16_to_cpu(req->header.data.error));
        return len0-len;
    }
#undef HEADER_COPY
    // Actual data copy: stream the payload through the skb seq-read API into
    // the request's pages, accumulating the CRC as we go.
    BUG_ON(req->bytes_left == 0);
    struct page* page = list_first_entry(&req->pages, struct page, lru);
    BUG_ON(!page);
    char* page_ptr = kmap_atomic(page);
    struct skb_seq_state seq;
    skb_prepare_seq_read(skb, offset, offset + min(req->bytes_left, (u32)len), &seq);
    u32 block_bytes_read = 0;
    while (block_bytes_read < req->bytes_left) {
        const u8* data;
        u32 avail = skb_seq_read(block_bytes_read, &data, &seq);
        if (avail == 0) { break; }
        // NOTE(review): kernel_fpu_begin/end around the CRC suggests an
        // SSE/AVX crc32c implementation -- confirm eggsfs_crc32c needs it.
        kernel_fpu_begin(); // crc
        while (avail) {
            // bounded by skb data, space left in the current page, and
            // payload remaining for this request
            u32 this_len = min3(avail, (u32)PAGE_SIZE - req->page_offset, req->bytes_left - block_bytes_read);
            req->crc = eggsfs_crc32c(req->crc, data, this_len);
            memcpy(page_ptr + req->page_offset, data, this_len);
            data += this_len;
            req->page_offset += this_len;
            avail -= this_len;
            block_bytes_read += this_len;
            if (req->page_offset >= PAGE_SIZE) {
                // current page full: rotate to the next page in the list
                BUG_ON(req->page_offset > PAGE_SIZE);
                kunmap_atomic(page_ptr);
                list_rotate_left(&req->pages);
                page = list_first_entry(&req->pages, struct page, lru);
                page_ptr = kmap_atomic(page);
                req->page_offset = 0;
            }
        }
        kernel_fpu_end();
    }
    // We might have not terminated with avail == 0, but because we're done with this request
    skb_abort_seq_read(&seq);
    req->bytes_left -= block_bytes_read;
    // The last page is partially filled, zero it and return to first page
    if (!req->bytes_left && req->page_offset) {
        memset(page_ptr + req->page_offset, 0, PAGE_SIZE - req->page_offset);
        list_rotate_left(&req->pages);
        req->page_offset = 0;
    }
    kunmap_atomic(page_ptr);
    return (len0-len) + block_bytes_read;
}
void eggsfs_put_fetch_block_socket(struct eggsfs_block_socket *socket) {
// TODO terminate existing requests
read_lock(&socket->sock->sk->sk_callback_lock);
socket->sock->sk->sk_data_ready = socket->saved_data_ready;
socket->sock->sk->sk_state_change = socket->saved_state_change;
socket->sock->sk->sk_write_space = socket->saved_write_space;
read_unlock(&socket->sock->sk->sk_callback_lock);
sock_release(socket->sock);
kfree(socket);
}
// tcp_read_sock actor: feeds incoming bytes to the head of the socket's FIFO
// request queue, completing requests as they finish (or as they error) and
// moving on to the next one. Once a socket-level error is seen, every
// remaining queued request is failed with it.
static int eggsfs_fetch_block_receive(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
    struct eggsfs_block_socket* socket = rd_desc->arg.data;
    int initial_len = len;
    int sock_err = atomic_read(&socket->sock_err);
    for (;;) {
        // peek the head request; requests are serviced strictly in order
        spin_lock_bh(&socket->requests_lock);
        struct eggsfs_fetch_block_request* req = list_empty(&socket->requests) ?
            NULL :
            list_first_entry(&socket->requests, struct eggsfs_fetch_block_request, socket_requests);
        spin_unlock_bh(&socket->requests_lock);
        if (req == NULL) { break; }
        int consumed = sock_err;
        if (consumed == 0) {
            consumed = eggsfs_fetch_block_receive_single_req(req, skb, offset, len);
            if (consumed < 0) {
                // unrecoverable: poison the rest of the loop
                sock_err = consumed;
            } else {
                offset += consumed;
                len -= consumed;
            }
        } else {
            // socket already dead: fail this request without touching the skb
            req->err = consumed;
        }
        if (req->err || req->bytes_left == 0) {
            // request finished (ok or error): unlink it and drop the socket's
            // reference under the lock, then wake the waiter
            spin_lock_bh(&socket->requests_lock);
            BUG_ON(atomic_read(&req->refcount) < 1);
            bool should_free = atomic_dec_and_test(&req->refcount);
            list_del(&req->socket_requests);
            spin_unlock_bh(&socket->requests_lock);
            complete_all(&req->comp);
            // Important to do the actual freeing after the completion: otherwise
            // there might be races between this and a consumer taking ownership
            // of the pages.
            if (should_free) {
                put_pages_list(&req->pages);
                kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
            }
        } else {
            // request still wants more bytes than this skb had
            break;
        }
    }
    if (sock_err == 0) {
        // NOTE(review): this also trips if data arrives with no request
        // queued -- assumes the server never sends unsolicited bytes.
        BUG_ON(len != 0); // can't have leftovers
    }
    return initial_len;
}
// sk_data_ready callback installed on fetch sockets (runs in softirq
// context): drains all available bytes through eggsfs_fetch_block_receive.
static void eggsfs_fetch_block_data_ready(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    read_descriptor_t rd_desc;
    // Taken from iscsi -- we set count to 1 because we want the network layer to
    // hand us all the skbs that are available.
    rd_desc.arg.data = sk->sk_user_data;
    rd_desc.count = 1;
    tcp_read_sock(sk, &rd_desc, eggsfs_fetch_block_receive);
    eggsfs_fetch_block_state_check(sk);
    read_unlock_bh(&sk->sk_callback_lock);
}
// Creates a TCP connection to a block service and installs our receive
// callbacks on it. Tries both advertised addresses (round-robin starting
// point) before giving up. Returns the wrapper or an ERR_PTR.
struct eggsfs_block_socket* eggsfs_get_fetch_block_socket(struct eggsfs_block_service *block_service) {
    struct eggsfs_block_socket* socket = kzalloc(sizeof(struct eggsfs_block_socket), GFP_KERNEL);
    if (socket == NULL) { return ERR_PTR(-ENOMEM); }
    int err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &socket->sock);
    if (err != 0) { goto out_err; }
    atomic_set(&socket->sock_err, 0);
    // Save the original callbacks so teardown can restore them.
    socket->saved_data_ready = socket->sock->sk->sk_data_ready;
    socket->saved_state_change = socket->sock->sk->sk_state_change;
    socket->saved_write_space = socket->sock->sk->sk_write_space;
    // Try both ips (if we have them) before giving up. WHICH_BLOCK_IP is a
    // best-effort round-robin counter; races on the increment are harmless.
    int block_ip = WHICH_BLOCK_IP++;
    int i;
    for (i = 0; i < 1 + (block_service->port2 != 0); i++, block_ip++) { // we might not have a second address
        socket->addr.sin_family = AF_INET;
        if (block_service->port2 == 0 || block_ip & 1) {
            socket->addr.sin_addr.s_addr = htonl(block_service->ip1);
            socket->addr.sin_port = htons(block_service->port1);
        } else {
            socket->addr.sin_addr.s_addr = htonl(block_service->ip2);
            socket->addr.sin_port = htons(block_service->port2);
        }
        // TODO we should probably use TFO, but it's annoying because our custom callbacks
        // break the connection, so we'd need to coordinate with that in a more annoying way.
        // Also this makes it easier to try both IPs immediately.
        err = kernel_connect(socket->sock, (struct sockaddr*)&socket->addr, sizeof(socket->addr), 0);
        if (err < 0) {
            eggsfs_warn_print("could not connect to block service at %pI4:%d: %d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port), err);
        } else {
            break;
        }
    }
    if (err < 0) { goto out_err_sock; }
    INIT_LIST_HEAD(&socket->requests);
    spin_lock_init(&socket->requests_lock);
    // Important for the callbacks to happen after the connect, see TODO above.
    // Installing callbacks modifies the sock, so take sk_callback_lock for
    // writing with BHs off (read_lock would allow a concurrent writer and
    // would not exclude softirq callback users).
    write_lock_bh(&socket->sock->sk->sk_callback_lock);
    socket->sock->sk->sk_user_data = socket;
    socket->sock->sk->sk_data_ready = eggsfs_fetch_block_data_ready;
    socket->sock->sk->sk_state_change = eggsfs_fetch_block_state_change;
    write_unlock_bh(&socket->sock->sk->sk_callback_lock);
    return socket;
out_err_sock:
    sock_release(socket->sock);
out_err:
    kfree(socket);
    return ERR_PTR(err);
}
// Starts an asynchronous block fetch on `socket`: allocates the request and
// its destination pages, queues it on the socket (responses are matched FIFO),
// then sends the request message. Returns the request (wait on req->comp,
// release with eggsfs_put_fetch_block_request) or an ERR_PTR.
struct eggsfs_fetch_block_request* eggsfs_fetch_block(
    struct eggsfs_block_socket* socket,
    u64 block_service_id,
    u64 block_id,
    u32 offset,
    u32 count
) {
    int err;
    struct eggsfs_fetch_block_request* req = kmem_cache_alloc(eggsfs_fetch_block_request_cachep, GFP_KERNEL);
    if (!req) { err = -ENOMEM; goto out_err; }
    INIT_LIST_HEAD(&req->pages);
    int i;
    // enough pages to hold `count` bytes (last page zero-padded by the receive path)
    for (i = 0; i < (count + PAGE_SIZE - 1)/PAGE_SIZE; i++) {
        struct page* page = alloc_page(GFP_KERNEL);
        if (!page) { err = -ENOMEM; goto out_err_pages; }
        list_add_tail(&page->lru, &req->pages);
    }
    req->bytes_left = count;
    req->page_offset = 0;
    req->header_read = 0;
    req->crc = 0;
    req->err = 0;
    // one for the socket, one for the caller
    atomic_set(&req->refcount, 2);
    // NOTE(review): assumes the slab constructor ran init_completion --
    // reinit_completion on never-initialized memory is invalid. TODO confirm.
    reinit_completion(&req->comp);
    // Queue before sending: the response may arrive immediately after the send.
    spin_lock_bh(&socket->requests_lock);
    list_add_tail(&req->socket_requests, &socket->requests);
    spin_unlock_bh(&socket->requests_lock);
    // fill in req
    char req_msg_buf[EGGSFS_BLOCKS_REQ_HEADER_SIZE + EGGSFS_FETCH_BLOCK_REQ_SIZE];
    char* req_msg = req_msg_buf;
    put_unaligned_le32(EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
    put_unaligned_le64(block_service_id, req_msg); req_msg += 8;
    *(u8*)req_msg = EGGSFS_BLOCKS_FETCH_BLOCK; req_msg += 1;
    {
        struct eggsfs_bincode_put_ctx ctx = {
            .start = req_msg,
            .cursor = req_msg,
            .end = req_msg_buf + sizeof(req_msg_buf),
        };
        eggsfs_fetch_block_req_put_start(&ctx, start);
        eggsfs_fetch_block_req_put_block_id(&ctx, start, req_block_id, block_id);
        eggsfs_fetch_block_req_put_offset(&ctx, req_block_id, req_offset, offset);
        eggsfs_fetch_block_req_put_count(&ctx, req_offset, req_count, count);
        eggsfs_fetch_block_req_put_end(ctx, req_count, end);
        BUG_ON(ctx.cursor != ctx.end);
    }
    // send message
    struct msghdr msg = { NULL, };
    eggsfs_debug_print("sending fetch block req to %pI4:%d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
    struct kvec iov = {
        .iov_base = req_msg_buf,
        .iov_len = sizeof(req_msg_buf),
    };
    err = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
    if (err < 0) { goto out_err_queued; }
    return req;
out_err_queued:
    // The request is already on the socket's list: unlink it before freeing,
    // otherwise the receive path would dereference freed memory.
    spin_lock_bh(&socket->requests_lock);
    list_del(&req->socket_requests);
    spin_unlock_bh(&socket->requests_lock);
out_err_pages:
    put_pages_list(&req->pages);
    kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
out_err:
    // report the local error code; `req` may be NULL or already freed here,
    // so req->err must not be read
    eggsfs_info_print("couldn't start fetch block request, err=%d", err);
    return ERR_PTR(err);
}
// Drops the caller's reference to a fetch request; the pages and the request
// are freed when the last reference goes (the other reference is held by the
// socket's receive path until the request completes).
void eggsfs_put_fetch_block_request(struct eggsfs_fetch_block_request* req) {
    BUG_ON(atomic_read(&req->refcount) < 1);
    if (atomic_dec_and_test(&req->refcount)) {
        put_pages_list(&req->pages);
        kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
    }
}
// Placeholder for detecting a write socket closed mid-request; currently a
// no-op (mirrors eggsfs_fetch_block_state_check).
static void eggsfs_write_block_state_check(struct sock* sk) {
    // TODO check if connection has been closed before its time and error out
    // eggsfs_debug_print("socket state check triggered: %d", sk->sk_state);
}
// sk_state_change callback for write sockets: runs the (currently no-op)
// state check under the callback lock.
static void eggsfs_write_block_state_change(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    eggsfs_write_block_state_check(sk);
    read_unlock_bh(&sk->sk_callback_lock);
}
// Like atomic_set, but also returns the stored value, so the receive path can
// record a status and propagate it in a single expression.
static inline int atomic_set_return(atomic_t *v, int i) {
    atomic_set(v, i);
    return i;
}
// Consumes response bytes for a single write request. Two responses arrive
// back-to-back on the wire: one acknowledging the write request, then --
// once the block has been fully written -- one carrying an 8-byte proof.
// Returns bytes consumed, or a negative error via atomic_set_return/-EIO if
// the socket must be considered corrupted. Partial reads resume via
// req->write_resp_read / req->written_resp_read.
static int eggsfs_write_block_receive_single_req(
    struct eggsfs_block_socket* socket,
    struct eggsfs_write_block_request* req,
    struct sk_buff* skb,
    unsigned int offset, size_t len0
) {
    size_t len = len0;
    // Write resp (after write request)
    // Copies up to `count` bytes into `buf`, advancing offset/len and the
    // write-resp progress counter; evaluates to bytes copied.
#define HEADER_COPY(buf, count) ({ \
    int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
    offset += read; \
    len -= read; \
    req->write_resp_read += read; \
    read; \
})
    // Header
    {
        int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->write_resp_read;
        int read = HEADER_COPY(req->write_resp.buf + req->write_resp_read, header_left);
        header_left -= read;
        // first response incomplete: wait for more data
        if (header_left > 0) { return len0-len; }
    }
    // Protocol check
    if (unlikely(le32_to_cpu(req->write_resp.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
        eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->write_resp.data.protocol);
        return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
    }
    // Error check
    if (unlikely(req->write_resp.data.kind == 0)) {
        int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->write_resp_read;
        int read = HEADER_COPY(req->write_resp.buf + req->write_resp_read, error_left);
        error_left -= read;
        if (error_left > 0) { return len0-len; }
        eggsfs_info_print("failed=%u", req->write_resp.data.error);
        // We immediately start writing, so any error here means that the socket is kaput
        return atomic_set_return(&req->status, eggsfs_error_to_linux(le16_to_cpu(req->write_resp.data.error)));
    }
#undef HEADER_COPY
    // Written resp (after the block has been fully written)
    // Same copy helper, but tracking the second (written) response.
#define HEADER_COPY(buf, count) ({ \
    int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
    offset += read; \
    len -= read; \
    req->written_resp_read += read; \
    read; \
})
    // Header
    {
        int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, header_left);
        header_left -= read;
        if (header_left > 0) {
            return len0-len;
        }
    }
    // Protocol check
    if (unlikely(le32_to_cpu(req->written_resp.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
        eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->written_resp.data.protocol);
        return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
    }
    // Error check
    if (unlikely(req->written_resp.data.kind == 0)) {
        int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, error_left);
        error_left -= read;
        if (error_left > 0) {
            return len0-len;
        }
        eggsfs_info_print("failed=%u", req->written_resp.data.error);
        // We immediately start writing, so any error here means that the socket is kaput
        // TODO here it might actually be fine, but better not risk it
        return atomic_set_return(&req->status, eggsfs_error_to_linux(le16_to_cpu(req->written_resp.data.error)));
    }
    // We can finally get the proof
    {
        int proof_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 8) - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, proof_left);
        proof_left -= read;
        if (proof_left > 0) {
            return len0-len;
        }
    }
#undef HEADER_COPY
    // we can't get here unless we've fully written the block, the channel is broken
    if (smp_load_acquire(&req->bytes_left)) {
        eggsfs_warn_print("unexpected written resp before writing out block");
        return atomic_set_return(&req->status, -EIO);
    }
    // We're good! status > 0 means success.
    atomic_set_return(&req->status, 1);
    return len0-len;
}
// Drains as much of `skb` as possible into the pending write requests of
// `socket`, strictly front-of-queue first. Completed (or failed) requests are
// dequeued, their completion callback is scheduled, and the socket write work
// is kicked so the next request starts being written.
//
// If a socket-level error is set, every queued request is failed with it
// instead of consuming data; the error path in eggsfs_write_block_work relies
// on this by calling us with skb == NULL, len == 0 to drain the queue.
static int eggsfs_write_block_receive_inner(struct eggsfs_block_socket* socket, struct sk_buff* skb, unsigned int offset, size_t len) {
    int initial_len = len;
    int sock_err = atomic_read(&socket->sock_err);
    while (true) {
        // Peek at the head request without removing it.
        spin_lock_bh(&socket->requests_lock);
        struct eggsfs_write_block_request* req = list_empty(&socket->requests) ?
            NULL :
            list_first_entry(&socket->requests, struct eggsfs_write_block_request, socket_requests);
        spin_unlock_bh(&socket->requests_lock);
        if (req == NULL) { break; }
        int consumed = sock_err;
        if (consumed == 0) {
            consumed = eggsfs_write_block_receive_single_req(socket, req, skb, offset, len);
            if (consumed < 0) {
                // Parse failure is treated as a socket error: all remaining
                // requests will be failed with it too.
                sock_err = consumed;
            } else {
                offset += consumed;
                len -= consumed;
            }
        } else {
            atomic_set(&req->status, consumed);
        }
        int status = atomic_read(&req->status);
        if (status != 0) {
            // Request finished (success or error): dequeue and notify issuer.
            spin_lock_bh(&socket->requests_lock);
            list_del(&req->socket_requests);
            spin_unlock_bh(&socket->requests_lock);
            trace_eggsfs_block_write_exit(req->block_service_id, req->block_id, status > 0 ? 0 : status);
            queue_work(eggsfs_wq, req->callback);
            // start writing the next request
            queue_work(eggsfs_wq, &socket->write_work);
        } else {
            break;
        }
    }
    if (sock_err == 0) {
        BUG_ON(len != 0); // can't have leftovers
    }
    return initial_len;
}
// tcp_read_sock actor: forwards received data to the per-socket request queue.
// The socket pointer was stashed in rd_desc->arg.data by the data_ready hook.
static int eggsfs_write_block_receive(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
    struct eggsfs_block_socket* socket = rd_desc->arg.data;
    return eggsfs_write_block_receive_inner(socket, skb, offset, len);
}
// ->sk_data_ready callback for write-block sockets: pulls all currently
// queued data through eggsfs_write_block_receive. Runs in softirq context,
// hence the _bh lock.
static void eggsfs_write_block_data_ready(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    read_descriptor_t rd_desc;
    rd_desc.arg.data = sk->sk_user_data;
    rd_desc.count = 1;
    tcp_read_sock(sk, &rd_desc, eggsfs_write_block_receive);
    // NOTE(review): reuses the fetch-side state check; presumably fetch and
    // write sockets share socket-state handling -- confirm.
    eggsfs_fetch_block_state_check(sk);
    read_unlock_bh(&sk->sk_callback_lock);
}
// ->sk_write_space callback: once the socket can take more data, requeue the
// write work that previously bailed out with -EAGAIN.
static void eggsfs_write_block_write_space(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    struct eggsfs_block_socket* socket = (struct eggsfs_block_socket*)sk->sk_user_data;
    if (sk_stream_is_writeable(sk)) {
        clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        queue_work(eggsfs_wq, &socket->write_work);
    }
    read_unlock_bh(&sk->sk_callback_lock);
}
// Work item that pushes the head request of the socket queue over the wire:
// first the bincode-encoded write-block request header, then the block pages.
// All sends are non-blocking; on -EAGAIN we return/break and rely on
// eggsfs_write_block_write_space to reschedule us.
static void eggsfs_write_block_work(struct work_struct* work) {
    struct eggsfs_block_socket* socket = container_of(work, struct eggsfs_block_socket, write_work);
    // The way this function works means that we will not start writing a request
    // before we've finished reading the proof for the previous one. We could be
    // slightly lower latency by writing ahead, but probably not a big difference.
    spin_lock_bh(&socket->requests_lock);
    struct eggsfs_write_block_request* req = list_empty(&socket->requests) ?
        NULL :
        list_first_entry(&socket->requests, struct eggsfs_write_block_request, socket_requests);
    spin_unlock_bh(&socket->requests_lock);
    if (req == NULL) {
        eggsfs_debug_print("no request to work on");
        return;
    }
    int err;
    // Still writing request header. The header is re-encoded on every pass
    // (it's cheap); only the unsent tail, tracked by req->write_req_written,
    // is handed to the socket.
    while (req->write_req_written < EGGSFS_BLOCKS_REQ_HEADER_SIZE+EGGSFS_WRITE_BLOCK_REQ_SIZE) {
        char req_msg_buf[EGGSFS_BLOCKS_REQ_HEADER_SIZE+EGGSFS_WRITE_BLOCK_REQ_SIZE];
        char* req_msg = req_msg_buf;
        put_unaligned_le32(EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
        put_unaligned_le64(req->block_service_id, req_msg); req_msg += 8;
        *(u8*)req_msg = EGGSFS_BLOCKS_WRITE_BLOCK; req_msg += 1;
        {
            struct eggsfs_bincode_put_ctx ctx = {
                .start = req_msg,
                .cursor = req_msg,
                .end = req_msg_buf + sizeof(req_msg_buf),
            };
            eggsfs_write_block_req_put_start(&ctx, start);
            eggsfs_write_block_req_put_block_id(&ctx, start, req_block_id, req->block_id);
            eggsfs_write_block_req_put_crc(&ctx, req_block_id, req_crc, req->crc);
            eggsfs_write_block_req_put_size(&ctx, req_crc, req_size, req->size);
            eggsfs_write_block_req_put_certificate(&ctx, req_size, req_certificate, req->certificate);
            eggsfs_write_block_req_put_end(ctx, req_certificate, end);
            BUG_ON(ctx.cursor != ctx.end);
        }
        // send message
        struct msghdr msg = {
            .msg_flags = MSG_MORE | MSG_DONTWAIT,
        };
        eggsfs_debug_print("sending write block req to %pI4:%d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
        struct kvec iov = {
            .iov_base = req_msg_buf + req->write_req_written,
            .iov_len = sizeof(req_msg_buf) - req->write_req_written,
        };
        int sent = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
        if (sent == -EAGAIN) { return; } // done for now, will be rescheduled by `sk_write_space`
        if (sent < 0) { err = sent; goto out_err; }
        req->write_req_written += sent;
    }
    // Write the pages out, walking the singly-linked page list (next page in
    // ->private).
    u32 bytes_left = req->bytes_left;
    while (bytes_left > 0) {
        struct page* page = req->page;
        int sent = kernel_sendpage(
            socket->sock, page,
            req->page_offset,
            min((u32)(PAGE_SIZE - req->page_offset), bytes_left),
            MSG_DONTWAIT
        );
        if (sent == -EAGAIN) {
            // done for now, will be rescheduled by `sk_write_space`
            break;
        }
        if (sent < 0) { err = sent; goto out_err; }
        req->page_offset += sent;
        bytes_left -= sent;
        if (req->page_offset >= PAGE_SIZE) {
            BUG_ON(req->page_offset != PAGE_SIZE);
            req->page_offset = 0;
            req->page = (struct page*)req->page->private;
        }
    }
    // Publish progress; pairs with the smp_load_acquire on req->bytes_left in
    // the receive path, which rejects a "written" response arriving before the
    // whole block went out.
    smp_store_release(&req->bytes_left, bytes_left);
    return;
out_err:
    eggsfs_debug_print("block write failed err=%d", err);
    // NOTE(review): plain read_lock here vs read_lock_bh in the socket
    // callbacks -- presumably safe since readers don't exclude each other,
    // but confirm softirq-safety.
    read_lock(&socket->sock->sk->sk_callback_lock);
    atomic_set(&socket->sock_err, err);
    // this will just drain the request list erroring them all because of the
    // socket error.
    eggsfs_write_block_receive_inner(socket, NULL, 0, 0);
    read_unlock(&socket->sock->sk->sk_callback_lock);
}
// Gets a socket to the block service and repurposes it for writing by
// installing the write-side socket callbacks and the write work item.
// Returns ERR_PTR on failure.
struct eggsfs_block_socket* eggsfs_get_write_block_socket(struct eggsfs_block_service* block_service) {
    struct eggsfs_block_socket* socket = eggsfs_get_fetch_block_socket(block_service);
    if (IS_ERR(socket)) { return socket; }
    socket->sock->sk->sk_data_ready = eggsfs_write_block_data_ready;
    socket->sock->sk->sk_state_change = eggsfs_write_block_state_change;
    socket->sock->sk->sk_write_space = eggsfs_write_block_write_space;
    INIT_WORK(&socket->write_work, eggsfs_write_block_work);
    return socket;
}
// Releases a socket obtained with eggsfs_get_write_block_socket; write
// sockets share the fetch sockets' lifetime management.
void eggsfs_put_write_block_socket(struct eggsfs_block_socket* socket) {
    eggsfs_put_fetch_block_socket(socket);
}
// Allocates and enqueues a request to write one block, returning immediately.
// `callback` is queued on eggsfs_wq when the request finishes (check
// req->status for the outcome). The actual socket I/O happens in
// eggsfs_write_block_work to preserve request ordering.
// Returns ERR_PTR(-ENOMEM) on allocation failure.
struct eggsfs_write_block_request* eggsfs_write_block(
    struct eggsfs_block_socket* socket,
    struct work_struct* callback,
    // This must match with what you passed into `eggsfs_get_fetch_block_socket`
    u64 block_service_id,
    u64 block_id,
    u64 certificate,
    u32 size,
    u32 crc,
    // There must be enough pages to write everything. The next page should
    // be in ->private (the reasons for not using ->lru are purely because
    // how some other code works out)
    struct page* page
) {
    int err;
    struct eggsfs_write_block_request* req = kmem_cache_alloc(eggsfs_write_block_request_cachep, GFP_KERNEL);
    if (!req) { err = -ENOMEM; goto out_err; }
    // Initialize all cursors/progress fields before publishing the request.
    req->page = page;
    atomic_set(&req->status, 0);
    req->bytes_left = size;
    req->page_offset = 0;
    req->write_resp_read = 0;
    req->write_req_written = 0;
    req->written_resp_read = 0;
    req->block_service_id = block_service_id;
    req->block_id = block_id;
    req->crc = crc;
    req->size = size;
    req->certificate = certificate;
    req->callback = callback;
    // we rely on the proof becoming non-zero to find out when the request is finished
    memset(&req->written_resp.buf, 0, sizeof(req->written_resp.buf));
    spin_lock_bh(&socket->requests_lock);
    list_add_tail(&req->socket_requests, &socket->requests);
    spin_unlock_bh(&socket->requests_lock);
    // the request is filled in by the normal writer, since we need to ensure ordering
    queue_work(eggsfs_wq, &socket->write_work);
    trace_eggsfs_block_write_enter(block_service_id, block_id, size);
    return req;
out_err:
    eggsfs_info_print("couldn't start write block request, err=%d", err);
    return ERR_PTR(err);
}
// Frees a write request. Must not be called before the request completed
// (see header comment in blocksimple.h).
void eggsfs_put_write_block_request(struct eggsfs_write_block_request* req) {
    kmem_cache_free(eggsfs_write_block_request_cachep, req);
}
// Slab constructor: runs once per object when the slab page is created, so
// the completion is initialized exactly once per cached object.
static void eggsfs_init_fetch_block_request_once(void *p) {
    struct eggsfs_fetch_block_request* req = p;
    init_completion(&req->comp);
}
// Module init for the block I/O layer: creates the slab caches for fetch and
// write requests. Returns 0 or -ENOMEM, cleaning up partially created caches.
int __init eggsfs_blocksimple_init(void) {
    int err;
    eggsfs_fetch_block_request_cachep = kmem_cache_create(
        "eggsfs_fetch_block_request_cache",
        sizeof(struct eggsfs_fetch_block_request),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        eggsfs_init_fetch_block_request_once
    );
    if (!eggsfs_fetch_block_request_cachep) { err = -ENOMEM; goto out_err; }
    eggsfs_write_block_request_cachep = kmem_cache_create(
        "eggsfs_write_block_request_cache",
        sizeof(struct eggsfs_write_block_request),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        NULL
    );
    if (!eggsfs_write_block_request_cachep) { err = -ENOMEM; goto out_fetch_request; }
    return 0;
out_fetch_request:
    kmem_cache_destroy(eggsfs_fetch_block_request_cachep);
out_err:
    return err;
}
// Module exit: tears down the request slab caches.
void __cold eggsfs_blocksimple_exit(void) {
    // TODO: handle case where there still are requests in flight.
    kmem_cache_destroy(eggsfs_fetch_block_request_cachep);
    kmem_cache_destroy(eggsfs_write_block_request_cachep);
}
+171
View File
@@ -0,0 +1,171 @@
#ifndef _EGGSFS_BLOCKSIMPLE_H
#define _EGGSFS_BLOCKSIMPLE_H
#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/net.h>
#include <net/tcp.h>
#include "bincode.h"
#define EGGSFS_MAX_BLOCK_SIZE (100 << 20) // 100MiB
#define EGGSFS_BLOCKS_RESP_HEADER_SIZE (4 + 1) // protocol + kind
// Identity and the two advertised addresses of a block service.
struct eggsfs_block_service {
    u64 id;
    u32 ip1;   // first address
    u32 ip2;   // second address
    u16 port1; // port for ip1
    u16 port2; // port for ip2
};
struct eggsfs_block_socket;
struct eggsfs_block_socket* eggsfs_get_fetch_block_socket(struct eggsfs_block_service* block_service);
void eggsfs_put_fetch_block_socket(struct eggsfs_block_socket* socket);
// State for one in-flight "fetch block" request.
struct eggsfs_fetch_block_request {
    // Pages where we write the block. When the request is done everything
    // will be available here. Freeing this is the responsibility of the
    // caller, who must wait for the request to be completed before doing so.
    struct list_head pages;
    // Completes when the block is fetched and all pages have been filled
    // in.
    struct completion comp;
    // Stores the error if something went wrong (to be checked by the caller before
    // inspecting pages or crc).
    int err;
    // CRC of the body
    u32 crc;
    // Below fields are private.
    // List node for list of requests enqueued for a given socket.
    struct list_head socket_requests;
    // The caller and blocksimple.c both need to make sure the request is still there
    // when they need it. We need such a thing since we sometimes send fetch requests
    // from contexts we want to be killable (i.e. syscalls). So in those cases we
    // need to drop them on the floor.
    atomic_t refcount;
    // How many bytes we have left to read.
    u32 bytes_left;
    // The cursor inside the current page.
    u16 page_offset;
    // How much we've read of the header.
    u8 header_read;
    // The fetch response carries no fixed body, so the header buffer only
    // needs room for protocol + kind plus a possible 2-byte error code.
    static_assert(EGGSFS_FETCH_BLOCK_RESP_SIZE == 0);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2];
        struct {
            __le32 protocol;
            u8 kind;
            __le16 error; // only meaningful when kind == 0 (error response)
        } __attribute__((packed)) data;
    } header;
};
// Might return errors.
struct eggsfs_fetch_block_request* eggsfs_fetch_block(
struct eggsfs_block_socket* socket,
// This must match with what you passed into `eggsfs_get_fetch_block_socket`
u64 block_service_id,
u64 block_id,
u32 offset,
u32 count
);
// Must be called when you're done with the request. It's OK to call
// this before the request has finished. If the caller has successfully waited for
// the request to complete, it's OK to take ownership of the pages by removing them
// from the list.
void eggsfs_put_fetch_block_request(struct eggsfs_fetch_block_request* req);
struct eggsfs_block_socket* eggsfs_get_write_block_socket(struct eggsfs_block_service* block_service);
void eggsfs_put_write_block_socket(struct eggsfs_block_socket* socket);
// State for one in-flight "write block" request. The protocol has two
// responses: one right after the request header (write_resp) and one after
// the whole block has been sent (written_resp, which carries the proof).
struct eggsfs_write_block_request {
    // Called when request is done
    struct work_struct* callback;
    // The current page we're reading from. Singly linked list, next one
    // in ->private. These pages are managed by the issuer of the request.
    struct page* page;
    // List node for list of requests enqueued for a given socket.
    struct list_head socket_requests;
    // Req info
    u64 block_service_id;
    u64 block_id;
    u32 crc;
    u32 size;
    u64 certificate;
    // * 0: not done yet
    // * 1: done
    // * -n: errored
    atomic_t status;
    u32 bytes_left; // how many bytes left to write
    u16 page_offset; // offset into page we're reading from
    // How much we've written of the write request
    u8 write_req_written;
    // How much we've read of the first response (after we ask to write), and of the
    // second (after we've written the block).
    u8 write_resp_read;
    static_assert(EGGSFS_WRITE_BLOCK_RESP_SIZE == 0);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2];
        struct {
            __le32 protocol;
            u8 kind;
            __le16 error; // only meaningful when kind == 0 (error response)
        } __attribute__((packed)) data;
    } write_resp;
    // Cursor into written_resp.buf.
    u8 written_resp_read;
    static_assert(EGGSFS_BLOCK_WRITTEN_RESP_SIZE > 2);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE+EGGSFS_BLOCK_WRITTEN_RESP_SIZE];
        struct {
            __le32 protocol;
            u8 kind;
            union {
                __le16 error;  // kind == 0
                __be64 proof;  // kind != 0
            };
        } __attribute__((packed)) data;
    } written_resp;
};
// Unlike fetching, here you _must_ wait for the request to be complete. This is since
// we must know that nothing might use the pages passed in before freeing them.
struct eggsfs_write_block_request* eggsfs_write_block(
struct eggsfs_block_socket* socket,
struct work_struct* callback,
// This must match with what you passed into `eggsfs_get_fetch_block_socket`
u64 block_service_id,
u64 block_id,
u64 certificate,
u32 size,
u32 crc,
// There must be enough pages to write everything. The next page should
// be in ->private (the reasons for not using ->lru are purely because
// the code that actually uses this is easier to write this way).
struct page* page
);
// Must be called when you're done with the request. Can't be called
// before the request is completed.
void eggsfs_put_write_block_request(struct eggsfs_write_block_request* req);
int __init eggsfs_blocksimple_init(void);
void __cold eggsfs_blocksimple_exit(void);
#endif
Executable
+7
View File
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Sync the source tree to the dev box "uovo" and build the kernel module and
# the writefile tool there.
set -eu -o pipefail
set -x
./sync.sh
ssh uovo 'cd eggs-kmod && make -j kmod writefile'
+15
View File
@@ -0,0 +1,15 @@
#ifndef _EGGSFS_COMMON_H
#define _EGGSFS_COMMON_H
// Maximum filename length accepted by eggsfs.
#define EGGSFS_MAX_FILENAME 255
// Logging helpers: all prepend "eggsfs: <function>: " and a trailing newline.
//#define eggsfs_debug_print(fmt, args...) trace_printk(fmt "\n", ##args)
#define eggsfs_warn_print(fmt, args...) printk(KERN_WARNING "eggsfs: %s: " fmt "\n", __func__, ##args)
#define eggsfs_info_print(fmt, args...) printk(KERN_INFO "eggsfs: %s: " fmt "\n", __func__, ##args)
// Debug printing is compiled out by default; swap in one of the commented
// variants (trace_printk above, printk below) to enable it.
#define eggsfs_debug_print(fmt, args...) do {} while(0)
// #define eggsfs_debug_print(fmt, args...) printk(KERN_DEBUG "eggsfs: %s: " fmt "\n", __func__, ##args)
// Shared workqueue used for deferred eggsfs work.
extern struct workqueue_struct* eggsfs_wq;
#endif
+22
View File
@@ -0,0 +1,22 @@
#ifndef _EGGSFS_COUNTER_H
#define _EGGSFS_COUNTER_H
#include <linux/compiler.h>
#include <linux/percpu.h>
// Cheap per-CPU u64 event counters: increments are per-CPU and non-atomic,
// reads sum over all possible CPUs (approximate under concurrent updates).
#define EGGSFS_DECLARE_COUNTER(_name) DECLARE_PER_CPU(u64, _name)
#define EGGSFS_DEFINE_COUNTER(_name) DEFINE_PER_CPU(u64, _name)
#define eggsfs_counter_inc(_counter) raw_cpu_inc(_counter)
// NOTE(review): raw_cpu_inc takes the per-CPU variable itself while
// per_cpu_ptr takes a pointer to it -- callers presumably pass the variable
// to _inc and its address to _get; confirm against call sites.
#define eggsfs_counter_get(_counter) ({ \
    u64 total = 0; \
    int cpu; \
    for_each_possible_cpu(cpu) { \
        total += *per_cpu_ptr(_counter, cpu); \
    } \
    total; \
})
#endif
+26
View File
@@ -0,0 +1,26 @@
#include "crc.h"
#include "intrshims.h"
#include "common.h"
#include "crc32c.c"
// Thin exported wrappers around the crc32c implementation textually included
// above (crc32c.c, shared with the C++ codebase).

// Accelerated crc32c; caller must wrap with kernel_fpu_begin/end (see crc.h).
u32 eggsfs_crc32c(u32 crc, const char* buf, size_t len) {
    return crc32c(crc, buf, len);
}
// Plain crc32c, no FPU requirements.
u32 eggsfs_crc32c_simple(u32 crc, const char* buf, size_t len) {
    return crc32c_simple(crc, buf, len);
}
// XOR-combine two CRCs over same-length data.
u32 eggsfs_crc32c_xor(u32 crc1, u32 crc2, size_t len) {
    return crc32c_xor(crc1, crc2, len);
}
// CRC of the concatenation, given crc1 and crc2 of a len2-byte tail.
u32 eggsfs_crc32c_append(u32 crc1, u32 crc2, size_t len2) {
    return crc32c_append(crc1, crc2, len2);
}
// CRC after extending the data with `zeros` zero bytes.
u32 eggsfs_crc32c_zero_extend(u32 crc, ssize_t zeros) {
    return crc32c_zero_extend(crc, zeros);
}
+20
View File
@@ -0,0 +1,20 @@
#ifndef _EGGSFS_CRC_H
#define _EGGSFS_CRC_H
#include <linux/kernel.h>
// You _must_ wrap this with kernel_fpu_begin/kernel_fpu_end!
// It's not required for the other functions.
u32 eggsfs_crc32c(u32 crc, const char* buf, size_t len);
// The only difference with `eggsfs_crc32c` is that kernel_fpu_begin/kernel_fpu_end
// are not required for this one.
u32 eggsfs_crc32c_simple(u32 crc, const char* buf, size_t len);
u32 eggsfs_crc32c_xor(u32 crc1, u32 crc2, size_t len);
u32 eggsfs_crc32c_append(u32 crc1, u32 crc2, size_t len2);
u32 eggsfs_crc32c_zero_extend(u32 crc, ssize_t zeros);
#endif
+1
View File
@@ -0,0 +1 @@
../cpp/crc32c/crc32c.c
+429
View File
@@ -0,0 +1,429 @@
#include "dir.h"
#include <linux/mm.h>
#include <linux/slab.h>
#include "common.h"
#include "err.h"
#include "inode.h"
#include "metadata.h"
#include "trace.h"
// sysctls
int eggsfs_dir_refresh_time; // in jiffies
// The second u32 of page->private stores the number of directory entries in
// the page; the first u32 is used as an atomic refcount for the whole cache
// (see eggsfs_dir_get_cache/eggsfs_dir_put_cache).
#define eggsfs_dir_get_page_n(_page) ({ *((((u32*)&(_page)->private))+1); })
#define eggsfs_dir_set_page_n(_page, _n) ({ *((((u32*)&(_page)->private))+1) = _n; })
// On-disk/in-cache size of one serialized directory entry: an 8-byte inode
// number, a 1-byte name length, then the name bytes themselves.
static inline int eggsfs_dir_entry_size(int name_len) {
    const int ino_bytes = 8;
    const int name_len_bytes = 1;
    return ino_bytes + name_len_bytes + name_len;
}
// Parses one serialized directory entry at `buf` into inode number, name
// length, and a pointer into the buffer where the (non-NUL-terminated) name
// starts. Layout must match eggsfs_dir_entry_size.
static inline void eggsfs_dir_entry_parse(const char* buf, u64* ino, u8* name_len, const char** name) {
    *ino = get_unaligned_le64(buf); buf += 8;
    *name_len = *(u8*)buf; buf++;
    *name = buf; buf += *name_len;
}
// Per-open-file readdir cursor over the cached page chain.
struct eggsfs_readdir_ctx {
    struct page* current_page; // page we're currently emitting from
    struct page* pages;        // head of the chain (holds our cache ref)
    u32 page_offset;           // byte offset of the next entry in current_page
    u32 page_n;                // entries already emitted from current_page
};
// Increments the refcount of the cache in ->private under RCU.
// After this we know the page won't be deallocated until we
// decrease the refcount. Returns NULL if no cache is installed.
static struct page* eggsfs_dir_get_cache(struct eggsfs_inode* enode) {
    struct page* page;
    rcu_read_lock();
    page = rcu_dereference(enode->dir.pages);
    // The first u32 of page->private is the cache refcount.
    if (page) { atomic_inc((atomic_t*)&page->private); }
    rcu_read_unlock();
    return page;
}
// Decrements the refcount, and frees up all the pages in the
// cache if we were the last ones. Must be called by
// somebody who called eggsfs_dir_get_cache before.
//
// Note that ->current_page and ->dir_pages are both holding a
// reference, so once you clear those you should call `eggsfs_dir_put_cache`.
static void eggsfs_dir_put_cache(struct page* page) {
    if (atomic_dec_and_test((atomic_t*)&page->private)) {
        // Last reference: walk the chain (next page in ->mapping), reset the
        // fields we abused, and return all pages at once.
        LIST_HEAD(pages);
        while (page) {
            struct page* next = (struct page*)page->mapping;
            page->mapping = NULL;
            page->private = 0;
            list_add_tail(&page->lru, &pages);
            page = next;
        }
        put_pages_list(&pages);
    }
}
// RCU callback: drops the ->dir_pages reference once all RCU readers that
// might still see the old pointer are done.
static void __eggsfs_dir_put_cache_rcu(struct rcu_head* rcu) {
    struct page* page = container_of(rcu, struct page, rcu_head);
    eggsfs_dir_put_cache(page);
}
// Removes the cache from ->dir_pages, and also calls
// `eggsfs_dir_put_cache` (via RCU grace period) to clear the reference.
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode) {
    struct page* page;
    // We reuse page->rcu_head where ->lru normally lives; make sure it fits.
    static_assert(sizeof(struct rcu_head) == sizeof(struct list_head));
    // Cheap check to avoid the xchg when there's nothing to drop.
    if (!atomic64_read((atomic64_t*)&enode->dir.pages)) { return; }
    // Atomically detach the chain so only one caller frees it.
    page = (struct page*)atomic64_xchg((atomic64_t*)&enode->dir.pages, (u64)0);
    if (page) { call_rcu(&page->rcu_head, __eggsfs_dir_put_cache_rcu); }
}
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_seqno);
// ->open for directories: revalidate attributes if stale, then make sure a
// directory-listing page cache matching the current mtime exists, and hand
// the opener a readdir cursor (filp->private_data) holding a reference to it.
static int eggsfs_dir_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_readdir_ctx* ctx;
    struct page* page;
    int err;
    trace_eggsfs_vfs_opendir_enter(inode);
    if (get_jiffies_64() >= enode->mtime_expiry) {
        err = eggsfs_dir_revalidate(enode);
        if (err) { return err; }
    }
    ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
    if (!ctx) { return -ENOMEM; }
again: // progress: whoever wins the lock either get pages or fails
    // A cache built for a different mtime (stored in first page ->index) is
    // stale: drop it and rebuild.
    page = eggsfs_dir_get_cache(enode);
    if (page && page->index != READ_ONCE(enode->mtime)) {
        eggsfs_dir_put_cache(page);
        eggsfs_dir_drop_cache(enode);
        page = NULL;
    }
    if (!page) {
        int seqno;
        // Only one task rebuilds at a time; losers wait and retry.
        if (!eggsfs_latch_try_acquire(&enode->dir.pages_latch, seqno)) {
            err = eggsfs_latch_wait_killable(&enode->dir.pages_latch, seqno);
            if (err) { goto out_ctx; }
            goto again;
        }
        // If somebody else got to creating the page already, drop it,
        // otherwise we'll leak.
        page = eggsfs_dir_get_cache(enode);
        if (page && page->index != READ_ONCE(enode->mtime)) {
            eggsfs_dir_put_cache(page);
            eggsfs_dir_drop_cache(enode);
            page = NULL;
        }
        if (!page) {
            u64 dir_mtime = READ_ONCE(enode->mtime);
            page = eggsfs_dir_read_all(filp->f_path.dentry, dir_mtime);
            if (!IS_ERR(page)) {
                // If the mtime matches (which will be the case unless somebody
                // else revalidated between the READ_ONCE above), then the refcount
                // starts at two: one for the ->dir_pages, and one for the
                // ->current_page below. Otherwise only for ->current_page (basically
                // we have these pages to be private to this read).
                if (likely(READ_ONCE(enode->mtime) == dir_mtime)) {
                    atomic_set((atomic_t*)&page->private, 2);
                    rcu_assign_pointer(enode->dir.pages, page);
                } else {
                    atomic_set((atomic_t*)&page->private, 1);
                }
            }
        }
        eggsfs_latch_release(&enode->dir.pages_latch, seqno);
    }
    if (IS_ERR(page)) { err = PTR_ERR(page); goto out_ctx; }
    // Cursor starts at the beginning of the chain.
    ctx->current_page = page;
    ctx->pages = page;
    ctx->page_offset = 0;
    ctx->page_n = 0;
    filp->private_data = ctx;
    trace_eggsfs_vfs_opendir_exit(inode, 0);
    return 0;
out_ctx:
    kfree(ctx);
    trace_eggsfs_vfs_opendir_exit(inode, err);
    eggsfs_debug_print("err=%d", err);
    return err;
}
// Inserts/refreshes the dcache entry for `name` under `parent`, pointing at
// inode `ino`. `dir_seqno` (the directory mtime) is stored in d_time so that
// eggsfs_dir_needs_reval can tell stale dentries apart. Failures are
// swallowed on purpose: the dcache is only a cache, and a missed update just
// means a slower lookup later.
// (Fix: dropped the stray `;` that followed the function body.)
static void update_dcache(struct dentry* parent, u64 dir_seqno, const char* name, int name_len, u64 ino) {
    struct qstr filename = QSTR_INIT(name, name_len);
    struct dentry* dentry;
    struct dentry* alias;
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
    eggsfs_debug_print("parent=%pd name=%*pE ino=%llx", parent, name_len, name, ino);
    filename.hash = full_name_hash(parent, filename.name, filename.len);
    dentry = d_lookup(parent, &filename);
    if (!dentry) {
again:
        dentry = d_alloc_parallel(parent, &filename, &wq);
        if (IS_ERR(dentry)) { return; }
    }
    if (!d_in_lookup(dentry)) {
        // pre-existing entry: if it already points at the right inode just
        // refresh d_time, otherwise invalidate it and retry with a fresh one.
        struct inode* old_inode = d_inode(dentry);
        if (old_inode && old_inode->i_ino == ino) {
            WRITE_ONCE(dentry->d_time, dir_seqno);
            goto out;
        }
        d_invalidate(dentry);
        dput(dentry);
        goto again;
    } else {
        // new entry
        struct inode* inode = eggsfs_get_inode(parent->d_sb, ino);
        // d_splice_alias propagates error in inode
        WRITE_ONCE(dentry->d_time, dir_seqno);
        alias = d_splice_alias(inode, dentry);
        d_lookup_done(dentry);
        if (alias) {
            dput(dentry);
            dentry = alias;
        }
        if (IS_ERR(dentry)) { return; }
    }
out:
    dput(dentry);
}
// State used while filling the directory page cache from shard responses.
struct eggsfs_dir_fill_ctx {
    struct dentry* parent; // directory being listed (for dcache updates)
    u64 dir_mtime;         // mtime the listing corresponds to
    // NULL or kmap'ed at page_addr
    struct page* current_page;
    // kmap'd address of current_page
    char* page_addr;
    // Current offset inside the page we're writing
    u32 page_off;
    // Number of eggsfs dir entries in the current page
    u32 page_n;
};
// Appends one serialized entry to the page being filled, starting a fresh
// page (linked through ->mapping) when the entry doesn't fit. The previous
// page's entry count is flushed into its ->private on rollover; the last
// page's count is flushed by eggsfs_dir_read_all.
static int eggsfs_dir_add_entry(struct eggsfs_dir_fill_ctx* ctx, const char* name, int name_len, u64 ino) {
    if (name_len > EGGSFS_MAX_FILENAME) { return -ENAMETOOLONG; }
    // Flush current page, and start a new one, linking it in ->mapping
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        struct page* page = alloc_page(GFP_KERNEL);
        if (!page) { return -ENOMEM; }
        kunmap(ctx->current_page);
        ctx->current_page->mapping = (void*)page;
        eggsfs_dir_set_page_n(ctx->current_page, ctx->page_n);
        ctx->current_page = page;
        ctx->page_addr = kmap(page);
        ctx->page_off = 0;
        ctx->page_n = 0;
    }
    // Defensive: with EGGSFS_MAX_FILENAME <= 255 an entry always fits in an
    // empty page, so this should be unreachable.
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        WARN_ON(ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE);
        return -EIO;
    }
    // Serialize: le64 inode, u8 name length, raw name bytes.
    char* entry = ctx->page_addr + ctx->page_off;
    put_unaligned_le64(ino, entry); entry += 8;
    *(u8*)entry = (u8)name_len; entry++;
    memcpy(entry, name, name_len);
    ctx->page_off += eggsfs_dir_entry_size(name_len);
    ++ctx->page_n;
    return 0;
}
// Per-entry callback invoked by eggsfs_shard_readdir: refreshes the dcache
// for the entry, then appends it to the page cache being built.
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 ino) {
    struct eggsfs_dir_fill_ctx* ctx = ptr;
    eggsfs_debug_print("hash=%llx ino=%llx, name_len=%d, name=%*pE", hash, ino, name_len, name_len, name);
    update_dcache(ctx->parent, ctx->dir_mtime, name, name_len, ino);
    return eggsfs_dir_add_entry(ctx, name, name_len, ino);
}
// Reads the whole directory from the shard into a freshly allocated chain of
// pages (next page in ->mapping; first page's ->index records the mtime the
// listing corresponds to). Returns the first page or ERR_PTR.
// NOTE(review): caller presumably must hold enode->dir.pages_latch -- the
// only caller (eggsfs_dir_open) does; confirm before adding new callers.
// (Fix: dropped the stray `;` after the function body, fixed the garbled
// "mut hold" comment.)
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime) {
    bool eof = false;
    int err;
    u64 continuation_key = 0;
    struct inode* inode = d_inode(dentry);
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_dir_fill_ctx ctx = {
        .parent = dentry,
        .dir_mtime = dir_mtime,
    };
    struct page* first_page;
    first_page = alloc_page(GFP_KERNEL);
    if (!first_page) { return ERR_PTR(-ENOMEM); }
    ctx.current_page = first_page;
    ctx.page_addr = kmap(first_page);
    ctx.page_off = 0;
    ctx.page_n = 0;
    first_page->index = dir_mtime;
    // Page through the shard until it reports no continuation key.
    while (!eof) {
        eggsfs_debug_print("cont_key=%llx", continuation_key);
        err = eggsfs_shard_readdir((struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, continuation_key, &ctx, &continuation_key);
        if (err) { err = eggsfs_error_to_linux(err); goto out_err; }
        eof = continuation_key == 0;
    }
    // Flush the last page's entry count (rollovers flushed it for the others).
    kunmap(ctx.current_page);
    eggsfs_dir_set_page_n(ctx.current_page, ctx.page_n);
    return first_page;
out_err:
    kunmap(ctx.current_page);
    // Free the partially built chain.
    while (first_page) {
        struct page* next = (struct page*)first_page->mapping;
        first_page->mapping = NULL;
        put_page(first_page);
        first_page = next;
    }
    eggsfs_info_print("err=%d", err);
    return ERR_PTR(err);
}
// vfs: exclusive inode.i_rwsem
// vfs: mutex file.f_pos_lock
// ->iterate for directories: emits "." / ".." and then the cached entries,
// walking the page chain from the opener's cursor.
// Fix: the original unconditionally kmap'd the next page after exhausting the
// current one; on the last page ->mapping is NULL, so that was kmap(NULL)
// (a NULL page-struct dereference on highmem configs). We now stop before
// mapping a NULL page.
static int eggsfs_dir_read(struct file* filp, struct dir_context* dctx) {
    struct eggsfs_readdir_ctx* ctx = filp->private_data;
    char* page_addr;
    WARN_ON(!ctx);
    if (!ctx) { return -EINVAL; }
    eggsfs_debug_print("ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u", ctx,
        ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n);
    if (!dir_emit_dots(filp, dctx)) { return 0; }
    if (!ctx->current_page || !eggsfs_dir_get_page_n(ctx->current_page)) { return 0; }
    page_addr = kmap(ctx->current_page);
    while (ctx->current_page) {
        int dtype;
        const char* entry = page_addr + ctx->page_offset;
        eggsfs_debug_print("next page_addr=%p ctx.page_offset=%u entry=%p", page_addr, ctx->page_offset, entry);
        u64 ino;
        u8 name_len;
        const char* name;
        eggsfs_dir_entry_parse(entry, &ino, &name_len, &name);
        eggsfs_debug_print("name=%*pE ino=0x%016llx", name_len, name, ino);
        // d_type is recoverable from the inode number's type bits.
        switch (eggsfs_inode_type(ino)) {
        case EGGSFS_INODE_DIRECTORY: dtype = DT_DIR; break;
        case EGGSFS_INODE_FILE: dtype = DT_REG; break;
        case EGGSFS_INODE_SYMLINK: dtype = DT_LNK; break;
        default: dtype = DT_UNKNOWN;
        }
        if (!dir_emit(dctx, name, name_len, ino, dtype)) {
            eggsfs_debug_print("break");
            break;
        }
        ctx->page_offset += eggsfs_dir_entry_size(name_len);
        ++ctx->page_n;
        ++dctx->pos;
        eggsfs_debug_print(
            "next ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u page_n=%u", ctx,
            ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n, eggsfs_dir_get_page_n(ctx->current_page)
        );
        if (ctx->page_n >= eggsfs_dir_get_page_n(ctx->current_page)) {
            kunmap(ctx->current_page);
            ctx->current_page = (struct page*)ctx->current_page->mapping;
            eggsfs_debug_print("next page current_page = %p", ctx->current_page);
            ctx->page_offset = 0;
            ctx->page_n = 0;
            // End of the chain: the last page's ->mapping is NULL. Don't
            // kmap(NULL); the final unmap below is skipped too since
            // current_page is NULL.
            if (!ctx->current_page) { break; }
            page_addr = kmap(ctx->current_page);
        }
    }
    if (ctx->current_page) {
        eggsfs_debug_print("unmap current_page = %p", ctx->current_page);
        kunmap(ctx->current_page);
    }
    eggsfs_debug_print("done");
    return 0;
}
#if 0
// multiple seeks at the same time?
static loff_t eggsfs_llseek_dir(struct file* filp, loff_t offset, int whence) {
int err;
struct eggsfs_readdir_ctx* ctx = filp->private_data;
loff_t dir_offset;
if (whence != SEEK_CUR) { return -EINVAL; }
// concurrency with readdir?
// TODO: if offset == 0 reacquire pages
ctx->current_page = ctx->pages;
ctx->page_offset = 0;
ctx->page_n = 0;
if (offset > 2) {
dir_offset = offset - 2;
while (ctx->current_page && eggsfs_dir_get_page_n(ctx->current_page) <= dir_offset) {
dir_offset -= eggsfs_dir_get_page_n(ctx->current_page);
ctx->current_page = (struct page*)ctx->current_page->mapping;
}
if (ctx->current_page) {
char* page_addr = (char*)kmap(ctx->current_page);
while (ctx->page_n < dir_offset) {
BUG_ON(ctx->page_offset >= PAGE_SIZE);
u64 ino;
u8 name_len;
const char* name;
eggsfs_dir_entry_parse(page_addr + ctx->page_offset, &ino, &name_len, &name);
ctx->page_offset += eggsfs_dir_entry_size(name_len);
++ctx->page_n;
}
kunmap(ctx->current_page);
}
}
filp->f_pos = offset;
return offset;
}
#endif
// ->release for directories: drops the cursor's reference on the page cache
// and frees the cursor allocated in eggsfs_dir_open.
static int eggsfs_dir_close(struct inode* inode, struct file* filp) {
    struct eggsfs_readdir_ctx* ctx = (struct eggsfs_readdir_ctx*)filp->private_data;
    WARN_ON(!ctx);
    if (!ctx) { return -EINVAL; }
    trace_eggsfs_vfs_closedir_enter(inode);
    eggsfs_dir_put_cache(ctx->pages);
    kfree(ctx);
    trace_eggsfs_vfs_closedir_exit(inode, 0);
    return 0;
}
// Directory file operations. No ->llseek: seeking in directories is not
// supported yet (see the #if 0 eggsfs_llseek_dir above).
// NOTE(review): both .iterate and .iterate_shared point at the same function;
// presumably it is safe under a shared i_rwsem since each open file has its
// own cursor -- confirm.
struct file_operations eggsfs_dir_operations = {
    .open = eggsfs_dir_open,
    .iterate = eggsfs_dir_read,
    .iterate_shared = eggsfs_dir_read,
    .release = eggsfs_dir_close,
};
+41
View File
@@ -0,0 +1,41 @@
#ifndef _EGGSFS_DIR_H
#define _EGGSFS_DIR_H
#include <linux/fs.h>
#include "common.h"
#include "inode.h"
extern struct file_operations eggsfs_dir_operations;
extern int eggsfs_dir_refresh_time;
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 ino);
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode);
// Refreshes the directory's attributes (mtime etc.) from the metadata server.
static inline int eggsfs_dir_revalidate(struct eggsfs_inode* enode) {
    return eggsfs_do_getattr(enode);
}
// Decides whether a dentry is still trustworthy by comparing the directory
// mtime it was created under (stored in d_time) with the directory's current
// mtime, and whether the latter itself is still fresh.
static inline int eggsfs_dir_needs_reval(struct eggsfs_inode* dir, struct dentry* dentry) {
    // 0 => invalid
    // -ECHILD => revalidate parent dir
    // 1 => valid
    u64 dentry_dir_mtime = dentry->d_time;
    int ret;
    u64 t = get_jiffies_64();
    if (dentry_dir_mtime != dir->mtime) { ret = 0; }        // dir changed since dentry was cached
    else if (t >= dir->mtime_expiry) { ret = -ECHILD; }     // dir attrs themselves are stale
    else { ret = 1; }
    eggsfs_debug_print(
        "t=%llu dir=%p dir_id=0x%016lx dir_mtime=%llu dir_mtime_expiry=%llu dentry=%pd dentry_dir_mtime=%llu -> ret=%d",
        t, dir, dir->inode.i_ino, dir->mtime, dir->mtime_expiry, dentry, dentry_dir_mtime, ret
    );
    return ret;
}
#endif
+41
View File
@@ -0,0 +1,41 @@
#include "err.h"
// Safe to use with non-error `err`
// Maps an eggsfs protocol error code to a negative Linux errno. Unknown
// codes deliberately fall through to -EIO.
int eggsfs_error_to_linux(int err) {
    if (err == 0) { return 0; }
    switch (err) {
    case EGGSFS_ERR_INTERNAL_ERROR: return -EIO;
    case EGGSFS_ERR_FATAL_ERROR: return -EIO;
    case EGGSFS_ERR_TIMEOUT: return -ETIMEDOUT;
    case EGGSFS_ERR_NOT_AUTHORISED: return -EPERM;
    case EGGSFS_ERR_UNRECOGNIZED_REQUEST: return -EIO;
    case EGGSFS_ERR_FILE_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_NAME_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_TYPE_IS_DIRECTORY: return -EISDIR;
    case EGGSFS_ERR_TYPE_IS_NOT_DIRECTORY: return -ENOTDIR;
    case EGGSFS_ERR_BAD_COOKIE: return -EBADCOOKIE;
    case EGGSFS_ERR_INCONSISTENT_STORAGE_CLASS_PARITY: return -EIO;
    case EGGSFS_ERR_LAST_SPAN_STATE_NOT_CLEAN: return -EIO;
    case EGGSFS_ERR_COULD_NOT_PICK_BLOCK_SERVICES: return -EIO;
    case EGGSFS_ERR_BAD_SPAN_BODY: return -EIO;
    case EGGSFS_ERR_SPAN_NOT_FOUND: return -EIO;
    case EGGSFS_ERR_BLOCK_SERVICE_NOT_FOUND: return -EIO;
    case EGGSFS_ERR_CANNOT_CERTIFY_BLOCKLESS_SPAN: return -EIO;
    case EGGSFS_ERR_BAD_NUMBER_OF_BLOCKS_PROOFS: return -EIO;
    case EGGSFS_ERR_BAD_BLOCK_PROOF: return -EIO;
    case EGGSFS_ERR_CANNOT_OVERRIDE_NAME: return -EEXIST;
    case EGGSFS_ERR_NAME_IS_LOCKED: return -EIO;
    case EGGSFS_ERR_MISMATCHING_TARGET: return -EIO;
    case EGGSFS_ERR_MISMATCHING_OWNER: return -EIO;
    case EGGSFS_ERR_DIRECTORY_NOT_EMPTY: return -ENOTEMPTY;
    case EGGSFS_ERR_FILE_IS_TRANSIENT: return -EIO;
    case EGGSFS_ERR_OLD_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_NEW_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_LOOP_IN_DIRECTORY_RENAME: return -ELOOP;
    case EGGSFS_ERR_MALFORMED_REQUEST: return -EIO;
    case EGGSFS_ERR_MALFORMED_RESPONSE: return -EIO;
    }
    return -EIO;
}
+11
View File
@@ -0,0 +1,11 @@
#ifndef _EGGSFS_ERR_H
#define _EGGSFS_ERR_H

#include <linux/errno.h>
#include "bincode.h"

// Maps an eggsfs protocol error code (EGGSFS_ERR_*) to a negative Linux
// errno. Safe to call with zero (success), which is returned unchanged;
// unknown codes map to -EIO.
int eggsfs_error_to_linux(int err);

#endif
+855
View File
@@ -0,0 +1,855 @@
#include "file.h"
#include <linux/uio.h>
#include "bincode.h"
#include "inode.h"
#include "common.h"
#include "metadata.h"
#include "err.h"
#include "rs.h"
#include "block.h"
#include "skb.h"
#include "crc.h"
#include "trace.h"
#include "span.h"
// open_mutex held here
// really want atomic open for this
static int eggsfs_file_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    eggsfs_debug_print("enode=%p flags=%x owner=%p", enode, enode->flags, current->group_leader);
    // A transient file is still being written out; it cannot be reopened.
    if (enode->flags & EGGSFS_INODE_TRANSIENT) {
        eggsfs_debug_print("trying to open transient file");
        return -ENOENT;
    }
    if (!(filp->f_mode & FMODE_WRITE)) {
        return 0; // read-only open of a settled file, nothing to do
    }
    eggsfs_debug_print("opening file for writing");
    if (!(enode->flags & EGGSFS_INODE_WRITEABLE)) {
        eggsfs_debug_print("trying to open for write non-writeable file");
        return -EPERM;
    }
    // First (and only) writer: consume writeability, file becomes transient
    // until it is flushed and linked.
    enode->flags &= ~EGGSFS_INODE_WRITEABLE;
    enode->flags |= EGGSFS_INODE_TRANSIENT;
    return 0;
}
struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans) {
// The zeroing is important: eg we rely on it in `eggsfs_flush_spans` to detect whether we
// need to compute the parity or not.
struct eggsfs_transient_span* span = kzalloc(sizeof(struct eggsfs_transient_span), GFP_KERNEL);
if (!span) {
return ERR_PTR(-ENOMEM);
}
span->status = EGGSFS_SPAN_STATUS_WRITING;
INIT_LIST_HEAD(&span->pages);
list_add_tail(&span->list, spans);
return span;
}
// Context threaded through `eggsfs_shard_add_span_initiate_block_cb` while a
// span write is being initiated: the file and span being written plus the
// precomputed per-cell CRC matrix (indexed stripe*B + block, see the crc
// accumulation in the callback).
struct initiate_ctx {
    struct eggsfs_inode_file* file;
    struct eggsfs_transient_span* span;
    u32* cell_crcs;
};
// Callback invoked once per block when a span write is initiated: opens a
// socket to the assigned block service, derives the whole-block CRC from the
// per-cell CRCs, and fires off the asynchronous block write.
//
// `data` is a `struct initiate_ctx`. Returns 0 on success, a negative errno
// on failure; requests already started for other blocks are cleaned up later
// via `eggsfs_block_writing_put_reqs`.
int eggsfs_shard_add_span_initiate_block_cb(
    void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate
) {
    struct initiate_ctx* ctx = data;
    struct eggsfs_block_service bs = {
        .id = block_service_id,
        .ip1 = ip1,
        .port1 = port1,
        .ip2 = ip2,
        .port2 = port2,
    };
    struct eggsfs_block_socket* sock = eggsfs_get_write_block_socket(&bs);
    if (IS_ERR(sock)) { return PTR_ERR(sock); }
    ctx->span->block_sockets[block] = sock;
    int B = eggsfs_blocks(ctx->span->parity);
    int cell_size = ctx->span->block_size / ctx->span->stripes;
    // The block CRC is the concatenation (crc-append) of this block's cell
    // CRCs, one per stripe.
    u32 block_crc = 0;
    int s;
    for (s = 0; s < ctx->span->stripes; s++) {
        block_crc = eggsfs_crc32c_append(block_crc, ctx->cell_crcs[s*B + block], cell_size);
    }
    struct eggsfs_write_block_request* req = eggsfs_write_block(
        sock,
        &ctx->file->flusher,
        block_service_id,
        block_id,
        certificate,
        ctx->span->block_size,
        block_crc,
        ctx->span->blocks[block]
    );
    if (IS_ERR(req)) {
        // Clear the slot before putting the socket: the slot was already
        // published above, and leaving it set would make
        // `eggsfs_block_writing_put_reqs` put the socket a second time.
        ctx->span->block_sockets[block] = NULL;
        eggsfs_put_write_block_socket(sock);
        return PTR_ERR(req);
    }
    ctx->span->block_reqs[block] = req;
    return 0;
}
// Checks if the block-writing requests have all finished.
//
// * 0: not finished
// * 1: finished
// * -n: some request has failed with this error.
//
// NB: sockets/requests are NOT released here; callers do that separately
// via `eggsfs_block_writing_put_reqs` once they're done with the requests.
static int eggsfs_block_writing_finished(struct eggsfs_transient_span* span) {
    int status = 1;
    int B = eggsfs_blocks(span->parity);
    int i;
    for (i = 0; i < B; i++) {
        struct eggsfs_write_block_request* req = span->block_reqs[i];
        if (req == NULL) {
            // Slot has no outstanding request (never started, or already
            // put): nothing to wait for.
            eggsfs_debug_print("%d is NULL", i);
            continue;
        }
        // req->status: 0 = in flight, 1 = done, <0 = failed.
        int req_status = atomic_read(&req->status);
        if (req_status < 0) { // error
            eggsfs_warn_print("request for block %d failed: %d", i, req_status);
            status = req_status;
        }
        if (req_status == 0) { // not completed yet
            eggsfs_debug_print("%d not completed yet", i);
            status = status == 1 ? 0 : status; // preserve errors
        }
    }
    return status;
}
// Drops our references to all of `span`'s block-write requests and sockets,
// clearing each slot so the function is safe to call more than once.
static void eggsfs_block_writing_put_reqs(struct eggsfs_transient_span* span) {
    int num_blocks = eggsfs_blocks(span->parity);
    int block;
    for (block = 0; block < num_blocks; block++) {
        if (span->block_reqs[block] != NULL) {
            eggsfs_put_write_block_request(span->block_reqs[block]);
            span->block_reqs[block] = NULL;
        }
        if (span->block_sockets[block] != NULL) {
            eggsfs_put_write_block_socket(span->block_sockets[block]);
            span->block_sockets[block] = NULL;
        }
    }
}
// Allocates one zeroed page for buffered writes, charging it to the owning
// process so it counts towards that process's OOM score. Returns NULL on
// allocation failure.
static struct page* eggsfs_alloc_write_page(struct eggsfs_inode_file* file) {
    // __GFP_ZERO so trailing zeros can be written out to blocks as-is,
    // instead of clearing on demand.
    struct page* page = alloc_page(GFP_KERNEL | __GFP_ZERO);
    if (page != NULL) {
        // this contributes to OOM score
        inc_mm_counter(file->owner->mm, MM_FILEPAGES);
    }
    return page;
}
// Releases a transient span: all its buffered pages (undoing the per-page
// OOM accounting), any outstanding request/socket references, and the span
// struct itself.
static void eggsfs_free_transient_span(struct eggsfs_inode_file* file, struct eggsfs_transient_span* span) {
    int freed_pages = 0;
    while (!list_empty(&span->pages)) {
        struct page* victim = lru_to_page(&span->pages);
        list_del(&victim->lru);
        put_page(victim);
        freed_pages++;
    }
    add_mm_counter(file->owner->mm, MM_FILEPAGES, -freed_pages);
    // Drop request/socket refs (no-op on slots already cleared).
    eggsfs_block_writing_put_reqs(span);
    kfree(span);
}
// Frees up everything after an error. Takes the inode lock.
//
// May exit early if block-write requests are still in flight; in that case
// the work item will be re-queued when a request completes, and this
// function runs again until all spans are gone, at which point it signals
// `done_flushing`.
static void eggsfs_transient_error_cleanup(struct eggsfs_inode* enode) {
    struct eggsfs_inode_file* file = &enode->file;
    if (atomic_read(&enode->file.transient_err) == 0) {
        eggsfs_warn_print("Error cleanup function is running, but the transient file is not errored. This should not be happening.");
        return;
    }
    // We need to take the exclusive lock to ensure nobody else is writing to the last span.
    // We could even release it immediately afterwards, since afterwards everything else
    // will exit early since `enode->file.transient_err` is non-zero.
    inode_lock(&enode->inode);
    bool finished = false;
    struct eggsfs_transient_span* span;
    for (;;) {
        // Pick out first span (which is also the oldest, but doesn't matter anyway)
        spin_lock(&file->transient_spans_lock);
        span = list_first_entry_or_null(&file->transient_spans, struct eggsfs_transient_span, list);
        if (span == NULL) {
            // we've cleared all the spans
            // NB: breaks out with the spinlock held; it is released right
            // after the loop.
            finished = true;
            break;
        }
        if (eggsfs_block_writing_finished(span) == 0) {
            // The requests need to complete first, they rely on the pages still being there
            // (to read from them). We currently do not interrupt the requests mid-writing.
            // Note that we return here even if we aren't done cleaning up, but `eggsfs_flush_transient_span`
            // will be called back into, which will in turn call this function, continuing the cleanup.
            eggsfs_debug_print("requests not done yet, cleanup not finished");
            break;
        }
        // NOTE(review): this runs under the spinlock -- confirm the put
        // functions never sleep.
        eggsfs_block_writing_put_reqs(span);
        // We can destroy this span.
        list_del(&span->list);
        spin_unlock(&file->transient_spans_lock);
        eggsfs_free_transient_span(file, span);
    }
    spin_unlock(&file->transient_spans_lock);
    inode_unlock(&enode->inode);
    if (finished) {
        eggsfs_debug_print("finished cleaning up, we're done flushing");
        up(&file->done_flushing);
    } else {
        eggsfs_debug_print("not finished cleaning up");
    }
}
// Work-queue entry point that flushes the oldest flushable transient span of
// a file. For block-backed spans this runs in (at least) two passes: the
// first pass computes parity + CRCs and initiates the block writes; when the
// block writes complete the work item is queued again, and a later pass
// certifies the span with the shard. On any error the file is put into a
// permanent error state and torn down.
void eggsfs_flush_transient_spans(struct work_struct* work) {
    struct eggsfs_inode_file* file = container_of(work, struct eggsfs_inode_file, flusher);
    struct eggsfs_inode* enode = container_of(file, struct eggsfs_inode, file);
    eggsfs_debug_print("ino=%lu", enode->inode.i_ino);
    // Already errored: just keep tearing things down.
    if (atomic_read(&enode->file.transient_err) != 0) {
        eggsfs_transient_error_cleanup(enode);
        return;
    }
again:
    // Claim the oldest span, but only if it is ready to flush (IDLE) or is
    // the file's final span (LAST).
    spin_lock(&file->transient_spans_lock);
    struct eggsfs_transient_span* span = list_first_entry_or_null(&file->transient_spans, struct eggsfs_transient_span, list);
    bool good_to_go = false;
    if (span) {
        if (span->status == EGGSFS_SPAN_STATUS_IDLE) {
            good_to_go = true;
            span->status = EGGSFS_SPAN_STATUS_FLUSHING;
        }
        if (span->status == EGGSFS_SPAN_STATUS_LAST) {
            good_to_go = true;
            span->status = EGGSFS_SPAN_STATUS_FLUSHING_LAST;
        }
    }
    spin_unlock(&file->transient_spans_lock);
    if (!good_to_go) {
        eggsfs_debug_print("oldest span is not idle, terminating");
        return;
    }
    int err;
    u32* cell_crcs = NULL;
    bool done = false;
    if (span->storage_class == EGGSFS_EMPTY_STORAGE) {
        // Nothing to store for an empty span.
        done = true;
    } else if (span->storage_class == EGGSFS_INLINE_STORAGE) {
        // this is an easy one, just add the inline span
        struct page* page = list_first_entry_or_null(&span->pages, struct page, lru);
        if (page == NULL) {
            err = -ENOMEM;
            goto error;
        }
        char* data = kmap(page);
        eggsfs_debug_print("adding inline span of length %d", span->size);
        err = eggsfs_error_to_linux(eggsfs_shard_add_inline_span(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, enode->file.cookie,
            atomic64_read(&enode->file.flushed_so_far), span->size, data, span->size
        ));
        kunmap(page);
        if (err) { goto error; }
        done = true;
    } else {
        // This means that it's the first time we see this span to flush
        int D = eggsfs_data_blocks(span->parity);
        int B = eggsfs_blocks(span->parity);
        int S = span->stripes;
        eggsfs_debug_print("size=%d, D=%d, B=%d, S=%d, status=%d", span->size, D, B, S, span->status);
        int i;
        if (span->block_reqs[0] != NULL) {
            // If we have already the sockets/requests, it being callback'd here means
            // that one of the request has finished. Let's check whether they all have.
            int reqs_status = eggsfs_block_writing_finished(span);
            if (reqs_status == 1) { // finished
                // All blocks written: collect the proofs and certify the
                // span with the shard.
                u64 block_ids[EGGSFS_MAX_BLOCKS];
                u64 block_proofs[EGGSFS_MAX_BLOCKS];
                for (i = 0; i < B; i++) {
                    struct eggsfs_write_block_request* req = span->block_reqs[i];
                    block_ids[i] = req->block_id;
                    block_proofs[i] = be64_to_cpu(req->written_resp.data.proof);
                }
                eggsfs_block_writing_put_reqs(span); // we got our proof
                err = eggsfs_error_to_linux(eggsfs_shard_add_span_certify(
                    (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                    enode->inode.i_ino, file->cookie, atomic64_read(&file->flushed_so_far), span->parity,
                    block_ids, block_proofs
                ));
                if (err) { goto error; }
                done = true;
            } else if (reqs_status < 0) { // errored
                eggsfs_block_writing_put_reqs(span);
                err = reqs_status;
                goto error;
            }
            // reqs_status == 0: still in flight, leave `done` false and wait
            // to be queued again.
        } else {
            trace_eggsfs_span_flush_enter(
                enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, span->parity, span->stripes, span->storage_class
            );
            // Below, remember that cell size is always a multiple of PAGE_SIZE.
            {
                // Pad the span out to a whole number of data blocks with
                // zero pages.
                int data_size;
                for (data_size = span->block_size * D; data_size > span->size; data_size -= PAGE_SIZE) {
                    // This can happen if we have trailing zeros because of how we arrange the
                    // stripes.
                    struct page* zpage = eggsfs_alloc_write_page(file);
                    if (zpage == NULL) {
                        err = -ENOMEM;
                        goto error;
                    }
                    list_add_tail(&zpage->lru, &span->pages);
                }
            }
            struct page* curr_pages[EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY]; // buffer we use at various points
            memset(curr_pages, 0, sizeof(curr_pages));
            int cell_size = span->block_size / S;
            // First, we arrange the data blocks in singly linked lists in `span->blocks`.
            // (page->private is reused as the "next page in block" pointer.)
            {
                int cell_offset = 0;
                int block = 0;
                struct page* page;
                list_for_each_entry(page, &span->pages, lru) {
                    page->private = 0;
                    if (curr_pages[block] == NULL) { // first
                        span->blocks[block] = page;
                    } else {
                        curr_pages[block]->private = (unsigned long)page;
                    }
                    curr_pages[block] = page;
                    cell_offset += PAGE_SIZE;
                    if (cell_offset >= cell_size) {
                        // Next cell goes to the next data block, wrapping.
                        block++;
                        block = block % D;
                        cell_offset = 0;
                    }
                }
            }
            // Then, we compute the parity blocks.
            // We also compute the cell CRCs at this stage.
            cell_crcs = kzalloc(sizeof(uint32_t)*B*span->stripes, GFP_KERNEL);
            if (cell_crcs == NULL) { err = -ENOMEM; goto error; }
            u32 span_crc = 0;
            {
                char* pages_bufs[EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY];
                memset(curr_pages, 0, sizeof(curr_pages));
                kernel_fpu_begin();
                // traverse each block, page by page, keeping track of which cell we're in.
                u32 block_offset;
                for (block_offset = 0; block_offset < span->block_size; block_offset += PAGE_SIZE) {
                    for (i = 0; i < D; i++) { // grab next data page for each block
                        if (block_offset == 0) { // first page
                            curr_pages[i] = span->blocks[i];
                        } else {
                            curr_pages[i] = (struct page*)curr_pages[i]->private;
                        }
                    }
                    for (i = D; i < B; i++) { // allocate parity pages
                        struct page* ppage = eggsfs_alloc_write_page(file);
                        if (ppage == NULL) { err = -ENOMEM; goto kernel_fpu_error; }
                        list_add_tail(&ppage->lru, &span->pages);
                        if (curr_pages[i] == NULL) {
                            span->blocks[i] = ppage;
                        } else {
                            curr_pages[i]->private = (unsigned long)ppage;
                        }
                        curr_pages[i] = ppage;
                    }
                    // get buffer pointers
                    int s = block_offset / cell_size;
                    for (i = 0; i < B; i++) {
                        pages_bufs[i] = kmap(curr_pages[i]);
                    }
                    // compute parity
                    // NOTE(review): on failure we jump out with all B pages
                    // still kmapped -- confirm whether the kmaps leak here.
                    err = eggsfs_compute_parity(span->parity, PAGE_SIZE, (const char**)&pages_bufs[0], &pages_bufs[D]);
                    if (err) { goto kernel_fpu_error; }
                    // compute CRCs
                    for (i = 0; i < B; i++) {
                        cell_crcs[s*B + i] = eggsfs_crc32c(cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
                    }
                    for (i = 0; i < B; i++) {
                        kunmap(curr_pages[i]);
                    }
                }
                // Compute overall span crc
                int s;
                for (s = 0; s < S; s++) {
                    for (i = 0; i < D; i++) {
                        span_crc = eggsfs_crc32c_append(span_crc, cell_crcs[s*B + i], cell_size);
                    }
                }
                // Second argument is negative here (padded size >= span
                // size); presumably zero_extend truncates in that case --
                // TODO confirm against the crc helpers.
                span_crc = eggsfs_crc32c_zero_extend(span_crc, (int)span->size - (int)(span->block_size*D));
                kernel_fpu_end();
            }
            // Now we need to init the span (the callback will actually send the requests)
            // TODO would be better to have this async, and free up the queue for other stuff.
            {
                struct initiate_ctx ctx = {
                    .file = file,
                    .span = span,
                    .cell_crcs = cell_crcs,
                };
                err = eggsfs_error_to_linux(eggsfs_shard_add_span_initiate(
                    (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                    &ctx, enode->inode.i_ino, file->cookie, atomic64_read(&file->flushed_so_far), span->size,
                    span_crc, span->storage_class, span->parity, span->stripes, cell_size, cell_crcs
                ));
                if (err) { goto error; }
            }
            kfree(cell_crcs);
            // Now we wait.
            eggsfs_debug_print("sent all block writing requests");
        }
    }
    if (done) {
        // Free everything
        spin_lock(&file->transient_spans_lock);
        list_del(&span->list);
        spin_unlock(&file->transient_spans_lock);
        // safe to do this without locks since there's only ever one
        // work queue working on any given span
        trace_eggsfs_span_flush_exit(enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, 0);
        eggsfs_debug_print("finished span at offset %llu", atomic64_read(&file->flushed_so_far));
        atomic64_add(span->size, &enode->file.flushed_so_far);
        bool all_done = span->status == EGGSFS_SPAN_STATUS_FLUSHING_LAST;
        eggsfs_free_transient_span(file, span);
        up(&file->in_flight_spans);
        if (all_done) {
            eggsfs_debug_print("finished");
            up(&file->done_flushing);
        } else {
            eggsfs_debug_print("going to next span");
            goto again;
        }
    } else {
        eggsfs_debug_print("span not done");
        // restore status so that we can keep working on this
        spin_lock(&file->transient_spans_lock);
        if (span->status == EGGSFS_SPAN_STATUS_FLUSHING) {
            span->status = EGGSFS_SPAN_STATUS_IDLE;
        } else if (span->status == EGGSFS_SPAN_STATUS_FLUSHING_LAST) {
            span->status = EGGSFS_SPAN_STATUS_LAST;
        } else {
            BUG();
        }
        spin_unlock(&file->transient_spans_lock);
    }
    return;
kernel_fpu_error:
    kernel_fpu_end();
error:
    // Record the error permanently and start tearing the file down.
    eggsfs_info_print("transient span flushing failed with err=%d", err);
    trace_eggsfs_span_flush_exit(enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, err);
    atomic_set(&enode->file.transient_err, err);
    kfree(cell_crcs);
    eggsfs_transient_error_cleanup(enode);
}
// Runs in the transient_spans_lock spinlock.
//
// Fills in `span`'s layout parameters (storage class, parity, stripe count,
// block size) for a span of `span_size` bytes, according to the directory's
// block/span policies. Spans under 256 bytes are stored inline (or empty).
static void eggsfs_compute_spans_parameters(
    struct eggsfs_block_policies* block_policies,
    struct eggsfs_span_policies* span_policies,
    u32 target_stripe_size,
    struct eggsfs_transient_span* span,
    u32 span_size
) {
    eggsfs_debug_print("span_size=%u", span_size);
    span->size = span_size;
    // Easy case: inline span (or empty, can happen for empty files)
    if (span_size < 256) {
        span->storage_class = span_size > 0 ? EGGSFS_INLINE_STORAGE : EGGSFS_EMPTY_STORAGE;
        span->block_size = 0;
        span->stripes = 0;
        span->parity = 0;
        return;
    }
    // Pick parity: walk the policies from second-largest downwards and pick
    // the smallest policy whose max_size still fits span_size (policies
    // appear to be sorted by max_size -- TODO confirm against the policy
    // encoding in bincode.h).
    int num_span_policies = eggsfs_span_policies_len(span_policies);
    BUG_ON(num_span_policies == 0);
    int i;
    u8 parity = 0;
    for (i = num_span_policies - 2; i >= 0; i--) {
        u32 max_size;
        u8 this_parity;
        eggsfs_span_policies_get(span_policies, i, &max_size, &this_parity);
        if (span_size > max_size) {
            i++;
            eggsfs_span_policies_get(span_policies, i, &max_size, &parity);
            break;
        }
    }
    if (parity == 0) {
        // No policy matched above: fall back to the first (smallest) one.
        i = 0;
        u32 max_size;
        eggsfs_span_policies_get(span_policies, i, &max_size, &parity);
        BUG_ON(span_size > max_size);
    }
    // For simplicity, we have all cells to be a multiple of PAGE_SIZE, which means that all
    // blocks/stripes/spans are multiples of PAGE_SIZE. We might also have some extra pages
    // at the end to have things to line up correctly. This should only happens for big spans
    // anyway (small spans will just use mirroring).
    int S;
    if (eggsfs_data_blocks(parity) == 1) {
        // If we only have one data block, things are also pretty simple (just mirroring).
        // It doesn't make sense to have stripes in this case.
        S = 1;
    } else {
        // Otherwise compute striping etc.
        // We use target_stripe_size as an upper bound, for now (i.e. the stripe will always be smaller
        // than TargetStripeSize)
        S = min(max((u32)1, span_size / target_stripe_size), (u32)15);
    }
    int D = eggsfs_data_blocks(parity);
    int block_size = (span_size + D - 1) / D;
    int cell_size = (block_size + S - 1) / S;
    // Round up to cell to page size
    // (PAGE_MASK is ~(PAGE_SIZE-1), so this is the usual round-up idiom.)
    cell_size = (cell_size + PAGE_SIZE - 1) & PAGE_MASK;
    block_size = cell_size * S;
    // Pick storage class: smallest policy whose min_size the block size
    // exceeds (falls back to the first policy if none matches).
    BUG_ON(span_size > EGGSFS_MAX_BLOCK_SIZE);
    int num_block_policies = eggsfs_block_policies_len(block_policies);
    BUG_ON(num_block_policies == 0);
    u8 storage_class;
    for (i = num_block_policies-1; i >= 0; i--) {
        u32 min_size;
        eggsfs_block_policies_get(block_policies, i, &storage_class, &min_size);
        if (block_size > min_size) {
            break;
        }
    }
    span->storage_class = storage_class;
    span->block_size = block_size;
    span->stripes = S;
    span->parity = parity;
}
// Buffered, strictly-sequential append to a transient file: copies `from`
// into the current transient span's pages, and whenever the span reaches the
// policy's maximum size, seals it and queues it for flushing. Writes at any
// offset other than EOF return -EINVAL; O_DIRECT returns -ENOSYS.
//
// Fixes vs the original: (1) `down_killable`'s result was assigned to a
// shadowing `int err`, so the subsequent `goto out_err` returned a stale
// error code; (2) the partial-write-on-EAGAIN path jumped back to `out`,
// unlocking the inode twice.
static ssize_t eggsfs_file_write_iter(struct kiocb* iocb, struct iov_iter* from) {
    int err;
    if (iocb->ki_flags & IOCB_DIRECT) { return -ENOSYS; }
    struct file* file = iocb->ki_filp;
    struct inode* inode = file->f_inode;
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    loff_t* ppos = &iocb->ki_pos;
    loff_t ppos_before = *ppos;
    eggsfs_debug_print("enode=%p, ino=%lu, count=%lu, size=%lld, *ppos=%lld", enode, inode->i_ino, iov_iter_count(from), enode->inode.i_size, *ppos);
    if (!inode_trylock(inode)) {
        if (iocb->ki_flags & IOCB_NOWAIT) {
            return -EAGAIN;
        }
        inode_lock(inode);
    }
    // still a transient file
    if (!(enode->flags & EGGSFS_INODE_TRANSIENT)) { err = -EPERM; goto out_err; }
    // permanent error
    err = atomic_read(&enode->file.transient_err);
    if (err) { goto out_err; }
    // we're writing at the right offset (sequential appends only)
    if (*ppos != enode->inode.i_size) {
        err = -EINVAL;
        goto out_err;
    }
    // if it's a zero write, then we exit early
    if (unlikely(!iov_iter_count(from))) { goto out; }
    // Precompute the bound for spans
    u32 max_span_size;
    {
        u8 p;
        eggsfs_span_policies_last(&enode->span_policies, &max_span_size, &p);
    }
    BUG_ON(max_span_size%PAGE_SIZE != 0); // needed for "new span" logic paired with page copying below, we could avoid it
    // get the span we're currently writing at
    spin_lock(&enode->file.transient_spans_lock);
    BUG_ON(list_empty(&enode->file.transient_spans));
    struct eggsfs_transient_span* span = list_last_entry(&enode->file.transient_spans, struct eggsfs_transient_span, list);
    int status = span->status;
    spin_unlock(&enode->file.transient_spans_lock);
    if (status != EGGSFS_SPAN_STATUS_WRITING) { err = -EPERM; goto out_err; } // we've already declared this file done, we're just flushing it
    // We now start writing into the span, as much as we can anyway
    BUG_ON(!iov_iter_count(from)); // we rely on making progress for the switch to next spage below (page->index == 0)
    while (true) {
        // `page->index` is reused as "next free offset within the page";
        // 0 means the last page is full (or there is none) and a fresh page
        // is needed.
        struct page* page = list_empty(&span->pages) ? NULL : list_last_entry(&span->pages, struct page, lru);
        if (page == NULL || page->index == 0) { // we're the first ones to get here, or we need to switch to the next one
            BUG_ON(page != NULL && page->index > PAGE_SIZE);
            page = eggsfs_alloc_write_page(&enode->file);
            if (!page) {
                err = -ENOMEM;
                goto out_err; // we haven't corrupted anything, so no need to make it permanent
            }
            page->index = 0;
            list_add_tail(&page->lru, &span->pages);
        }
        int ret = copy_page_from_iter(page, page->index, PAGE_SIZE - page->index, from);
        if (ret < 0) { err = ret; goto out_err_permanent; }
        eggsfs_debug_print("written %d to page %p", ret, page);
        enode->inode.i_size += ret;
        page->index = (page->index + ret) % PAGE_SIZE;
        *ppos += ret;
        u32 current_span_size = enode->inode.i_size - enode->file.size_without_current_span;
        if (current_span_size >= max_span_size) { // we need to switch to the next span
            BUG_ON(current_span_size > max_span_size);
            // Bound the number of in-flight spans (bounds buffered memory).
            if (down_trylock(&enode->file.in_flight_spans) == 1) {
                if (iocb->ki_flags & IOCB_NOWAIT) {
                    err = -EAGAIN;
                    goto out_err;
                }
                // NB: must assign the function-level `err` -- a shadowing
                // `int err` here used to make `goto out_err` return a stale
                // error code.
                err = down_killable(&enode->file.in_flight_spans);
                if (err < 0) { goto out_err; }
            }
            spin_lock(&enode->file.transient_spans_lock);
            span->status = EGGSFS_SPAN_STATUS_IDLE;
            eggsfs_compute_spans_parameters(
                &enode->block_policies, &enode->span_policies, enode->target_stripe_size,
                span, current_span_size
            );
            struct eggsfs_transient_span* new_span = eggsfs_add_new_span(&enode->file.transient_spans);
            spin_unlock(&enode->file.transient_spans_lock);
            enode->file.size_without_current_span += current_span_size;
            if (IS_ERR(new_span)) {
                err = PTR_ERR(new_span);
                // This is permanent otherwise we'd break the invariant of always having a WRITING span at
                // the end.
                goto out_err_permanent;
            }
            trace_eggsfs_span_add(enode->inode.i_ino, *ppos);
            queue_work(eggsfs_wq, &enode->file.flusher);
            span = new_span;
        }
        if (!iov_iter_count(from)) { break; }
    }
    int written;
out:
    written = *ppos - ppos_before;
    inode_unlock(inode);
    eggsfs_debug_print("written=%d", written);
    return written;
out_err_permanent:
    atomic_set(&enode->file.transient_err, err);
    // we need to free the yet-to-be-written spans
    queue_work(eggsfs_wq, &enode->file.flusher);
out_err:
    inode_unlock(inode);
    eggsfs_info_print("err=%d", err);
    if (err == -EAGAIN && *ppos > ppos_before) {
        // We made progress before hitting EAGAIN: return the partial write.
        // (Do NOT jump to `out` here -- the inode is already unlocked, and
        // `out` would unlock it a second time.)
        written = *ppos - ppos_before;
        eggsfs_debug_print("written=%d", written);
        return written;
    }
    return err;
}
// Kicks off the final flush of a transient file: seals the (still WRITING)
// last span as LAST and queues the flusher work item.
//
// * 0: skip (not the right owner)
// * 1: proceed
// * -n: error
__must_check static int eggsfs_file_flush_init(struct eggsfs_inode* enode) {
    int res;
    inode_lock(&enode->inode);
    // Not a transient file, there's nothing to do, files are immutable
    if (!(enode->flags & EGGSFS_INODE_TRANSIENT)) { res = 0; goto out; }
    // We are in another process, skip
    if (enode->file.owner != current->group_leader) { res = 0; goto out; }
    // Mark the last span as the last one, and queue work
    spin_lock(&enode->file.transient_spans_lock);
    BUG_ON(list_empty(&enode->file.transient_spans));
    struct eggsfs_transient_span* span = list_last_entry(&enode->file.transient_spans, struct eggsfs_transient_span, list);
    // we might not be the first ones here -- every other operation is fine to do twice anyway.
    bool needs_queueing = false;
    if (span->status == EGGSFS_SPAN_STATUS_WRITING) {
        needs_queueing = true;
        u32 current_span_size = enode->inode.i_size - enode->file.size_without_current_span;
        span->status = EGGSFS_SPAN_STATUS_LAST;
        eggsfs_debug_print("finishing with %u", current_span_size);
        eggsfs_compute_spans_parameters(
            &enode->block_policies, &enode->span_policies, enode->target_stripe_size,
            span, current_span_size
        );
        enode->file.size_without_current_span += current_span_size;
    }
    spin_unlock(&enode->file.transient_spans_lock);
    inode_unlock(&enode->inode);
    if (needs_queueing) {
        queue_work(eggsfs_wq, &enode->file.flusher);
    }
    return 1;
out:
    inode_unlock(&enode->inode);
    return res;
}
// .flush handler: on the owner's close, seals the file, waits for all spans
// to be flushed, then links the (formerly transient) file into its parent
// directory, making it visible.
static int eggsfs_file_flush(struct file* filp, fl_owner_t id) { // can we get write while this is in progress?
    struct eggsfs_inode* enode = EGGSFS_I(filp->f_inode);
    struct dentry* dentry = filp->f_path.dentry;
    int res = eggsfs_file_flush_init(enode);
    if (res == 0) { return 0; }
    if (res < 0) { return res; }
    // Now we need to wait for things to be done
    eggsfs_debug_print("waiting for flush");
    res = down_killable(&enode->file.done_flushing);
    if (res < 0) { return res; }
    up(&enode->file.done_flushing); // allow others to wait on it again
    eggsfs_debug_print("flush done");
    res = atomic_read(&enode->file.transient_err);
    if (res < 0) { return res; }
    inode_lock(&enode->inode);
    // somebody might have gotten here before us
    if (enode->flags & EGGSFS_INODE_TRANSIENT) {
        res = eggsfs_error_to_linux(eggsfs_shard_link_file(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino,
            enode->file.cookie, dentry->d_parent->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len,
            &enode->edge_creation_time
        ));
        if (res == 0) {
            // Linked: the file is no longer transient.
            enode->flags &= ~EGGSFS_INODE_TRANSIENT;
        }
    }
    inode_unlock(&enode->inode);
    return res;
}
// Buffered read: walks the file's spans, copying inline-span bytes directly
// and block-span bytes page by page (via the span page cache), synthesizing
// trailing zeros that were never stored. O_DIRECT returns -ENOSYS.
static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
    struct file* file = iocb->ki_filp;
    struct inode* inode = file->f_inode;
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    loff_t *ppos = &iocb->ki_pos;
    ssize_t written = 0;
    if (unlikely(iocb->ki_flags & IOCB_DIRECT)) { return -ENOSYS; }
    eggsfs_debug_print("start of read loop, *ppos=%llu", *ppos);
    while (*ppos < inode->i_size && iov_iter_count(to)) {
        struct eggsfs_span* span = eggsfs_get_span(enode, *ppos);
        // NOTE(review): on error, any bytes already copied in previous
        // iterations are discarded (written is overwritten with the errno)
        // -- confirm whether a partial-read return would be preferable.
        if (IS_ERR(span)) { written = PTR_ERR(span); goto out; }
        if (span == NULL) { goto out; }
        if (span->start%PAGE_SIZE != 0) {
            eggsfs_warn_print("span start is not a multiple of page size %llu", span->start);
            written = -EIO;
            goto out;
        }
        eggsfs_debug_print("span start=%llu end=%llu", span->start, span->end);
        u64 span_offset = *ppos - span->start;
        u64 span_size = span->end - span->start;
        if (span->storage_class == EGGSFS_INLINE_STORAGE) {
            struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span);
            size_t to_copy = span_size - span_offset;
            eggsfs_debug_print("(inline) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
            // Copy the stored bytes, then zero-fill up to the span's logical
            // size. NOTE(review): assumes span_offset <= inline_span->len --
            // a larger offset would underflow the size_t length; confirm
            // eggsfs_get_span guarantees this.
            size_t copied =
                copy_to_iter(inline_span->body + span_offset, inline_span->len - span_offset, to) +
                iov_iter_zero(span_size - inline_span->len, to); // trailing zeros
            if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
            written += copied;
            *ppos += copied;
        } else {
            struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
            if (block_span->cell_size%PAGE_SIZE != 0) {
                eggsfs_warn_print("cell size not multiple of page size %u", block_span->cell_size);
                // NOTE(review): returns directly instead of `written = -EIO;
                // goto out;` like the sibling checks -- behaviorally the
                // same errno, but it skips the `out:` debug logging.
                return -EIO;
            }
            u32 page_ix = span_offset/PAGE_SIZE;
            // how much we should read (e.g. exclude trailing zeros)
            u64 span_data_end = min(
                span->end,
                span->start + (u64)block_span->cell_size*eggsfs_data_blocks(block_span->parity)*block_span->stripes
            );
            eggsfs_debug_print("span_data_end=%llu", span_data_end);
            while (*ppos < span_data_end && iov_iter_count(to)) {
                struct page* page = eggsfs_get_span_page(block_span, page_ix);
                if (IS_ERR(page)) { written = PTR_ERR(page); goto out; }
                size_t to_copy = min((u64)PAGE_SIZE - (*ppos % PAGE_SIZE), span_data_end - *ppos);
                eggsfs_debug_print("(block) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
                size_t copied = copy_page_to_iter(page, *ppos % PAGE_SIZE, to_copy, to);
                eggsfs_debug_print("copied=%lu, remaining %lu", copied, iov_iter_count(to));
                if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
                written += copied;
                *ppos += copied;
                page_ix++;
            }
            // trailing zeros
            if (iov_iter_count(to)) {
                size_t to_copy = span->end - span_data_end;
                size_t copied = iov_iter_zero(to_copy, to);
                eggsfs_debug_print("(block zeroes) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
                if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
                written += copied;
                *ppos += copied;
            }
        }
        eggsfs_debug_print("before loop end, remaining %lu", iov_iter_count(to));
    }
out:
    eggsfs_debug_print("out of the loop, written=%ld", written);
    if (written < 0) {
        eggsfs_debug_print("reading failed, err=%ld", written);
    }
    return written;
}
// File operations for regular eggsfs files (buffered path only: O_DIRECT is
// rejected with -ENOSYS by both iterators above).
const struct file_operations eggsfs_filesimple_operations = {
    .open = eggsfs_file_open,
    .read_iter = eggsfs_file_read_iter,
    .write_iter = eggsfs_file_write_iter,
    .flush = eggsfs_file_flush,
    .llseek = generic_file_llseek,
};
+18
View File
@@ -0,0 +1,18 @@
#ifndef _EGGSFS_FILESIMPLE_H
#define _EGGSFS_FILESIMPLE_H

#include <linux/list.h>
#include <linux/workqueue.h>

// Allocates a new zeroed transient span, appends it to `spans`, returns it
// (or ERR_PTR on allocation failure).
// Must be called with `spans_lock` taken.
// NOTE(review): eggsfs_create calls this without the lock (inode not yet
// published) -- confirm the exact locking rule.
struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans);

// Work-queue entry point that flushes the oldest flushable transient span.
void eggsfs_flush_transient_spans(struct work_struct* work);

// Per-block callback used while initiating a span write: opens the block
// service socket and kicks off the block write.
int eggsfs_shard_add_span_initiate_block_cb(
    void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate
);

// file_operations for regular eggsfs files.
extern const struct file_operations eggsfs_filesimple_operations;

#endif
+1
View File
@@ -0,0 +1 @@
../cpp/rs/gf_tables.c
+9
View File
@@ -0,0 +1,9 @@
#cloud-config
hostname: uovo
users:
- name: fmazzol
ssh-authorized-keys:
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE93TJCd7qWh/kHzaBoCSU9VgOKSWlUls9OkiBNCjG3B francesco.mazzoli@xtxmarkets.com
sudo: ['ALL=(ALL) NOPASSWD:ALL']
groups: sudo
shell: /bin/bash
+301
View File
@@ -0,0 +1,301 @@
#include "inode.h"
#include "common.h"
#include "dir.h"
#include "metadata.h"
#include "namei.h"
#include "trace.h"
#include "err.h"
#include "file.h"
static struct kmem_cache* eggsfs_inode_cachep;
// super_operations.alloc_inode: allocates an eggsfs inode from the dedicated
// slab cache and resets the attribute-cache fields (so the first getattr
// refreshes from the shard). Returns NULL on allocation failure.
struct inode* eggsfs_inode_alloc(struct super_block* sb) {
    eggsfs_debug_print("sb=%p", sb);
    struct eggsfs_inode* enode = (struct eggsfs_inode*)kmem_cache_alloc(eggsfs_inode_cachep, GFP_NOFS);
    if (enode == NULL) {
        eggsfs_debug_print("err=ENOMEM");
        return NULL;
    }
    // Attribute cache starts out stale.
    enode->mtime = 0;
    enode->mtime_expiry = 0;
    enode->edge_creation_time = 0;
    eggsfs_latch_init(&enode->getattr_update_latch);
    eggsfs_debug_print("done enode=%p", enode);
    return &enode->inode;
}
// super_operations.evict_inode: drops cached directory contents or, for
// regular files, aborts any in-flight span flushing and waits for it to
// settle before the inode goes away.
void eggsfs_inode_evict(struct inode* inode) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    eggsfs_debug_print("enode=%p", enode);
    if (S_ISDIR(inode->i_mode)) {
        eggsfs_dir_drop_cache(enode);
    } else if (S_ISREG(inode->i_mode)) {
        // error out in flight stuff
        atomic_cmpxchg(&enode->file.transient_err, 0, -EINTR);
        // wait for all writing operations to be done (we might be cleaning up)
        // down+up pair: block until the flusher signals `done_flushing`,
        // then restore the count for any other waiter.
        // NOTE(review): assumes `done_flushing` gets signalled for every
        // regular file reaching eviction, including never-written ones --
        // confirm against the inode setup paths.
        down(&enode->file.done_flushing);
        up(&enode->file.done_flushing);
    }
    truncate_inode_pages(&inode->i_data, 0);
    clear_inode(inode);
}
// super_operations.free_inode: returns the eggsfs inode to the slab cache.
void eggsfs_inode_free(struct inode* inode) {
    struct eggsfs_inode* to_free = EGGSFS_I(inode);
    eggsfs_debug_print("enode=%p", to_free);
    kmem_cache_free(eggsfs_inode_cachep, to_free);
}
// Slab constructor: one-time VFS initialization of the embedded inode.
static void eggsfs_inode_init_once(void* ptr) {
    inode_init_once(&((struct eggsfs_inode*)ptr)->inode);
}
// Module init: creates the inode slab cache. Returns -ENOMEM on failure.
int __init eggsfs_inode_init(void) {
    eggsfs_inode_cachep = kmem_cache_create(
        "eggsfs_inode_cache", sizeof(struct eggsfs_inode),
        0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, eggsfs_inode_init_once
    );
    return eggsfs_inode_cachep == NULL ? -ENOMEM : 0;
}
// Module exit: tears down the inode slab cache.
void __cold eggsfs_inode_exit(void) {
    // Wait for any pending RCU inode frees before destroying the cache.
    rcu_barrier();
    kmem_cache_destroy(eggsfs_inode_cachep);
}
// Refreshes the inode's cached attributes (mtime, size, and for directories
// the block/span policies) from the shard, unless the cache is still fresh.
// A latch ensures only one task does the refresh at a time; losers wait and,
// if the winner failed, retry. Returns 0 or a negative errno.
int eggsfs_do_getattr(struct eggsfs_inode* enode) {
    int err;
    s64 seqno;
again: // progress: whoever wins the lock won't try again
    eggsfs_debug_print("enode=%p id=0x%016lx mtime=%lld mtime_expiry=%lld", enode, enode->inode.i_ino, enode->mtime, enode->mtime_expiry);
    if (eggsfs_latch_try_acquire(&enode->getattr_update_latch, seqno)) {
        u64 ts = get_jiffies_64();
        // Someone refreshed while we raced for the latch: nothing to do.
        if (smp_load_acquire(&enode->mtime_expiry) > ts) { err = 0; goto out; }
        u64 mtime;
        u64 expiry;
        if (S_ISDIR(enode->inode.i_mode)) {
            struct eggsfs_block_policies block_policies;
            struct eggsfs_span_policies span_policies;
            u32 target_stripe_size;
            err = eggsfs_shard_getattr_dir(
                (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                enode->inode.i_ino,
                &mtime,
                &block_policies,
                &span_policies,
                &target_stripe_size
            );
            if (err == 0) {
                // Only overwrite policies the shard actually returned.
                if (block_policies.len) {
                    memcpy(&enode->block_policies, &block_policies, sizeof(block_policies));
                }
                if (span_policies.len) {
                    memcpy(&enode->span_policies, &span_policies, sizeof(span_policies));
                }
                if (target_stripe_size) {
                    enode->target_stripe_size = target_stripe_size;
                }
                expiry = ts + eggsfs_dir_refresh_time;
            }
        } else {
            u64 size;
            err = eggsfs_shard_getattr_file(
                (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                enode->inode.i_ino,
                &mtime,
                &size
            );
            if (err == 0) {
                enode->inode.i_size = size;
                // Files are immutable once linked: cache forever.
                expiry = ~(uint64_t)0;
            }
        }
        if (err) { err = eggsfs_error_to_linux(err); goto out; }
        else {
            WRITE_ONCE(enode->mtime, mtime);
            // eggsfs mtimes appear to be nanoseconds since 2020-01-01 UTC
            // (1577836800 is that date's Unix epoch offset) -- TODO confirm.
            enode->inode.i_mtime.tv_sec = mtime / 1000000000 + 1577836800;
            enode->inode.i_mtime.tv_nsec = mtime % 1000000000;
        }
        smp_store_release(&enode->mtime_expiry, expiry);
    out:
        WRITE_ONCE(enode->getattr_err, err);
        eggsfs_latch_release(&enode->getattr_update_latch, seqno);
        eggsfs_debug_print("out mtime=%llu mtime_expiry=%llu", enode->mtime, enode->mtime_expiry);
        return err;
    } else {
        // Someone else is refreshing: wait for them, and retry ourselves if
        // their refresh failed.
        err = eggsfs_latch_wait_killable(&enode->getattr_update_latch, seqno);
        if (err) { return err; }
        if (READ_ONCE(enode->getattr_err)) { goto again; }
        return 0;
    }
}
// VFS .create: create a new regular file in `parent` and set up the in-memory
// transient-write state (spans list, flusher work, in-flight limit).
// `mode`/`excl` are currently ignored. Returns 0 or a negative errno.
static int eggsfs_create(struct inode* parent, struct dentry* dentry, umode_t mode, bool excl) {
    int err;
    BUG_ON(dentry->d_inode);
    struct eggsfs_inode* parent_enode = EGGSFS_I(parent);
    if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
        err = -ENAMETOOLONG;
        goto out_err;
    }
    // %.*s (precision), not %*s (width): d_name.len bounds the name.
    eggsfs_debug_print("creating %.*s", dentry->d_name.len, dentry->d_name.name);
    u64 ino, cookie;
    err = eggsfs_error_to_linux(eggsfs_shard_create_file(
        (struct eggsfs_fs_info*)parent->i_sb->s_fs_info, eggsfs_inode_shard(parent->i_ino),
        EGGSFS_INODE_FILE, dentry->d_name.name, dentry->d_name.len, &ino, &cookie
    ));
    // `err` is already a linux errno here; converting it a second time (as the
    // code previously did) was wrong. Also use the common exit path.
    if (err) { goto out_err; }
    // make this dentry stick around?
    struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino); // new_inode
    if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_err; }
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    INIT_LIST_HEAD(&enode->file.transient_spans);
    struct eggsfs_transient_span* span = eggsfs_add_new_span(&enode->file.transient_spans);
    if (IS_ERR(span)) { err = PTR_ERR(span); goto out_err; }
    enode->flags = EGGSFS_INODE_WRITEABLE;
    enode->file.cookie = cookie;
    enode->file.size_without_current_span = 0;
    spin_lock_init(&enode->file.transient_spans_lock);
    // New files inherit the parent directory's placement policies.
    memcpy(&enode->block_policies, &parent_enode->block_policies, sizeof(parent_enode->block_policies));
    memcpy(&enode->span_policies, &parent_enode->span_policies, sizeof(parent_enode->span_policies));
    enode->target_stripe_size = parent_enode->target_stripe_size;
    atomic_set(&enode->file.transient_err, 0);
    enode->file.owner = current->group_leader;
    INIT_WORK(&enode->file.flusher, eggsfs_flush_transient_spans);
    sema_init(&enode->file.done_flushing, 0);
    atomic64_set(&enode->file.flushed_so_far, 0);
    sema_init(&enode->file.in_flight_spans, 2); // fairly arbitrary, ~300MiB of memory max
    d_instantiate(dentry, inode);
    d_invalidate(dentry);
    eggsfs_debug_print("size=%lld", inode->i_size);
    return 0;
out_err:
    return err;
}
// vfs: refcount of path->dentry
// VFS .getattr: fill `stat` from the inode, refreshing the cached attributes
// from the shard first when they are stale AND the caller actually asked for
// something we would have to refresh (mtime, or size for files).
static int eggsfs_getattr(const struct path* path, struct kstat* stat, u32 request_mask, unsigned int query_flags) {
    struct inode* inode = d_inode(path->dentry);
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    trace_eggsfs_vfs_getattr_enter(inode);
    // >= so that eggsfs_dir_refresh_time=0 causes revalidation at every call to this function
    if (get_jiffies_64() >= smp_load_acquire(&enode->mtime_expiry)) {
        int err;
        // FIXME: symlinks
        // Skip the shard roundtrip if the caller doesn't need mtime, nor size
        // (size only matters for regular files).
        if (!(request_mask & STATX_MTIME) && (S_ISDIR(inode->i_mode) || !(request_mask & STATX_SIZE))) {
            goto done;
        }
        trace_eggsfs_vfs_getattr_lock(inode);
        // dentry refcount also protects the inode (e.g. d_delete will not turn used dentry into a negative one),
        // so no need to grab anything before we start waiting for stuff
        err = eggsfs_do_getattr(enode);
        if (err) {
            trace_eggsfs_vfs_getattr_exit(inode, err);
            return err;
        }
    }
done:
    generic_fillattr(inode, stat);
    trace_eggsfs_vfs_getattr_exit(inode, 0);
    return 0;
}
// Inode operations for eggsfs directories.
static const struct inode_operations eggsfs_dir_inode_ops = {
    .create = eggsfs_create,
    .lookup = eggsfs_lookup,
    .unlink = eggsfs_unlink,
    .mkdir = eggsfs_mkdir,
    .rmdir = eggsfs_rmdir,
    .rename = eggsfs_rename,
    .getattr = eggsfs_getattr,
};
// Inode operations for regular files (read path only; writes go through the
// transient-file machinery).
static const struct inode_operations eggsfs_file_inode_ops = {
    .getattr = eggsfs_getattr,
};
// Defined in the directory code.
extern struct file_operations eggsfs_dir_operations;
// Look up (or create) the in-core inode for `ino`. The inode type is encoded
// in the id itself (see eggsfs_inode_type), so a fresh inode can be fully
// initialized without talking to the server. Returns ERR_PTR(-ENOMEM) on
// allocation failure.
struct inode* eggsfs_get_inode(struct super_block* sb, u64 ino) {
    trace_eggsfs_get_inode_enter(ino);
    struct inode* inode = iget_locked(sb, ino); // new_inode when possible?
    if (!inode) {
        trace_eggsfs_get_inode_exit(ino, NULL, false, 0);
        return ERR_PTR(-ENOMEM);
    }
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    bool new = inode->i_state & I_NEW;
    if (new) {
        // First time we see this inode: derive the file type from the id.
        switch (eggsfs_inode_type(ino)) {
        case EGGSFS_INODE_DIRECTORY: inode->i_mode = S_IFDIR; break;
        case EGGSFS_INODE_FILE: inode->i_mode = S_IFREG; break;
        case EGGSFS_INODE_SYMLINK: inode->i_mode = S_IFLNK; break;
        default: BUG();
        }
        inode->i_mode |= S_ISDIR(inode->i_mode) ? 0777 : 0666;
        inode->i_blocks = 0;
        inode->i_size = 0;
        inode->i_op = NULL;
        inode->i_fop = NULL;
        if (S_ISDIR(inode->i_mode)) {
            inode->i_op = &eggsfs_dir_inode_ops;
            inode->i_fop = &eggsfs_dir_operations;
            rcu_assign_pointer(enode->dir.pages, NULL);
            eggsfs_latch_init(&enode->dir.pages_latch);
        } else if (S_ISREG(inode->i_mode)) {
            inode->i_op = &eggsfs_file_inode_ops;
            inode->i_fop = &eggsfs_filesimple_operations;
            // init normal file stuff
            enode->file.spans = RB_ROOT;
            seqcount_init(&enode->file.spans_seqcount);
            mutex_init(&enode->file.spans_wlock);
        }
        // FIXME
        inode->i_uid = make_kuid(&init_user_ns, 1000);//65534);
        inode->i_gid = make_kgid(&init_user_ns, 1000);//65534);
        unlock_new_inode(inode);
    }
    trace_eggsfs_get_inode_exit(ino, inode, new, 0);
    return inode;
}
+203
View File
@@ -0,0 +1,203 @@
#ifndef _EGGSFS_INODE_H
#define _EGGSFS_INODE_H
#include <linux/fs.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>
#include "block.h"
#include "latch.h"
#include "bincode.h"
#define EGGSFS_ROOT_INODE 0x2000000000000000ull
struct eggsfs_file_span;
// When we create a file, it is writeable. When we start writing to it, it's transient.
// In any case, with any of these two flags, we have the `transient` case of the union.
#define EGGSFS_INODE_WRITEABLE 1
#define EGGSFS_INODE_TRANSIENT 2
// These two influence the size of eggsfs_inode, currently
#define EGGSFS_MAX_BLOCK_POLICIES 2 // one for flash, one for hdd
#define EGGSFS_MAX_SPAN_POLICIES 3 // we happen to have three for now
// Wire-encoded block placement policies.
struct eggsfs_block_policies {
    // Byte length of the used portion of `body` (see eggsfs_block_policies_len);
    // 0 means "no policies".
    u8 len;
    char body[EGGSFS_MAX_BLOCK_POLICIES*EGGSFS_BLOCK_POLICY_ENTRY_SIZE];
};
// Number of block policy entries stored (`len` is in bytes).
static inline int eggsfs_block_policies_len(struct eggsfs_block_policies* policies) {
    BUG_ON(policies->len == 0);
    return (int)policies->len / EGGSFS_BLOCK_POLICY_ENTRY_SIZE;
}
// Decode block policy entry `ix`: a u8 storage class followed by a
// little-endian u32 minimum size.
static inline void eggsfs_block_policies_get(struct eggsfs_block_policies* policies, int ix, u8* storage_class, u32* min_size) {
    BUG_ON(ix < 0 || ix >= EGGSFS_MAX_BLOCK_POLICIES);
    char* b = policies->body + ix*EGGSFS_BLOCK_POLICY_ENTRY_SIZE;
    *storage_class = *(u8*)b; b += 1;
    *min_size = get_unaligned_le32(b); b += 4;
}
// Wire-encoded span policies (mirror of eggsfs_block_policies).
struct eggsfs_span_policies {
    // Byte length of the used portion of `body`; 0 means "no policies".
    u8 len;
    char body[EGGSFS_MAX_SPAN_POLICIES*EGGSFS_SPAN_POLICY_ENTRY_SIZE];
};
// Number of span policy entries stored (`len` is in bytes).
static inline int eggsfs_span_policies_len(struct eggsfs_span_policies* policies) {
    BUG_ON(policies->len == 0);
    return (int)policies->len / EGGSFS_SPAN_POLICY_ENTRY_SIZE;
}
// Decode span policy entry `ix`: a little-endian u32 maximum size followed
// by a u8 parity descriptor.
static inline void eggsfs_span_policies_get(struct eggsfs_span_policies* policies, int ix, u32* max_size, u8* parity) {
    BUG_ON(ix < 0 || ix >= EGGSFS_MAX_SPAN_POLICIES);
    // Stride by the *span* entry size. The code previously used
    // EGGSFS_BLOCK_POLICY_ENTRY_SIZE, which only worked by accident if the
    // two entry sizes coincide; eggsfs_span_policies_len and the body array
    // both use the span entry size.
    char* b = policies->body + ix*EGGSFS_SPAN_POLICY_ENTRY_SIZE;
    *max_size = get_unaligned_le32(b); b += 4;
    *parity = *(u8*)b; b += 1;
}
// Fetch the last (largest max_size, presumably -- TODO confirm ordering) span policy entry.
static inline void eggsfs_span_policies_last(struct eggsfs_span_policies* policies, u32* max_size, u8* parity) {
    eggsfs_span_policies_get(policies, eggsfs_span_policies_len(policies)-1, max_size, parity);
}
// Lifecycle of a transient span; transitions go WRITING -> IDLE/LAST -> FLUSHING(_LAST).
#define EGGSFS_SPAN_STATUS_WRITING 0 // we're writing to this span
#define EGGSFS_SPAN_STATUS_IDLE 1 // we're waiting to flush this out
#define EGGSFS_SPAN_STATUS_LAST 2 // like idle, but signals that there won't be other spans after this one
#define EGGSFS_SPAN_STATUS_FLUSHING 3 // we're flushing this to the block services
#define EGGSFS_SPAN_STATUS_FLUSHING_LAST 4 // we're flushing this to the block services, and it's the last span
// Reed-Solomon geometry limits.
#define EGGSFS_MAX_DATA 10
#define EGGSFS_MAX_PARITY 4
#define EGGSFS_MAX_BLOCKS (EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY)
// A span that is being written (or flushed) for a transient file. Lives on
// eggsfs_inode_file.transient_spans, protected by transient_spans_lock.
struct eggsfs_transient_span {
    struct list_head list;
    // Linear list of pages with the body of the span, used when we're still gathering
    // content for it. We use `index` in the page to track where we're writing to.
    // We also add the parity block pages here, just to have all the pages available in
    // one place.
    struct list_head pages;
    // Used by the workers which write out to the block services. They first arrange
    // the pages here and compute the parity blocks. Then they can just write them out.
    //
    // We use `->private` to link these together, so that `pages` list can still contain
    // all of them, which is handy in a couple of places.
    //
    // In this stage we use `index` to track where we're reading from the head page,
    // and keep popping the head page as we write them out.
    struct page* blocks[EGGSFS_MAX_BLOCKS];
    struct eggsfs_block_socket* block_sockets[EGGSFS_MAX_BLOCKS];
    struct eggsfs_write_block_request* block_reqs[EGGSFS_MAX_BLOCKS];
    // These are decided by whoever moves the span from WRITING to IDLE/LAST.
    u32 size; // the logical size of the span, might be smaller than actual if we added leading zeros because of RS
    u32 block_size;
    u8 stripes;
    u8 storage_class;
    u8 parity;
    // See EGGSFS_SPAN_STATUS above.
    u8 status;
};
// Per-inode state for regular files: the read-side span cache plus the
// transient (write-side) machinery.
struct eggsfs_inode_file {
    // Normal file stuff
    struct rb_root spans;
    seqcount_t spans_seqcount; // to read in a lockless way
    struct mutex spans_wlock; // to ensure there's only one thing fetching the spans when we can't find the span
    // Transient stuff. Only initialized on file creation, otherwise it's garbage.
    // Could be factored out to separate data structure.
    //
    // List of spans, FIFO. There's always a "WRITING/LAST" span at the end,
    // and there's never a WRITING span which could be IDLE.
    struct list_head transient_spans;
    // Used to synchronize between things adding and removing to the list of spans.
    spinlock_t transient_spans_lock;
    // Worker which flushes the spans
    struct work_struct flusher;
    // To know when we're done flushing
    struct semaphore done_flushing;
    // If we've encountered an error such that we want to stop.
    atomic_t transient_err;
    // How many bytes we've flushed so far
    atomic64_t flushed_so_far;
    // Used to bound the number of in-flight spans (otherwise we
    // would eat quite a bit of memory for big files)
    struct semaphore in_flight_spans;
    // Shard-issued cookie proving we may write to this transient file.
    u64 cookie;
    // We use this to track where we should close the file from.
    struct task_struct* owner;
    // Size _apart from current span_.
    u64 size_without_current_span;
};
// Per-inode state for directories: an RCU-protected singly-linked page cache
// of directory entries, guarded by a latch for refills.
struct eggsfs_inode_dir {
    // In `struct page`, we use:
    //
    //     ->mapping to store the next page
    //     ->private to store the number of entries stored in the page (high 32 bits),
    //       and a reference count with how many people need the cache (low 32 bits).
    //       The reference count is only stored in the "head" page.
    //     ->rcu_head to synchronize between modifications to the reference count.
    struct page __rcu * pages;
    struct eggsfs_latch pages_latch;
};
// eggsfs in-core inode; embeds the VFS inode (recover with EGGSFS_I()).
struct eggsfs_inode {
    struct inode inode;
    // We cache things based on the eggsfs mtime, but we need to decide when the
    // mtime itself is stale, which we do using `mtime_expiry`.
    u64 mtime; // in eggsfs time
    u64 mtime_expiry; // in jiffies
    u64 edge_creation_time; // in eggsfs time, used for operations (re)moving the edge
    // These are relevant for directoriese (obviously), but we also use them for transient
    // files when we create them, since we need it in many places.
    // TODO consider refreshing it for directories, right now we never do.
    struct eggsfs_block_policies block_policies;
    struct eggsfs_span_policies span_policies;
    u32 target_stripe_size;
    // Which arm is live depends on the inode type: `file` for regular files,
    // `dir` for directories.
    union {
        struct eggsfs_inode_file file;
        struct eggsfs_inode_dir dir;
    };
    // Serializes attribute refreshes (see eggsfs_do_getattr).
    struct eggsfs_latch getattr_update_latch;
    int getattr_err;
    int flags;
};
#define EGGSFS_I(ptr) container_of(ptr, struct eggsfs_inode, inode)
#define EGGSFS_INODE_NEED_GETATTR 1
#define EGGSFS_INODE_DIRECTORY 1
#define EGGSFS_INODE_FILE 2
#define EGGSFS_INODE_SYMLINK 3
// Inode type (EGGSFS_INODE_DIRECTORY/FILE/SYMLINK) is encoded in bits 61-62
// of the inode id.
static inline u64 eggsfs_inode_type(u64 ino) {
    return (ino >> 61) & 0x03;
}
// The owning shard id lives in the low 8 bits of the inode id.
static inline u32 eggsfs_inode_shard(u64 ino) {
    return ino & 0xff;
}
struct inode* eggsfs_get_inode(struct super_block* sb, u64 ino);
// super ops
struct inode* eggsfs_inode_alloc(struct super_block* sb);
void eggsfs_inode_evict(struct inode* inode);
void eggsfs_inode_free(struct inode* inode);
int __init eggsfs_inode_init(void);
void __cold eggsfs_inode_exit(void);
// inode ops
int eggsfs_do_getattr(struct eggsfs_inode* enode);
#endif
+44
View File
@@ -0,0 +1,44 @@
#ifndef _EGGS_INTRSHIMS_H
#define _EGGS_INTRSHIMS_H
// Shims for the x86 SSE/AVX intrinsics used by the shared crc32c/RS code,
// mapping each intrinsic to the corresponding GCC builtin or vector
// expression so the code builds in the kernel (where <immintrin.h> is not
// available).
#include <linux/kernel.h>
// CRC32 instruction wrappers.
#define _mm_crc32_u64(C, D) __builtin_ia32_crc32di(C, D)
#define _mm_crc32_u32(C, D) __builtin_ia32_crc32si(C, D)
#define _mm_crc32_u8(C, D) __builtin_ia32_crc32qi(C, D)
// 128-bit vector types (SSE).
typedef long long __m128i __attribute__((vector_size(16)));
typedef unsigned long long __m128i_u __attribute__((vector_size(16)));
typedef char __v16qi __attribute__((vector_size(16)));
typedef short __v8hi __attribute__((vector_size(16)));
typedef int __v4si __attribute__((vector_size(16)));
typedef long long __v2di __attribute__((vector_size(16)));
#define _mm_loadu_si128(p) (*(p))
#define _mm_setr_epi32(i0, i1, i2, i3) ((__m128i)(__v4si){i0,i1,i2,i3})
#define _mm_clmulepi64_si128(X, Y, I) ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (char)(I)))
#define _mm_xor_si128(a, b) ((a) ^ (b))
#define _mm_cvtsi32_si128(a) ((__m128i)(__v4si){a,0,0,0})
#define _mm_cvtsi128_si64(a) ((a)[0])
#define _mm_extract_epi64(X, N) ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
// 256-bit vector types (AVX2).
typedef long long __m256i __attribute__((vector_size(32)));
typedef unsigned long long __m256i_u __attribute__((vector_size(32)));
typedef char __v32qi __attribute__((vector_size(32)));
typedef short __v16hi __attribute__((vector_size(32)));
typedef int __v8si __attribute__((vector_size(32)));
typedef long long __v4di __attribute__((vector_size(32)));
#define _mm256_and_si256(a, b) ((a) & (b))
#define _mm256_xor_si256(a, b) ((a) ^ (b))
#define _mm256_permute2x128_si256(a, b, i) ((__m256i)__builtin_ia32_permti256((a), (b), (i)))
#define _mm256_srli_epi16(a, b) ((__m256i)__builtin_ia32_psrlwi256((__v16hi)(a), (b)))
#define _mm256_shuffle_epi8(a, b) ((__m256i)__builtin_ia32_pshufb256((__v32qi)(a), (__v32qi)(b)))
#define _mm256_setzero_si256() ((__m256i)(__v4di){0,0,0,0})
#define _mm256_loadu_si256(p) (*(p))
#define _mm256_storeu_si256(p, a) (*(p) = a)
// GFNI byte-wise GF(2^8) multiply, used by the Reed-Solomon code.
#define _mm256_gf2p8mul_epi8(a, b) ((__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)(a), (__v32qi)(b)))
#endif
+1
View File
@@ -0,0 +1 @@
../cpp/crc32c/iscsi.h
+76
View File
@@ -0,0 +1,76 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/version.h>
#include "common.h"
#include "inode.h"
#include "metadata.h"
#include "block.h"
#include "super.h"
#include "sysctl.h"
#include "sysfs.h"
#include "block.h"
#include "rs.h"
MODULE_LICENSE("GPL");
struct workqueue_struct* eggsfs_wq;
// Module init: bring up each subsystem in order; on failure, tear down in
// strict reverse order (each label undoes everything initialized before the
// failing step).
static int __init eggsfs_init(void) {
    int err;
    eggsfs_debug_print("initializing module");
    eggsfs_shard_init();
    eggsfs_wq = alloc_workqueue("eggsfs-wq", 0, 0);
    if (!eggsfs_wq) { return -ENOMEM; }
    err = eggsfs_sysfs_init();
    if (err) { goto out_sysfs; }
    err = eggsfs_sysctl_init();
    if (err) { goto out_sysctl; }
    err = eggsfs_blocksimple_init();
    if (err) { goto out_blocksimple; }
    err = eggsfs_inode_init();
    if (err) { goto out_inode; }
    err = eggsfs_fs_init();
    if (err) { goto out_fs; }
    err = eggsfs_rs_init();
    if (err) { goto out_rs; }
    return 0;
out_rs:
    eggsfs_fs_exit();
out_fs:
    eggsfs_inode_exit();
out_inode:
    eggsfs_blocksimple_exit();
out_blocksimple:
    eggsfs_sysctl_exit();
out_sysctl:
    eggsfs_sysfs_exit();
out_sysfs:
    destroy_workqueue(eggsfs_wq);
    return err;
}
// Module exit: tear down subsystems in reverse init order (mirrors the
// error-unwind path of eggsfs_init).
static void __exit eggsfs_exit(void) {
    eggsfs_fs_exit();
    eggsfs_inode_exit();
    eggsfs_blocksimple_exit();
    eggsfs_sysctl_exit();
    eggsfs_sysfs_exit();
    destroy_workqueue(eggsfs_wq);
}
module_init(eggsfs_init);
module_exit(eggsfs_exit);
+35
View File
@@ -0,0 +1,35 @@
#ifndef _EGGSFS_LATCH_H
#define _EGGSFS_LATCH_H
#include <linux/kernel.h>
#include <linux/wait.h>
// A "latch": a one-shot mutual-exclusion + generation counter. The low bit of
// `counter` means "held"; releasing bumps the generation so waiters can tell
// the work completed. See eggsfs_do_getattr for the usage pattern.
struct eggsfs_latch {
    atomic64_t counter;
    wait_queue_head_t wq;
};
// Initialize a latch: generation 0, not held.
#define eggsfs_latch_init(_latch) ({ \
        atomic64_set(&(_latch)->counter, 0); \
        init_waitqueue_head(&(_latch)->wq); \
    })
// Try to acquire the latch. Evaluates to true on success; in either case
// `_seqno` is set to the counter value observed (odd = somebody else holds
// it, and `_seqno` is what to pass to eggsfs_latch_wait_killable).
// NB: `_seqno` must be parenthesized everywhere it is expanded -- the
// original `!(_seqno & 1)` would mis-bind for a non-trivial argument
// expression.
#define eggsfs_latch_try_acquire(_latch, _seqno) ({ \
        (_seqno) = atomic64_fetch_or(1, &(_latch)->counter); \
        smp_mb__after_atomic(); \
        !((_seqno) & 1); \
    })
// Release a latch acquired with `_seqno`: advance to the next even generation
// (seqno+2 clears the held bit) and wake all waiters.
#define eggsfs_latch_release(_latch, _seqno) ({ \
        smp_mb__before_atomic(); \
        atomic64_set(&(_latch)->counter, (_seqno) + 2); \
        wake_up_all(&(_latch)->wq); \
    })
// Wait (killably) until the generation moves past the held value observed at
// acquire time. Returns 0 on wake, negative errno if interrupted.
#define eggsfs_latch_wait_killable(_latch, _seqno) ({ \
        u64 goal = ((_seqno) | 1); \
        wait_event_killable((_latch)->wq, atomic64_read(&(_latch)->counter) > goal); \
    })
#endif
+1040
View File
File diff suppressed because it is too large Load Diff
+76
View File
@@ -0,0 +1,76 @@
#ifndef _EGGSFS_SHARD_H
#define _EGGSFS_SHARD_H
#include <linux/kernel.h>
#include "net.h"
#include "super.h"
#include "inode.h"
int eggsfs_shard_lookup(struct eggsfs_fs_info* info, u64 dir, const char* name, int name_len, u64* ino, u64* creation_time);
int eggsfs_shard_readdir(struct eggsfs_fs_info* info, u64 dir, u64 start_pos, void* data, u64* next_hash);
int eggsfs_shard_unlink_file(struct eggsfs_fs_info* info, u64 dir, u64 file, const char* name, int name_len, u64 creation_time);
int eggsfs_shard_rename(struct eggsfs_fs_info* info, u64 dir, u64 target_id, const char* old_name, int old_name_len, u64 old_creation_time, const char* new_name, int new_name_len, u64* new_creation_time);
int eggsfs_shard_link_file(struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 dir, const char* name, int name_len, u64* creation_time);
int eggsfs_shard_getattr_file(struct eggsfs_fs_info* info, u64 file, u64* mtime, u64* size);
int eggsfs_shard_getattr_dir(
struct eggsfs_fs_info* info,
u64 file,
u64* mtime,
struct eggsfs_block_policies* block_policies,
struct eggsfs_span_policies* span_policies,
u32* target_stripe_size
);
int eggsfs_shard_getattr(struct eggsfs_fs_info* info, u64 id);
int eggsfs_shard_create_file(struct eggsfs_fs_info* info, u8 shid, int itype, const char* name, int name_len, u64* ino, u64* cookie);
int eggsfs_shard_file_spans(struct eggsfs_fs_info* info, u64 file, u64 offset, void* data);
int eggsfs_shard_add_inline_span(struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 offset, u32 size, const char* data, u8 len);
int eggsfs_shard_add_span_initiate(
struct eggsfs_fs_info* info, void* data, u64 file, u64 cookie, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity,
u8 stripes, u32 cell_size, u32* cell_crcs
);
int eggsfs_shard_add_span_certify(
struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 offset, u8 parity, const u64* block_ids, const u64* block_proofs
);
// Per-block descriptor passed to the "add span initiate" request.
struct eggsfs_shard_add_span_initiate_new_block {
    u64 size;
    u32 crc32;
};
// Async "add span initiate" request with an inline UDP-MTU-sized payload buffer.
struct eggsfs_shard_add_span_initiate_async_request {
    struct eggsfs_shard_async_request request;
    char data[EGGSFS_UDP_MTU];
};
void eggsfs_shard_add_span_initiate_async(struct eggsfs_fs_info* info, struct eggsfs_shard_add_span_initiate_async_request* request, u64 file, u64 cookie, u64 byte_offset, u8 storage_class, u8 parity, u32 crc32, u64 size, struct eggsfs_shard_add_span_initiate_new_block* blocks, u16 n_blocks);
// Proof returned by a block service that a block was written; fed back to the
// shard in "add span certify".
struct eggsfs_shard_add_span_certify_block_proof {
    u64 block_id;
    u64 proof;
};
// Async "add span certify" request with an inline UDP-MTU-sized payload buffer.
struct eggsfs_shard_add_span_certify_async_request {
    struct eggsfs_shard_async_request request;
    char data[EGGSFS_UDP_MTU];
};
void eggsfs_shard_add_span_certify_async(struct eggsfs_fs_info* info, struct eggsfs_shard_add_span_certify_async_request* request, u64 file, u64 cookie, u64 offset, struct eggsfs_shard_add_span_certify_block_proof* proofs, u16 n_blocks);
int eggsfs_cdc_mkdir(struct eggsfs_fs_info* info, u64 dir, const char* name, int name_len, u64* ino, u64* creation_time);
int eggsfs_cdc_rmdir(struct eggsfs_fs_info* info, u64 owner_dir, u64 target, u64 creation_time, const char* name, int name_len);
int eggsfs_cdc_rename_directory(
struct eggsfs_fs_info* info,
u64 target, u64 old_parent, u64 new_parent,
const char* old_name, int old_name_len, u64 old_creation_time,
const char* new_name, int new_name_len,
u64* new_creation_time
);
int eggsfs_cdc_rename_file(
struct eggsfs_fs_info* info,
u64 target, u64 old_parent, u64 new_parent,
const char* old_name, int old_name_len, u64 old_creation_time,
const char* new_name, int new_name_len,
u64* new_creation_time
);
void __init eggsfs_shard_init(void);
#endif
+320
View File
@@ -0,0 +1,320 @@
#include "namei.h"
#include "common.h"
#include "dir.h"
#include "err.h"
#include "inode.h"
#include "metadata.h"
#include "trace.h"
EGGSFS_DEFINE_COUNTER(eggsfs_stat_dir_revalidations);
// VFS .d_revalidate: decide whether a cached dentry is still valid against
// the directory's cached state. Returns 1 (valid), 0 (invalid), or a negative
// errno; -ECHILD from the RCU path tells the VFS to retry in ref-walk mode.
static int eggsfs_d_revalidate(struct dentry* dentry, unsigned int flags) {
    struct dentry* parent;
    struct inode* dir;
    int ret;
    eggsfs_debug_print("dentry=%pd flags=%x[rcu=%d]", dentry, flags, !!(flags & LOOKUP_RCU));
    if (IS_ROOT(dentry)) { return 1; }
    if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
        return -ENAMETOOLONG;
    }
    if (flags & LOOKUP_RCU) {
        // RCU walk: no sleeping, no refcounts; re-check d_parent afterwards to
        // make sure we didn't race with a rename.
        parent = READ_ONCE(dentry->d_parent);
        dir = d_inode_rcu(parent);
        if (!dir) { return -ECHILD; }
        ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
        if (parent != READ_ONCE(dentry->d_parent)) { return -ECHILD; }
        return ret;
    } else {
        parent = dget_parent(dentry);
        dir = d_inode(parent);
        ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
        if (ret == -ECHILD) {
            // -ECHILD here means "don't know": refresh the directory and ask again.
            eggsfs_counter_inc(eggsfs_stat_dir_revalidations);
            ret = eggsfs_dir_revalidate(EGGSFS_I(dir));
            if (ret) { goto out; }
            ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
            // Still undecided after a refresh: treat as valid.
            if (ret == -ECHILD) { ret = 1; }
        }
out:
        dput(parent);
    }
    return ret;
}
struct dentry_operations eggsfs_dentry_ops = {
.d_revalidate = eggsfs_d_revalidate,
};
// Reconcile the dcache with an eggsfs error returned by a metadata operation:
// the server disagreeing with our cached view means some cached dentry/inode
// is stale, so invalidate the relevant entries and force an mtime refresh.
// `err` is a positive eggsfs error code, not an errno.
static void eggsfs_dcache_handle_error(struct inode* dir, struct dentry* dentry, int err) {
    eggsfs_debug_print("err=%d", err);
    switch (err) {
    // Server says the name exists but we had a negative dentry.
    case EGGSFS_ERR_CANNOT_OVERRIDE_NAME:
        WARN_ON(dentry->d_inode);
        trace_eggsfs_dcache_invalidate_neg_entry(dir, dentry);
        WRITE_ONCE(EGGSFS_I(dir)->mtime_expiry, 0);
        d_invalidate(dentry);
        break;
    // TODO is this correct? verify, I've changed the code from an earlier error
    // which does not exist anymore.
    // The parent directory itself is gone on the server.
    case EGGSFS_ERR_DIRECTORY_NOT_FOUND:
        WARN_ON(dentry->d_inode);
        trace_eggsfs_dcache_delete_inode(dir);
        WRITE_ONCE(EGGSFS_I(dir)->mtime_expiry, 0);
        clear_nlink(dir);
        d_invalidate(dentry->d_parent);
        break;
    // We thought the directory was empty; server disagrees.
    case EGGSFS_ERR_DIRECTORY_NOT_EMPTY:
        WARN_ON(!dentry->d_inode);
        if (dentry->d_inode) {
            trace_eggsfs_dcache_invalidate_dir(dentry->d_inode);
            WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
        }
        break;
    // The name we had cached no longer exists on the server.
    case EGGSFS_ERR_NAME_NOT_FOUND:
        WARN_ON(!dentry->d_inode);
        if (dentry->d_inode) {
            trace_eggsfs_dcache_delete_entry(dir, dentry, dentry->d_inode);
            WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
            clear_nlink(dentry->d_inode);
            d_invalidate(dentry);
        }
        break;
    // The name exists but points at a different inode than we cached.
    case EGGSFS_ERR_MISMATCHING_TARGET:
        WARN_ON(!dentry->d_inode);
        if (dentry->d_inode) {
            trace_eggsfs_dcache_invalidate_entry(dir, dentry, dentry->d_inode);
            WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
            d_invalidate(dentry);
        }
        break;
    }
}
// vfs: shared dir.i_rwsem
// VFS .lookup: resolve `dentry` in `dir` via the shard. Returns the result of
// d_splice_alias (NULL or a dentry) on success, ERR_PTR on failure; a missing
// name produces a negative dentry, not an error.
struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags) {
    eggsfs_debug_print("dir=0x%016lx, name=%*pE", dir->i_ino, dentry->d_name.len, dentry->d_name.name);
    int err;
    trace_eggsfs_vfs_lookup_enter(dir, dentry);
    if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
        err = -ENAMETOOLONG;
        goto out_err;
    }
    u64 ino, creation_time;
    // need to keep the eggs error around for the EGGSFS_ERR_NAME_NOT_FOUND
    err = eggsfs_shard_lookup(
        dir->i_sb->s_fs_info, dir->i_ino, dentry->d_name.name, dentry->d_name.len, &ino, &creation_time
    );
    // Negative = transport/linux error; positive = eggsfs protocol error.
    if (unlikely(err < 0)) {
        goto out_err;
    }
    if (err) { // not unlikely since vfs will do lookups for entries likely to not exist
        if (err == EGGSFS_ERR_NAME_NOT_FOUND) {
            // Record the dir mtime so revalidation can tell when this negative
            // dentry goes stale.
            dentry->d_time = EGGSFS_I(dir)->mtime;
            trace_eggsfs_vfs_lookup_exit(dir, dentry, NULL, 0);
            return d_splice_alias(NULL, dentry);
        }
        err = eggsfs_error_to_linux(err);
        goto out_err;
    }
    struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino);
    if (IS_ERR(inode)) {
        err = PTR_ERR(inode);
        goto out_err;
    }
    inode->i_ino = ino;
    EGGSFS_I(inode)->edge_creation_time = creation_time;
    dentry->d_time = EGGSFS_I(dir)->mtime;
    trace_eggsfs_vfs_lookup_exit(dir, dentry, inode, 0);
    return d_splice_alias(inode, dentry);
out_err:
    trace_eggsfs_vfs_lookup_exit(dir, dentry, NULL, err);
    return ERR_PTR(err);
}
// vfs: exclusive dir.i_rwsem, lockref dentry
// VFS .mkdir: create a directory via the CDC; the new directory inherits the
// parent's block/span policies and target stripe size.
int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode) {
    int err;
    struct eggsfs_inode* dir_enode = EGGSFS_I(dir);
    trace_eggsfs_vfs_mkdir_enter(dir, dentry);
    BUG_ON(dentry->d_inode);
    if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
        err = -ENAMETOOLONG;
        goto out_err;
    }
    u64 ino, creation_time;
    err = eggsfs_cdc_mkdir(
        dir->i_sb->s_fs_info,
        dir->i_ino,
        dentry->d_name.name,
        dentry->d_name.len,
        &ino,
        &creation_time
    );
    if (unlikely(err)) {
        // Negative = transport/linux error; positive = eggsfs error that may
        // mean our dcache is stale.
        if (err < 0) { goto out_err; }
        eggsfs_dcache_handle_error(dir, dentry, err);
        err = eggsfs_error_to_linux(err);
        goto out_err;
    }
    // Parent listing changed: drop its cached pages.
    eggsfs_dir_drop_cache(EGGSFS_I(dir));
    struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino);
    if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_err; }
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    enode->edge_creation_time = creation_time;
    memcpy(&enode->block_policies, &dir_enode->block_policies, sizeof(enode->block_policies));
    memcpy(&enode->span_policies, &dir_enode->span_policies, sizeof(enode->span_policies));
    enode->target_stripe_size = dir_enode->target_stripe_size;
    dentry->d_time = dir_enode->mtime;
    d_instantiate(dentry, inode);
    trace_eggsfs_vfs_mkdir_exit(dir, dentry, inode, 0);
    return 0;
out_err:
    trace_eggsfs_vfs_mkdir_exit(dir, dentry, NULL, err);
    return err;
}
// vfs: exclusive dir,d_entry->d_inode.i_rwsem
// VFS .rmdir: remove a directory via the CDC, passing the cached edge
// creation time so the server can detect a stale view.
int eggsfs_rmdir(struct inode* dir, struct dentry* dentry) {
    int err;
    trace_eggsfs_vfs_rmdir_enter(dir, dentry, dentry->d_inode);
    BUG_ON(!dentry->d_inode);
    if (unlikely(dentry->d_name.len > EGGSFS_MAX_FILENAME)) {
        pr_err("eggsfs: eggsfs_rmdir dir=%lu dentry=%pd exceed max filename length %d\n",
               dir->i_ino, dentry, EGGSFS_MAX_FILENAME);
        err = -ENAMETOOLONG;
        goto out_err;
    }
    err = eggsfs_cdc_rmdir((struct eggsfs_fs_info*)dir->i_sb->s_fs_info, dir->i_ino, dentry->d_inode->i_ino, EGGSFS_I(dentry->d_inode)->edge_creation_time, dentry->d_name.name, dentry->d_name.len);
    if (unlikely(err)) {
        // Negative = transport/linux error; positive = eggsfs error that may
        // mean our dcache is stale.
        if (err < 0) { goto out_err; }
        eggsfs_dcache_handle_error(dir, dentry, err);
        err = eggsfs_error_to_linux(err);
        goto out_err;
    } else {
        eggsfs_dir_drop_cache(EGGSFS_I(dir));
        clear_nlink(dentry->d_inode);
    }
out_err:
    trace_eggsfs_vfs_rmdir_exit(dir, dentry, err);
    return err;
}
// vfs: exclusive dir,d_entry->d_inode.i_rwsem
// VFS .unlink: remove a file edge via the shard. eggsfs has no hard links,
// so unlinking the only edge effectively removes the file.
int eggsfs_unlink(struct inode* dir, struct dentry* dentry) {
    int err;
    trace_eggsfs_vfs_unlink_enter(dir, dentry, dentry->d_inode);
    BUG_ON(!dentry->d_inode);
    if (unlikely(dentry->d_name.len > EGGSFS_MAX_FILENAME)) {
        pr_err("eggsfs: eggsfs_unlink dir=%lu dentry=%pd exceed max filename length %d\n",
               dir->i_ino, dentry, EGGSFS_MAX_FILENAME);
        err = -ENAMETOOLONG;
        goto out_err;
    }
    struct eggsfs_inode* enode = EGGSFS_I(dentry->d_inode);
    err = eggsfs_shard_unlink_file((struct eggsfs_fs_info*)dir->i_sb->s_fs_info, dir->i_ino, dentry->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len, enode->edge_creation_time);
    if (unlikely(err)) {
        // Negative = transport/linux error; positive = eggsfs error that may
        // mean our dcache is stale.
        if (err < 0) { goto out_err; }
        eggsfs_dcache_handle_error(dir, dentry, err);
        err = eggsfs_error_to_linux(err);
        goto out_err;
    } else {
        eggsfs_dir_drop_cache(EGGSFS_I(dir));
        // no hard links, inode is gone
        clear_nlink(dentry->d_inode);
    }
out_err:
    trace_eggsfs_vfs_unlink_exit(dir, dentry, err);
    return err;
}
// vfs: exclusive i_rwsem for all
// VFS .rename: dispatch to the same-directory shard rename, or the
// cross-directory CDC rename (separate calls for directories and files).
// RENAME_EXCHANGE/NOREPLACE flags are not supported.
int eggsfs_rename(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry, unsigned int flags) {
    int err;
    trace_eggsfs_vfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
    BUG_ON(!old_dentry->d_inode);
    if (flags) { err = -EINVAL; goto out; }
    u64 old_creation_time = EGGSFS_I(old_dentry->d_inode)->edge_creation_time;
    u64 new_creation_time;
    struct eggsfs_fs_info* info = old_dir->i_sb->s_fs_info;
    if (old_dir == new_dir) {
        // Same directory: single-shard rename.
        err = eggsfs_shard_rename(
            info,
            old_dir->i_ino,
            old_dentry->d_inode->i_ino, old_dentry->d_name.name, old_dentry->d_name.len,
            old_creation_time,
            new_dentry->d_name.name, new_dentry->d_name.len,
            &new_creation_time
        );
    } else if (S_ISDIR(old_dentry->d_inode->i_mode)) {
        // TODO update block policies/span policies here, it's kind of a no brainer in this case
        err = eggsfs_cdc_rename_directory(
            info,
            old_dentry->d_inode->i_ino,
            old_dir->i_ino, new_dir->i_ino,
            old_dentry->d_name.name, old_dentry->d_name.len, old_creation_time,
            new_dentry->d_name.name, new_dentry->d_name.len,
            &new_creation_time
        );
    } else {
        err = eggsfs_cdc_rename_file(
            info,
            old_dentry->d_inode->i_ino,
            old_dir->i_ino, new_dir->i_ino,
            old_dentry->d_name.name, old_dentry->d_name.len, old_creation_time,
            new_dentry->d_name.name, new_dentry->d_name.len,
            &new_creation_time
        );
    }
    err = eggsfs_error_to_linux(err);
    if (err) {
        if (err == -ENOENT) {
            // eggsfs_dentry_handle_enoent(old_dentry);
        } else {
            // Conservatively drop both dentries: our cached view is suspect.
            d_drop(old_dentry);
            d_drop(new_dentry); //?
        }
        // new entry? drop as well
    } else {
        old_dentry->d_time = new_creation_time;
        // The target (if any) is replaced by the rename.
        if (new_dentry->d_inode) { inode_dec_link_count(new_dentry->d_inode); }
        d_move(old_dentry, new_dentry);
        // negative old dentry?
    }
out:
    trace_eggsfs_vfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, err);
    eggsfs_debug_print("err=%d", err);
    return err;
}
+22
View File
@@ -0,0 +1,22 @@
#ifndef _EGGSFS_NAMEI_H
#define _EGGSFS_NAMEI_H
#include <linux/fs.h>
#include "counter.h"
EGGSFS_DECLARE_COUNTER(eggsfs_stat_dir_revalidations);
struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags);
int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode);
int eggsfs_rmdir(struct inode* dir, struct dentry* dentry);
int eggsfs_unlink(struct inode* dir, struct dentry* dentry);
int eggsfs_rename(
struct inode* old_dir, struct dentry* old_dentry,
struct inode* new_dir, struct dentry* new_dentry,
unsigned int flags
);
extern struct dentry_operations eggsfs_dentry_ops;
#endif
+337
View File
@@ -0,0 +1,337 @@
#include "net.h"
#include <linux/version.h>
#include "common.h"
#include "trace.h"
// Find the pending request with the given id in the socket's rb-tree of
// outstanding requests, or NULL if there is none. Caller must hold s->lock.
static struct eggsfs_shard_request* get_shard_request(struct eggsfs_shard_socket* s, u64 request_id) __must_hold(s->lock) {
    struct rb_node* n = s->requests.rb_node;
    while (n != NULL) {
        struct eggsfs_shard_request* candidate = container_of(n, struct eggsfs_shard_request, node);
        if (candidate->request_id > request_id) {
            n = n->rb_left;
        } else if (candidate->request_id < request_id) {
            n = n->rb_right;
        } else {
            return candidate;
        }
    }
    return NULL;
}
// sk_data_ready callback: drain all pending UDP datagrams, match each
// response to its outstanding request by id, hand over the skb and complete
// the waiter. Runs in softirq context under sk_callback_lock; skbs that can't
// be matched or validated are dropped.
static void sock_readable(struct sock* sk) {
    struct eggsfs_shard_socket* s;
    struct sk_buff* skb;
    int err;
    u64 request_id;
    read_lock_bh(&sk->sk_callback_lock);
    s = (struct eggsfs_shard_socket*)sk->sk_user_data;
    BUG_ON(!s);
    while (1) {
        skb = skb_recv_udp(sk, 0, 1, &err);
        if (!skb) {
            read_unlock_bh(&sk->sk_callback_lock);
            return;
        }
        if (unlikely(skb_is_nonlinear(skb))) {
            // TODO: need top handle this
            eggsfs_warn_print("dropping nonlinear request");
            goto drop_skb;
        } else if (unlikely(skb_checksum_complete(skb))) {
            eggsfs_warn_print("dropping request with bad checksum");
            goto drop_skb;
        }
        // u32 protocol + u64 req_id
        if (unlikely(skb->len < 12)) {
            eggsfs_warn_print("dropping runt eggsfs request");
            goto drop_skb;
        }
        request_id = get_unaligned_le64(skb->data + 4);
        struct eggsfs_shard_request* req;
        spin_lock_bh(&s->lock); {
            req = get_shard_request(s, request_id);
            if (req) {
                // Transfer skb ownership to the request and remove it from the
                // tree, then wake the waiter.
                BUG_ON(req->skb);
                rb_erase(&req->node, &s->requests);
                req->skb = skb;
                skb = NULL;
                complete(&req->comp);
#if 0
                if (!(req->flags & EGGSFS_SHARD_ASYNC)) {
                } else {
                    struct eggsfs_shard_async_request* areq = container_of(req, struct eggsfs_shard_async_request, request);
                    cancel_delayed_work(&areq->work);
                    queue_work(eggsfs_wq, &areq->callback);
                }
#endif
            }
        } spin_unlock_bh(&s->lock);
        if (!req) {
            // Late response for a request that already timed out, presumably.
            eggsfs_warn_print("could not find request id %llu", request_id);
        }
        if (skb) {
        drop_skb:
            kfree_skb(skb);
        }
    }
}
// timer async request expired
// if skb then retrun
// requeue
// whoever wants to remove request needs to handle timer and workqueue
// Insert `req` into the socket's rb-tree of outstanding requests, keyed by
// request id. Caller must hold s->lock. A duplicate id indicates a bug
// (request ids must be unique per socket).
static void insert_shard_request(struct eggsfs_shard_socket* s, struct eggsfs_shard_request* req) __must_hold(s->lock) {
    u64 request_id = req->request_id;
    struct rb_node** new = &s->requests.rb_node;
    struct rb_node* parent = NULL;
    while (*new) {
        struct eggsfs_shard_request* this = container_of(*new, struct eggsfs_shard_request, node);
        parent = *new;
        if (request_id < this->request_id) { new = &((*new)->rb_left); }
        else if (request_id > this->request_id) { new = &((*new)->rb_right); }
        // %llu, not %lld: request ids are u64.
        else { printk(KERN_ERR "dup id %llu vs. %llu %p vs. %p\n", request_id, this->request_id, req, this); BUG_ON(1); return; }
    }
    rb_link_node(&req->node, parent, new);
    rb_insert_color(&req->node, &s->requests);
}
// Retry schedules, in units of 10ms. Make sure a 10ms tick is representable
// in jiffies (i.e. HZ >= 100), since wait_for_request converts to jiffies.
static_assert(HZ / 100 > 0);
#define SHARD_ATTEMPTS 10
// Per-attempt timeouts for shard requests: exponential backoff capped at 2s.
const static u64 shard_timeouts_10ms[SHARD_ATTEMPTS] = {
    1, 2, 4, 8, 16, 32, 64, 100, 200, 200
};
// 5 times the shard numbers, given that we roughly do 5 shard requests per CDC request
#define CDC_ATTEMPTS 10
const static u64 cdc_timeouts_10ms[CDC_ATTEMPTS] = {
    5, 10, 20, 40, 80, 160, 320, 640, 1000, 1000
};
// Returns:
//
// * n == 0 for success
// * n < 0 for error (notable ones being -ETIMEOUT and -ERESTARTSYS)
//
// Once this is done, the request is guaranteed to not be in the tree anymore.
static int __must_check wait_for_request(struct eggsfs_shard_socket* s, struct eggsfs_shard_request* req, u64 timeout_10ms) {
    // timeout_10ms * 10 is ms; * HZ / 1000 converts ms -> jiffies
    int ret = wait_for_completion_killable_timeout(&req->comp, (timeout_10ms * 10 * HZ) / 1000);
    // We got a response, this is the happy path.
    if (likely(ret > 0)) {
        BUG_ON(!req->skb);
        return 0;
    }
    // We didn't get a response (ret == 0: timeout; ret < 0: killed), it's our
    // job to clean up the request.
    spin_lock_bh(&s->lock);
    // We might have filled the request in the time it took us to get here, in which
    // case `sock_readable` will have already erased it.
    if (likely(!req->skb)) {
        rb_erase(&req->node, &s->requests);
    }
    spin_unlock_bh(&s->lock);
    // If the request was filled in the meantime, we also need to free the skb.
    // Checking req->skb without the lock is fine: after the locked section
    // above nobody else can reach `req` (it's out of the tree either way).
    if (unlikely(req->skb)) {
        kfree_skb(req->skb);
        req->skb = NULL;
    }
    return ret ? ret : -ETIMEDOUT;
}
// Release the underlying kernel socket of a shard socket.
// TODO anything else?
void eggsfs_net_shard_free_socket(struct eggsfs_shard_socket* s) {
    struct socket* sock = s->sock;
    sock_release(sock);
}
int eggsfs_init_shard_socket(struct eggsfs_shard_socket* s) {
struct sockaddr_in addr;
int err;
err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &s->sock);
if (err) { goto out_err; }
write_lock_bh(&s->sock->sk->sk_callback_lock);
BUG_ON(s->sock->sk->sk_user_data);
s->sock->sk->sk_user_data = s;
s->sock->sk->sk_data_ready = sock_readable; // FIXME: keep around the default one
write_unlock_bh(&s->sock->sk->sk_callback_lock);
addr.sin_family = AF_INET;
addr.sin_addr.s_addr = 0;
addr.sin_port = 0;
err = s->sock->ops->bind(s->sock, (struct sockaddr*)&addr, sizeof(addr));
if (err) { goto out_socket; }
s->requests = RB_ROOT;
spin_lock_init(&s->lock);
return 0;
out_socket:
sock_release(s->sock);
out_err:
return err;
}
// TODO: congestion control
// shard_id is just used for debugging/event tracing, and should be -1 if we're going to the CDC
//
// Sends the already-serialized request `p` (`len` bytes) to the address in
// `msg`, retrying on timeout with a per-destination backoff schedule until a
// response arrives or attempts are exhausted. On success returns the response
// skb (owned by the caller); on failure returns an ERR_PTR. `*attempts` is
// set to the number of sends performed.
struct sk_buff* eggsfs_metadata_request(
    struct eggsfs_shard_socket* sock,
    s16 shard_id,
    struct msghdr* msg,
    u64 req_id,
    void* p,
    u32 len,
    u32* attempts
) {
    struct sockaddr_in* addr = (struct sockaddr_in*)msg->msg_name;
    struct eggsfs_shard_request req;
    struct kvec vec;
    int err = -EIO;
    // the request kind byte follows the 4-byte protocol + 8-byte request id
    u8 kind = *((u8*)p + 12);
    trace_eggsfs_metadata_request_enter(msg, req_id, len, shard_id, kind); // which socket?
    req.skb = NULL;
    req.request_id = req_id;
    *attempts = 0;
    vec.iov_base = p;
    vec.iov_len = len;
    // CDC requests use the longer schedule (see cdc_timeouts_10ms above).
    int max_attempts = shard_id < 0 ? CDC_ATTEMPTS : SHARD_ATTEMPTS;
    const u64* timeouts_10ms = shard_id < 0 ? cdc_timeouts_10ms : shard_timeouts_10ms;
    do {
        init_completion(&req.comp);
        BUG_ON(req.skb);
        // Register before sending, so the response can't arrive before we're
        // in the tree.
        spin_lock_bh(&sock->lock);
        insert_shard_request(sock, &req);
        spin_unlock_bh(&sock->lock);
        eggsfs_debug_print("sending request of kind 0x%02x, len %d to %pI4:%d, after %d attempts", (int)kind, len, &addr->sin_addr, ntohs(addr->sin_port), *attempts);
        eggsfs_debug_print("sending to sock=%p iov=%p len=%u", sock->sock, p, len);
        err = kernel_sendmsg(sock->sock, msg, &vec, 1, len);
        if (err < 0) {
            eggsfs_info_print("could not send req %llu of kind 0x%02x to %pI4:%d: %d", req_id, (int)kind, &addr->sin_addr, ntohs(addr->sin_port), err);
            goto out_unregister;
        }
        // On return the request is guaranteed to be out of the tree.
        err = wait_for_request(sock, &req, timeouts_10ms[*attempts]);
        (*attempts)++;
        if (!err) {
            eggsfs_debug_print("got response");
            BUG_ON(!req.skb);
            trace_eggsfs_metadata_request_exit(req_id, *attempts, req.skb->len, 0);
            return req.skb;
        }
        eggsfs_info_print("err=%d", err);
        // Only retry on timeout; anything else (e.g. -ERESTARTSYS) aborts.
        if (err != -ETIMEDOUT || *attempts >= max_attempts) {
            eggsfs_info_print("giving up after %d attempts due to err %d", *attempts, err);
            goto out_err;
        }
    } while (1);
out_unregister:
    spin_lock_bh(&sock->lock);
    // TODO what are the circumstances where this can be true? That is, kernel_sendmsg returns
    // an error, but we get a fill from the shard?
    BUG_ON(req.skb);
    rb_erase(&req.node, &sock->requests);
    spin_unlock_bh(&sock->lock);
out_err:
    trace_eggsfs_metadata_request_exit(req_id, *attempts, 0, err);
    eggsfs_info_print("err=%d", err);
    return ERR_PTR(err);
}
#if 0
// NOTE: disabled prototype of an async, workqueue-driven request path.
// Kept for reference; it references fields (flags, attempt) and globals
// (eggsfs_max_request_retries) that no longer exist in the live structs.
static void eggsfs_shard_async_request_worker(struct work_struct* work) {
    struct eggsfs_shard_async_request* req = container_of(to_delayed_work(work), struct eggsfs_shard_async_request, work);
    struct kvec vec;
    int err;
    eggsfs_debug_print();
    spin_lock_bh(&req->socket->lock);
    if (req->request.skb) { goto out_done; }
    if (req->request.attempt == 0) {
        insert_shard_request(req->socket, &req->request);
    } else if (req->request.attempt >= eggsfs_max_request_retries) {
        rb_erase(&req->request.node, &req->socket->requests);
        req->request.skb = ERR_PTR(-ETIMEDOUT);
        goto out_failed;
    }
    ++req->request.attempt;
    spin_unlock_bh(&req->socket->lock);
    // request payload is stored right after the struct
    vec.iov_base = req + 1;
    vec.iov_len = req->length;
    err = kernel_sendmsg(req->socket->sock, &req->msg, &vec, 1, req->length);
    if (err < 0) {
        spin_lock_bh(&req->socket->lock);
        if (!req->request.skb) {
            rb_erase(&req->request.node, &req->socket->requests);
            req->request.skb = ERR_PTR(err);
            goto out_failed;
        }
        spin_unlock_bh(&req->socket->lock);
    } else {
        queue_delayed_work(eggsfs_wq, &req->work, HZ);
    }
    return;
out_failed:
    spin_unlock_bh(&req->socket->lock);
    queue_work(system_wq, &req->callback);
    return;
out_done:
    spin_unlock_bh(&req->socket->lock);
    return;
}
// this layer needs to be capable of picking sockets / though we still need a list of them
void eggsfs_shard_async_request(struct eggsfs_shard_async_request* req, struct eggsfs_shard_socket* sock, struct msghdr* hdr, u64 req_id, u32 len) {
    eggsfs_debug_print();
    req->socket = sock;
    req->request.request_id = req_id;
    req->request.flags = EGGSFS_SHARD_ASYNC;
    req->request.skb = NULL;
    req->request.attempt = 0;
    req->length = len;
    req->msg = *hdr;
    INIT_DELAYED_WORK(&req->work, eggsfs_shard_async_request_worker);
    queue_delayed_work(eggsfs_wq, &req->work, 0);
}
void eggsfs_shard_async_cleanup(struct eggsfs_shard_async_request* req) {
    cancel_delayed_work_sync(&req->work);
}
#endif
/*
void eggsfs_shard_async_cancel(struct eggsfs_shard_async_request* req) {
spin_lock_bh(&req->socket->lock);
if (!req->request.skb) {
rb_erase(&req.node, &sock->requests);
}
spin_unlock_bh(&req->socket->lock);
cancel_delayed_work_sync(&req->work);
}
*/
+49
View File
@@ -0,0 +1,49 @@
#ifndef _EGGSFS_NET_H
#define _EGGSFS_NET_H

#include <linux/net.h>
#include <net/udp.h>
#include <net/sock.h>
#include <net/inet_common.h>

// Largest UDP payload we expect to fit in one datagram
// (presumably 1500 ethernet MTU - 20 IP - 8 UDP — TODO confirm).
#define EGGSFS_UDP_MTU 1472

// UDP socket to the shard/CDC metadata services, plus the rbtree of requests
// awaiting a response, keyed by request id. `lock` protects `requests`.
struct eggsfs_shard_socket {
    struct socket* sock;
    struct rb_root requests;
    spinlock_t lock;
};

#define EGGSFS_SHARD_ASYNC 1

// One in-flight metadata request. `skb` is NULL until the response arrives;
// the socket data-ready callback fills it and signals `comp`.
struct eggsfs_shard_request {
    struct rb_node node;
    u64 request_id;
    struct sk_buff* skb;
    struct completion comp;
};

int eggsfs_init_shard_socket(struct eggsfs_shard_socket* sock);
void eggsfs_net_shard_free_socket(struct eggsfs_shard_socket* sock);

// Send a serialized request and wait (with retries) for the response skb.
// Returns the skb or an ERR_PTR.
struct sk_buff* eggsfs_metadata_request(
    struct eggsfs_shard_socket* sock, s16 shard_id, struct msghdr* msg, u64 req_id, void* p, u32 len, u32* attempts
);

// Scaffolding for the (currently disabled) async request path.
struct eggsfs_shard_async_request {
    struct eggsfs_shard_request request;
    struct eggsfs_shard_socket* socket;
    struct msghdr msg;
    struct delayed_work work;
    struct work_struct callback;
    u32 length;
};

#if 0
void eggsfs_shard_async_request(struct eggsfs_shard_async_request* req, struct eggsfs_shard_socket* sock, struct msghdr* msg, u64 req_id, u32 len);
void eggsfs_shard_async_cleanup(struct eggsfs_shard_async_request* req);
#endif

#endif
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Dev loop helper: rebuilds the kmod, restarts the test VM, loads the module,
# and opens a tmux session with dmesg + shells. Flags:
#   -run                also start eggsrun inside the VM
#   -deploy             also deploy userspace binaries to the VM
#   -build-type TYPE    build type for -deploy (default alpine-debug)
set -eu -o pipefail

run=false
deploy=false
build_type='alpine-debug'
while [[ "$#" -gt 0 ]]; do
    case "$1" in
    -run)
        run=true
        shift
        ;;
    -deploy)
        deploy=true
        shift
        ;;
    -build-type)
        shift
        build_type="$1"
        shift
        ;;
    *)
        echo "Bad usage"
        exit 2
        ;;
    esac
done

# Kills the current VM, starts it again, deploys to it and builds kmod, loads kmod, opens a tmux
# session with dmesg in one pane and a console in ~/eggs in the other.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
make -j kmod
pkill qemu || true
tmux kill-session -t uovo || true
sleep 1 # sometimes qemu lingers?
# Windows:
#
# 0. dmesg
# 1. free shell
# 2. shell with eggsfs running
# 3. qemu running
tmux new-session -d -s uovo
tmux new-window -t uovo:3 './startvm.sh'
# Wait for VM to go up, and build
while ! scp eggsfs.ko uovo: ; do sleep 1; done
# Start dmesg as soon as it's booted (before we insert module)
tmux send-keys -t uovo:0 "ssh -t uovo dmesg -wHT" Enter
# Insert module
ssh uovo 'sudo insmod eggsfs.ko'
if [[ "$deploy" = true ]]; then
    # Deploy binaries
    (cd ../deploy && ./deploy.py --build-type "$build_type" --upload --host fmazzol@uovo)
fi
# Create shells
tmux new-window -t uovo:1 'ssh -t uovo'
if [[ "$run" = true ]]; then
    tmux new-window -t uovo:2 'ssh -t uovo "cd eggs && ./eggsrun -trace -binaries-dir ~/eggs -data-dir ~/eggs-data/"'
fi
# Attach
tmux attach-session -t uovo:1
# 4349852
# ./eggstests -kmod -filter mounted -cfg fsTest.checkThreads=1 -cfg fsTest.numDirs=10 -cfg fsTest.numFiles=100 -binaries-dir $(pwd)
+130
View File
@@ -0,0 +1,130 @@
#include "rs.h"
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/fpu/api.h>
#include "common.h"
#include "intrshims.h"
// Shims so that the shared rs_core.c (also built in userspace) can run in
// the kernel.
#define rs_malloc(sz) kmalloc(sz, GFP_KERNEL)
#define rs_free(p) kfree(p)
#define die(...) BUG()

// Available Reed-Solomon implementations, from slowest to fastest.
enum rs_cpu_level {
    RS_CPU_SCALAR = 1,
    RS_CPU_AVX2 = 2,
    RS_CPU_GFNI = 3,
};

// No valgrind in the kernel; rs_core.c just needs this to exist.
static inline bool rs_detect_valgrind(void) {
    return false;
}

// Splat one byte across all lanes of an AVX2 register.
#define broadcast_u8(x) \
    ((__m256i)__builtin_ia32_pbroadcastb256((__v16qi){x,0,0,0,0,0,0,0,0,0,0,0,0,0,0}))

// Chosen once at module init by eggsfs_rs_init().
static enum rs_cpu_level eggsfs_rs_cpu_level;
#include "rs_core.c"
// Wrappers that instantiate the rs_core.c parity kernels for a fixed (D, P)
// geometry, one per CPU level. The avx2/gfni variants need the matching
// target attributes so the compiler may emit those instructions.
#define rs_compute_parity_scalar_func(D, P) \
    static void rs_compute_parity_scalar_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_scalar(D, P, r, size, data, parity); \
    }
#define rs_compute_parity_avx2_func(D, P) \
    __attribute__((target("avx,avx2"))) \
    static void rs_compute_parity_avx2_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_avx2(D, P, r, size, data, parity); \
    }
#define rs_compute_parity_gfni_func(D, P) \
    __attribute__((target("avx,avx2,gfni"))) \
    static void rs_compute_parity_gfni_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_gfni(D, P, r, size, data, parity); \
    }
// For a given (D, P): declares the rs context, its init function, the three
// per-level kernels, and a dispatcher that picks one via eggsfs_rs_cpu_level.
#define rs_gen(D, P) \
    static struct rs* rs_##D##_##P = NULL; \
    static int rs_init_##D##_##P(void) { \
        rs_##D##_##P = rs_new_core(eggsfs_mk_parity(D, P)); \
        if (rs_##D##_##P == NULL) { \
            return -ENOMEM; \
        } \
        return 0; \
    } \
    rs_compute_parity_scalar_func(D, P) \
    rs_compute_parity_avx2_func(D, P) \
    rs_compute_parity_gfni_func(D, P) \
    static void rs_compute_parity_##D##_##P(uint64_t size, const uint8_t** data, uint8_t** parity) { \
        switch (eggsfs_rs_cpu_level) { \
        case RS_CPU_SCALAR: \
            rs_compute_parity_scalar_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        case RS_CPU_AVX2: \
            rs_compute_parity_avx2_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        case RS_CPU_GFNI: \
            rs_compute_parity_gfni_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        default: \
            BUG(); \
        } \
    }
// Right now we don't need anything else, let's not needlessly generate tons of code
rs_gen( 4, 4)
rs_gen(10, 4)
// Compute the parity blocks for `data` into `out`, according to the RS
// geometry packed in `parity`. D == 1 degenerates to mirroring; only the
// geometries instantiated via rs_gen above are supported otherwise.
// Caller must wrap this in kernel_fpu_begin()/kernel_fpu_end().
int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out) {
    const int n_parity = eggsfs_parity_blocks(parity);
    if (n_parity == 0) {
        // nothing to do
        return 0;
    }
    if (eggsfs_data_blocks(parity) == 1) {
        // mirroring: every parity block is a copy of the single data block
        int b;
        for (b = 0; b < n_parity; b++) {
            memcpy(out[b], data[0], size);
        }
        return 0;
    }
    if (parity == eggsfs_mk_parity(4, 4)) {
        rs_compute_parity_4_4(size, (const u8**)data, (u8**)out);
        return 0;
    }
    if (parity == eggsfs_mk_parity(10, 4)) {
        rs_compute_parity_10_4(size, (const u8**)data, (u8**)out);
        return 0;
    }
    eggsfs_warn_print("cannot compute with RS(%d,%d)", eggsfs_data_blocks(parity), eggsfs_parity_blocks(parity));
    return -EINVAL;
}
// Module-init hook: pick the best available RS implementation for this CPU
// and allocate the contexts for the supported geometries. Returns 0 or a
// negative errno; on failure nothing is left allocated.
int __init eggsfs_rs_init(void) {
    if (rs_has_cpu_level_core(RS_CPU_GFNI)) {
        eggsfs_info_print("picking GFNI");
        eggsfs_rs_cpu_level = RS_CPU_GFNI;
    } else if (rs_has_cpu_level_core(RS_CPU_AVX2)) {
        eggsfs_info_print("picking AVX2");
        eggsfs_rs_cpu_level = RS_CPU_AVX2;
    } else {
        eggsfs_warn_print("picking scalar execution -- this will be slow.");
        eggsfs_rs_cpu_level = RS_CPU_SCALAR;
    }
    int err;
    err = rs_init_4_4();
    if (err < 0) { return err; }
    err = rs_init_10_4();
    if (err < 0) {
        // Undo the first init; NULL the pointer so that a subsequent
        // eggsfs_rs_exit() (or a retried init) cannot double-free it.
        kfree(rs_4_4);
        rs_4_4 = NULL;
        return err;
    }
    return 0;
}
// Module-exit hook: free the RS contexts allocated by eggsfs_rs_init().
// kfree(NULL) is a no-op, so this is safe even if init partially failed.
void __cold eggsfs_rs_exit(void) {
    kfree(rs_4_4);
    kfree(rs_10_4);
}
#include "gf_tables.c"
+31
View File
@@ -0,0 +1,31 @@
#ifndef _EGGSFS_RS_H
#define _EGGSFS_RS_H

#include <linux/kernel.h>

// A "parity" byte packs the Reed-Solomon geometry: the low nibble is the
// number of data blocks (D), the high nibble the number of parity blocks (P).

// D: number of data blocks (low nibble).
static inline u8 eggsfs_data_blocks(u8 parity) {
    return parity & 0xF;
}

// P: number of parity blocks (high nibble).
static inline u8 eggsfs_parity_blocks(u8 parity) {
    return (parity >> 4) & 0xF;
}

// Total number of blocks, D + P.
static inline u8 eggsfs_blocks(u8 parity) {
    return eggsfs_data_blocks(parity) + eggsfs_parity_blocks(parity);
}

// Pack D and P back into a parity byte.
static inline u8 eggsfs_mk_parity(u8 data, u8 parity) {
    u8 packed = parity << 4;
    return packed | data;
}

// len(data) = eggsfs_data_blocks(parity)
// len(out) = eggsfs_parity_blocks(parity)
//
// You _must_ wrap this with kernel_fpu_begin()/kernel_fpu_end()!
int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out);

int __init eggsfs_rs_init(void);
void __cold eggsfs_rs_exit(void);

#endif
+1
View File
@@ -0,0 +1 @@
../cpp/rs/rs_core.c
+64
View File
@@ -0,0 +1,64 @@
#include <asm/unaligned.h>
#include "bincode.h"
#include "shuckle.h"
#include "common.h"
// Serialize a shuckle request header into `buf` (EGGSFS_SHUCKLE_REQ_HEADER_SIZE
// bytes): protocol version, payload length, request kind. The kind byte is
// counted in the length so callers only pass the body length.
void eggsfs_write_shuckle_req_header(char* buf, u32 req_len, u8 req_kind) {
    put_unaligned_le32(EGGSFS_SHUCKLE_REQ_PROTOCOL_VERSION, buf);
    put_unaligned_le32(req_len + 1, buf + 4); // +1: kind byte included in len
    *(u8*)(buf + 8) = req_kind;
}
// Parse a shuckle response header out of `buf`: validates the protocol
// version, extracts the body length (with the kind byte already excluded)
// and the response kind. Returns 0 or -EINVAL on a malformed header.
int eggsfs_read_shuckle_resp_header(char* buf, u32* resp_len, u8* resp_kind) {
    u32 protocol = get_unaligned_le32(buf);
    if (protocol != EGGSFS_RESP_PROTO_CHECK(protocol)) {
        // unreachable placeholder, see below
    }
    if (protocol != EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION) {
        eggsfs_debug_print("bad shuckle protocol, expected 0x%08x, got 0x%08x", EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION, protocol);
        return -EINVAL;
    }
    u32 len = get_unaligned_le32(buf + 4);
    *resp_len = len;
    if (len == 0) {
        eggsfs_debug_print("unexpected zero-length shuckle response (the kind should at least be there)");
        return -EINVAL;
    }
    u8 kind = *(u8*)(buf + 8);
    *resp_kind = kind;
    eggsfs_debug_print("resp_len=%d, resp_kind=0x%02x", len, (int)kind);
    *resp_len = len - 1; // exclude the kind, it's much nicer for the caller
    return 0;
}
int eggsfs_create_shuckle_socket(const char* shuckle_addr, struct socket** sock) {
int err;
const char* addr_end;
u16 port;
// parse device, which is the shuckle address in 0.0.0.0:0 form (ipv4, port)
struct sockaddr_in addr;
addr.sin_family = AF_INET;
if (in4_pton(shuckle_addr, -1, (u8*)&addr.sin_addr, ':', &addr_end) != 1) {
err = -EINVAL;
goto out_err;
}
err = kstrtou16(addr_end+1, 10, &port);
if (err < 0) { goto out_err; }
addr.sin_port = htons(port);
eggsfs_debug_print("parsed shuckle addr %pI4:%d", &addr.sin_addr, ntohs(addr.sin_port));
// create socket
err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
if (err < 0) { goto out_err; }
// connect
err = kernel_connect(*sock, (struct sockaddr*)&addr, sizeof(addr), 0);
if (err < 0) { goto out_connect; }
eggsfs_debug_print("connected to shuckle");
return 0;
out_connect:
sock_release(*sock);
out_err:
eggsfs_debug_print("failed err=%d", err);
return err;
}
+15
View File
@@ -0,0 +1,15 @@
#ifndef _EGGSFS_SHUCKLE_H
#define _EGGSFS_SHUCKLE_H

#include <linux/kernel.h>
#include <linux/inet.h>

// Shuckle wire headers: little-endian protocol version, length, kind byte.
// The helpers below fold the kind byte into the length so callers only deal
// with body lengths.
#define EGGSFS_SHUCKLE_REQ_HEADER_SIZE (4 + 4 + 1) // protocol + len + kind
void eggsfs_write_shuckle_req_header(char* buf, u32 req_len, u8 req_kind);
#define EGGSFS_SHUCKLE_RESP_HEADER_SIZE (4 + 4 + 1) // protocol + len + kind
int eggsfs_read_shuckle_resp_header(char* buf, u32* resp_len, u8* resp_kind);

// Parse "a.b.c.d:port" and return a connected TCP socket in *sock.
int eggsfs_create_shuckle_socket(const char* shuckle_addr, struct socket** sock);

#endif
+95
View File
@@ -0,0 +1,95 @@
#include "skb.h"
#include <linux/highmem.h>
#include "block.h"
#include "common.h"
// Copy up to `len` bytes starting at `offset` from `skb` into `dstp`,
// walking fragments with the skb seq-read API. Returns the number of bytes
// actually copied (short if the skb ends before `len`).
u32 eggsfs_skb_copy(void* dstp, struct sk_buff* skb, u32 offset, u32 len) {
    struct skb_seq_state seq;
    u32 consumed = 0;
    char* dst = dstp;
    skb_prepare_seq_read(skb, offset, min(offset + len, skb->len), &seq);
    for (;;) {
        const u8* ptr;
        int avail = skb_seq_read(consumed, &ptr, &seq);
        // avail == 0 means the seq read is finished and its state is already
        // cleaned up; no abort needed on this path.
        if (avail == 0) { return consumed; }
        BUG_ON(avail < 0);
        // avail _can_ exceed the upper bound here
        int actual_avail = min((u32)avail, len-consumed);
        memcpy(dst + consumed, ptr, actual_avail);
        consumed += actual_avail;
        if (actual_avail < avail) {
            // we stopped mid-fragment: must abort the in-progress seq read
            skb_abort_seq_read(&seq);
            return consumed;
        }
    }
    BUG();
}
#if 0
// NOTE: disabled variant that streams an skb directly into the request's
// page list while CRC-ing it; kept for reference (see TODO in skb.h).
u32 eggsfs_skb_copy_to_pages(struct eggsfs_block_request* req, struct sk_buff* skb, u32 offset) {
    struct skb_seq_state seq;
    u32 total_consumed = 0;
    struct page* page;
    void* page_ptr;
    u32 page_offset = req->page_offset;
    if (WARN_ON(!req->bytes_left)) { return 0; }
    skb_prepare_seq_read(skb, offset, min(offset + req->bytes_left, skb->len), &seq);
    BUG_ON(list_empty(&req->pages));
    page = list_first_entry(&req->pages, struct page, lru);
    page_ptr = kmap_atomic(page);
    while (total_consumed < req->bytes_left) {
        const u8* ptr;
        int avail = skb_seq_read(total_consumed, &ptr, &seq);
        printk(KERN_INFO "eggsfs: skb_seq_read total_consumed=%u page_offset=%u avail=%d bytes_left=%u\n", total_consumed, page_offset, avail, req->bytes_left);
        if (avail == 0) {
            break;
        }
        while (avail) {
            // FIXME: is the last one needed?
            u32 this_len = min3((u32)avail, (u32)PAGE_SIZE - page_offset, req->bytes_left - total_consumed);
            printk(KERN_INFO "eggsfs: copy to page total_consumed=%u avail=%d this_len=%d page_offset=%u\n", total_consumed, avail, this_len, page_offset);
            req->crc32 = eggsfs_crc(req->crc32, ptr, this_len);
            memcpy((char*)page_ptr + page_offset, ptr, this_len);
            ptr += this_len;
            page_offset += this_len;
            avail -= this_len;
            total_consumed += this_len;
            if (page_offset >= PAGE_SIZE) {
                BUG_ON(page_offset > PAGE_SIZE);
                kunmap_atomic(page_ptr);
                list_rotate_left(&req->pages);
                page = list_first_entry(&req->pages, struct page, lru);
                page_ptr = kmap_atomic(page);
                page_offset = 0;
            }
        }
    }
    skb_abort_seq_read(&seq);
    req->bytes_left -= total_consumed;
    req->page_offset = page_offset;
    if (!req->bytes_left && page_offset) {
        // partially filled page at the end
        memset(page_ptr + page_offset, 0, PAGE_SIZE - page_offset);
        list_rotate_left(&req->pages);
    }
    kunmap_atomic(page_ptr);
    return total_consumed;
}
#endif
+16
View File
@@ -0,0 +1,16 @@
#ifndef _EGGSFS_SKB_H
#define _EGGSFS_SKB_H

#include <linux/skbuff.h>

struct eggsfs_block_request;

// Copy up to `len` bytes at `offset` from an skb into a linear buffer;
// returns the bytes actually copied.
// TODO possibly move this one to blocksimple.c
u32 eggsfs_skb_copy(void* dstp, struct sk_buff* skb, u32 offset, u32 len);

#if 0
// TODO remove this thing below
u32 eggsfs_skb_copy_to_pages(struct eggsfs_block_request* req, struct sk_buff* skb, u32 offset);
#endif

#endif
+324
View File
@@ -0,0 +1,324 @@
#include "bincode.h"
#include "common.h"
#include "span.h"
#include "metadata.h"
#include "rs.h"
#include "latch.h"
#include "crc.h"
#include "err.h"
// Find the span whose [start, end) interval contains `offset` in the rbtree,
// or NULL if no cached span covers it.
static struct eggsfs_span* eggsfs_lookup_span(struct rb_root* spans, u64 offset) {
    struct rb_node* cur = spans->rb_node;
    while (cur != NULL) {
        struct eggsfs_span* candidate = container_of(cur, struct eggsfs_span, node);
        if (offset >= candidate->start && offset < candidate->end) {
            eggsfs_debug_print("off=%llu span=%p", offset, candidate);
            return candidate;
        }
        cur = offset < candidate->start ? cur->rb_left : cur->rb_right;
    }
    eggsfs_debug_print("off=%llu no_span", offset);
    return NULL;
}
// Insert `span` into the tree, keyed by its [start, end) interval. Returns
// false (without inserting) if any existing span overlaps it — which also
// covers the "already cached" case.
static bool eggsfs_insert_span(struct rb_root* spans, struct eggsfs_span* span) {
    struct rb_node** new = &(spans->rb_node);
    struct rb_node* parent = NULL;
    while (*new) {
        struct eggsfs_span* this = container_of(*new, struct eggsfs_span, node);
        parent = *new;
        if (span->end <= this->start) { new = &((*new)->rb_left); }
        else if (span->start >= this->end) { new = &((*new)->rb_right); }
        else {
            // TODO: be loud if there are overlaping spans
            eggsfs_debug_print("span=%p (%llu-%llu) already present", span, span->start, span->end);
            return false;
        }
    }
    rb_link_node(&span->node, parent, new);
    rb_insert_color(&span->node, spans);
    eggsfs_debug_print("span=%p (%llu-%llu) inserted", span, span->start, span->end);
    return true;
}
// Free a span, kfree-ing the enclosing allocation (inline vs block span) as
// determined by its storage class.
static void eggsfs_free_span(struct eggsfs_span* span) {
    if (span->storage_class != EGGSFS_INLINE_STORAGE) {
        kfree(EGGSFS_BLOCK_SPAN(span));
        return;
    }
    kfree(EGGSFS_INLINE_SPAN(span));
}
// Accumulator threaded through the file-spans parsing callbacks below:
// `spans` collects the freshly-decoded spans (in wire order), `err` is the
// first error hit, after which the callbacks become no-ops.
struct eggsfs_get_span_ctx {
    struct list_head spans;
    int err;
};
// Wire-parsing callback for one block-backed span. Validates the geometry,
// allocates an eggsfs_block_span and appends it to ctx->spans; on any error
// sets ctx->err (making subsequent callbacks no-ops).
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs) {
    eggsfs_debug_print("offset=%llu size=%u crc=%08x storage_class=%d parity=%d stripes=%d cell_size=%u", offset, size, crc, storage_class, parity, stripes, cell_size);
    struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
    if (ctx->err) { return; }
    // Validate geometry *before* allocating, so the error paths can't leak.
    if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) {
        // (was printing the data-block count for P as well)
        eggsfs_warn_print("D=%d > %d || P=%d > %d", eggsfs_data_blocks(parity), EGGSFS_MAX_DATA, eggsfs_parity_blocks(parity), EGGSFS_MAX_PARITY);
        ctx->err = -EIO;
        return;
    }
    struct eggsfs_block_span* span = kmalloc(sizeof(struct eggsfs_block_span), GFP_KERNEL);
    if (!span) { ctx->err = -ENOMEM; return; }
    // Bound-check `stripes` against the fixed-size per-stripe arrays before
    // writing into them, so a corrupt response can't overrun the allocation.
    if (stripes > ARRAY_SIZE(span->stripes_crc)) {
        eggsfs_warn_print("stripes=%d > %d", stripes, (int)ARRAY_SIZE(span->stripes_crc));
        kfree(span);
        ctx->err = -EIO;
        return;
    }
    span->span.start = offset;
    span->span.end = offset + size;
    span->span.storage_class = storage_class;
    xa_init(&span->pages);
    int i;
    for (i = 0; i < stripes; i++) {
        eggsfs_latch_init(&span->stripe_latches[i]);
    }
    span->cell_size = cell_size;
    memcpy(span->stripes_crc, stripes_crcs, sizeof(uint32_t)*stripes);
    span->stripes = stripes;
    span->parity = parity;
    eggsfs_debug_print("adding normal span");
    list_add_tail(&span->span.lru, &ctx->spans);
}
// Wire-parsing callback for one block of the most recently decoded span:
// copies the block service endpoints and block identity into slot `block_ix`
// of the span that cb_span just appended to ctx->spans.
void eggsfs_file_spans_cb_block(
    void* data, int block_ix,
    u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
    u64 block_id, u32 crc
) {
    struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
    if (ctx->err) { return; }
    struct eggsfs_span* last = list_last_entry(&ctx->spans, struct eggsfs_span, lru);
    struct eggsfs_block* block = &EGGSFS_BLOCK_SPAN(last)->blocks[block_ix];
    block->id = block_id;
    block->crc = crc;
    block->bs.id = bs_id;
    block->bs.ip1 = ip1;
    block->bs.port1 = port1;
    block->bs.ip2 = ip2;
    block->bs.port2 = port2;
}
// Wire-parsing callback for an inline span: the (tiny) file data arrives in
// the response itself, so we just copy it into a freshly allocated
// eggsfs_inline_span and append it to ctx->spans.
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) {
    eggsfs_debug_print("offset=%llu size=%u len=%u body=%*pE", offset, size, len, len, body);
    struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
    if (ctx->err) { return; }
    struct eggsfs_inline_span* ispan = kmalloc(sizeof(*ispan), GFP_KERNEL);
    if (ispan == NULL) {
        ctx->err = -ENOMEM;
        return;
    }
    ispan->span.storage_class = EGGSFS_INLINE_STORAGE;
    ispan->span.start = offset;
    ispan->span.end = offset + size;
    ispan->len = len;
    memcpy(ispan->body, body, len);
    eggsfs_debug_print("adding inline span");
    list_add_tail(&ispan->span.lru, &ctx->spans);
}
// Return the (cached) span containing `offset` for this inode, fetching the
// file's spans from the shard on a miss. Readers do a lockless rbtree lookup
// validated by a seqcount; writers serialize on spans_wlock and publish
// under the seqcount write section. Returns NULL for out-of-bounds offsets,
// or an ERR_PTR on fetch failure.
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset) {
    unsigned seq;
    struct eggsfs_span* span;
    struct eggsfs_inode_file* file = &enode->file;
    eggsfs_debug_print("ino=%016lx, pid=%d, mu=%p, off=%llu getting span", enode->inode.i_ino, get_current()->pid, &file->spans_wlock, offset);
retry: // seqlock protects against spurious NULLs, if we get a span it is valid
    seq = read_seqcount_begin(&file->spans_seqcount);
    span = eggsfs_lookup_span(&file->spans, offset);
    if (likely(span)) { return span; }
    if (read_seqcount_retry(&file->spans_seqcount, seq)) { goto retry; }
    // Confirmed miss: take the write mutex and fetch from the shard.
    int err = mutex_lock_killable(&file->spans_wlock);
    if (err) { return ERR_PTR(err); }
    span = eggsfs_lookup_span(&file->spans, offset);
    if (!span) { // somebody might have gotten here first
        struct eggsfs_span* tmp;
        struct eggsfs_get_span_ctx ctx = { .err = 0, };
        INIT_LIST_HEAD(&ctx.spans);
        int err = eggsfs_error_to_linux(eggsfs_shard_file_spans(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, offset, &ctx
        ));
        err = err ?: ctx.err;
        if (unlikely(err)) {
            eggsfs_debug_print("failed to get file spans err=%d", err);
            // throw away whatever was decoded before the error
            list_for_each_entry_safe(span, tmp, &ctx.spans, lru) {
                list_del(&span->lru);
                eggsfs_free_span(span);
            }
            span = ERR_PTR(err);
            goto out;
        }
        // Need to disable preemption around seqcount_t write critical section,
        // see docs for seqcount_t.
        preempt_disable();
        write_seqcount_begin(&file->spans_seqcount);
        list_for_each_entry_safe(span, tmp, &ctx.spans, lru) {
            eggsfs_debug_print("inserting span start=%llu, end=%llu", span->start, span->end);
            if (!eggsfs_insert_span(&file->spans, span)) {
                // span already cached
                list_del(&span->lru);
                eggsfs_free_span(span);
            } else {
            }
        }
        write_seqcount_end(&file->spans_seqcount);
        preempt_enable();
        // may still be NULL: offset past EOF yields no covering span
        span = eggsfs_lookup_span(&file->spans, offset);
        // TODO: add to reclaimer LRU _after_ we've looked it up
    }
out:
    mutex_unlock(&file->spans_wlock);
    return span;
}
// Return the page at `page_ix` (page index within the span), fetching the
// whole containing stripe from the data block services on a cache miss.
// Fetched pages are CRC-verified per stripe and owned by span->pages
// afterwards. A per-stripe latch ensures only one task fetches a given
// stripe. Returns the page or an ERR_PTR.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix) {
    // this should be guaranteed by the caller but we rely on it below, so let's check
    if (span->cell_size%PAGE_SIZE != 0) {
        eggsfs_warn_print("cell_size=%u, PAGE_SIZE=%lu, span->cell_size%%PAGE_SIZE=%lu", span->cell_size, PAGE_SIZE, span->cell_size%PAGE_SIZE);
        return ERR_PTR(-EIO);
    }
    struct page* page;
    u32 start_page, end_page, curr_page;
    int err = 0;
again:
    page = xa_load(&span->pages, page_ix);
    if (unlikely(page != NULL)) { return page; }
    // We need to load the stripe
    int D = eggsfs_data_blocks(span->parity);
    u32 span_offset = page_ix * PAGE_SIZE;
    u32 stripe = span_offset / (span->cell_size*D);
    // Stripes are 0-indexed, so `stripe == span->stripes` is already out of
    // bounds for stripe_latches[]/stripes_crc[] — reject with >=, not >.
    // TODO better error?
    if (stripe >= span->stripes) {
        eggsfs_warn_print("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, span->stripes);
        return ERR_PTR(-EIO);
    }
    // page range [start_page, end_page) covered by this stripe
    start_page = (span->cell_size/PAGE_SIZE)*D*stripe;
    end_page = (span->cell_size/PAGE_SIZE)*D*((int)stripe + 1);
    int seqno;
    if (!eggsfs_latch_try_acquire(&span->stripe_latches[stripe], seqno)) {
        // Someone else is fetching this stripe: wait for them, then recheck
        // the cache from the top.
        int err = eggsfs_latch_wait_killable(&span->stripe_latches[stripe], seqno);
        if (err) { return ERR_PTR(err); }
        goto again;
    }
    struct eggsfs_block_socket* socks[EGGSFS_MAX_DATA];
    memset(socks, 0, sizeof(struct eggsfs_block_socket*)*EGGSFS_MAX_DATA);
    struct eggsfs_fetch_block_request* reqs[EGGSFS_MAX_DATA];
    memset(reqs, 0, sizeof(struct eggsfs_fetch_block_request*)*EGGSFS_MAX_DATA);
    // get block services sockets
    int i;
    for (i = 0; i < D; i++) {
        socks[i] = eggsfs_get_fetch_block_socket(&span->blocks[i].bs);
        if (IS_ERR(socks[i])) {
            err = PTR_ERR(socks[i]);
            goto out_err;
        }
    }
    // fetch them all at the right offset
    u32 block_offset = stripe * span->cell_size;
    for (i = 0; i < D; i++) {
        struct eggsfs_block* block = &span->blocks[i];
        reqs[i] = eggsfs_fetch_block(socks[i], block->bs.id, block->id, block_offset, span->cell_size);
        if (IS_ERR(reqs[i])) {
            err = PTR_ERR(reqs[i]);
            goto out_err;
        }
    }
    // Wait for all the block requests
    for (i = 0; i < D; i++) {
        struct eggsfs_fetch_block_request* req = reqs[i];
        err = wait_for_completion_killable(&req->comp);
        if (err) { goto out_err; }
        err = req->err;
        if (err) {
            eggsfs_debug_print("request failed: %d", err);
            goto out_err;
        }
    }
    // Store all the pages into the xarray
    curr_page = start_page;
    u32 crc = 0;
    for (i = 0; i < D; i++) {
        struct eggsfs_fetch_block_request* req = reqs[i];
        // stripe CRC is the concatenation of the per-cell CRCs
        crc = eggsfs_crc32c_append(crc, req->crc, span->cell_size);
        struct page* page;
        list_for_each_entry(page, &req->pages, lru) {
            struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
            if (IS_ERR(old_page)) {
                err = PTR_ERR(old_page);
                eggsfs_debug_print("xa_store failed: %d", err);
                goto out_err;
            }
            BUG_ON(old_page != NULL); // the latch protects against this
            curr_page++;
        }
    }
    // Check the crc
    if (crc != span->stripes_crc[stripe]) {
        err = -EIO;
        eggsfs_debug_print("crc check failed failed, expected %08x, got %08x", span->stripes_crc[stripe], crc);
        goto out_err;
    }
    // OK, we're good, now take ownership of the pages by emptying the pages in the requests
    // so that they won't be free by `eggsfs_put_fetch_block_request`
    for (i = 0; i < D; i++) {
        struct list_head* pages = &reqs[i]->pages;
        while (!list_empty(pages)) {
            list_del(&lru_to_page(pages)->lru);
        }
    }
    page = xa_load(&span->pages, page_ix);
    BUG_ON(page == NULL);
out:
    // Unlock and free everything
    eggsfs_latch_release(&span->stripe_latches[stripe], seqno);
    for (i = 0; i < D; i++) {
        if (reqs[i] == NULL) { continue; }
        eggsfs_put_fetch_block_request(reqs[i]);
    }
    for (i = 0; i < D; i++) {
        if (socks[i] == NULL) { continue; }
        eggsfs_put_fetch_block_socket(socks[i]);
    }
    return err == 0 ? page : ERR_PTR(err);
out_err:
    eggsfs_debug_print("getting span page failed, err=%d", err);
    // we might have partially written the stripe, forget about them
    for (curr_page = start_page; curr_page < end_page; curr_page++) {
        xa_erase(&span->pages, curr_page);
    }
    goto out;
}
+69
View File
@@ -0,0 +1,69 @@
#ifndef _EGGSFS_SPANSIMPLE_H
#define _EGGSFS_SPANSIMPLE_H

#include <linux/kernel.h>
#include "inode.h"

// Common header embedded in both span flavors below; the storage class
// discriminates which flavor a given eggsfs_span* really is.
struct eggsfs_span {
    struct rb_node node; // to look them up
    struct list_head lru; // to decide what to evict
    u64 start;
    u64 end;
    u8 storage_class; // used to determine which type of span this is enclosed in
};

// Span whose contents came inline in the metadata response.
struct eggsfs_inline_span {
    struct eggsfs_span span;
    u8 len; // bytes used in body
    u8 body[255];
};

#define EGGSFS_INLINE_SPAN(_span) ({ \
        BUG_ON(_span->storage_class != EGGSFS_INLINE_STORAGE); \
        container_of(_span, struct eggsfs_inline_span, span); \
    })

// One erasure-coded block of a span, with the block service that stores it.
struct eggsfs_block {
    u64 id;
    struct eggsfs_block_service bs;
    u32 crc;
};

// Span stored as erasure-coded blocks on block services.
// NOTE(review): 15 looks like the max stripes per span — confirm and
// consider naming the constant.
struct eggsfs_block_span {
    struct eggsfs_span span;
    struct xarray pages; // indexed by page number
    struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS];
    struct eggsfs_latch stripe_latches[15]; // one fetcher per stripe at a time
    u32 cell_size;
    u32 stripes_crc[15];
    u8 stripes;
    u8 parity; // packed D/P geometry, see rs.h
};

#define EGGSFS_BLOCK_SPAN(_span) ({ \
        BUG_ON(_span->storage_class == EGGSFS_EMPTY_STORAGE || _span->storage_class == EGGSFS_INLINE_STORAGE); \
        container_of(_span, struct eggsfs_block_span, span); \
    })

// Fetches and caches the span if necessary. `offset` need not be the actual
// span offset, the span containing the offset will be returned.
//
// If the offset is out of bounds, NULL will be returned. Errors might
// be returned if we fail to fetch the span.
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset);

// The page_ix is the page number inside the span.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix);

// Callbacks invoked while decoding a FILE_SPANS response; `data` is a
// struct eggsfs_get_span_ctx*.
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs);
void eggsfs_file_spans_cb_block(
    void* data, int block_ix,
    // block service stuff
    u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
    // block stuff
    u64 block_id, u32 crc
);
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body);

#endif
+20
View File
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -eu -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Old invocation booting the qcow2 image directly, kept for reference:
# qemu-system-x86_64 -s -enable-kvm -cpu host -m 64G -drive "file=${SCRIPT_DIR}/ubuntu-20.04.qcow2,if=virtio" -nic user,hostfwd=tcp::2222-:22,model=virtio-net-pci -smp 50 -virtfs local,path=/data/home/fmazzol/src/eggsfs/kmod/eggs-data,mount_tag=eggs-data,security_model=mapped,id=eggs-data -vnc :1 -monitor stdio
# Boots a dev VM with a custom-built 5.4 kernel for kmod testing:
# serial console only (-nographic), SSH forwarded to host port 2222,
# and the eggs-data directory shared into the guest over 9p (virtfs).
# NOTE(review): kernel/image paths are hard-coded to one dev machine;
# consider deriving them from ${SCRIPT_DIR} like the commented line above.
qemu-system-x86_64 \
-machine accel=kvm,type=q35 \
-enable-kvm \
-cpu host \
-kernel /data/home/fmazzol/src/eggsfs/kmod/linux-5.4.237/arch/x86/boot/bzImage \
-append "root=/dev/sda1 single console=ttyS0 systemd.unit=graphical.target" \
-hda ubuntu-20.04.img \
-hdb init.img \
-m 64G \
-smp 50 \
-nographic \
-nic user,hostfwd=tcp::2222-:22,model=virtio-net-pci \
-virtfs local,path=/data/home/fmazzol/src/eggsfs/kmod/eggs-data,mount_tag=eggs-data,security_model=mapped,id=eggs-data
+274
View File
@@ -0,0 +1,274 @@
#include "super.h"
#include <linux/inet.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include "common.h"
#include "inode.h"
#include "metadata.h"
#include "namei.h"
#include "net.h"
#include "shuckle.h"
#include "bincode.h"
#include "sysfs.h"
#include "err.h"
#include "rs.h"
// Releases the per-mount state: frees the shard socket and the info struct
// itself. Safe to call from both the mount error paths and ->put_super.
// NOTE(review): this is also reached when eggsfs_init_shard_socket() failed —
// confirm eggsfs_net_shard_free_socket() tolerates a partially-initialized socket.
static void eggsfs_free_fs_info(struct eggsfs_fs_info* info) {
    eggsfs_debug_print("info=%p", info);
    eggsfs_net_shard_free_socket(&info->sock);
    kfree(info);
}
static struct eggsfs_fs_info* eggsfs_init_fs_info(const char* dev_name) {
int err;
struct eggsfs_fs_info* info = kmalloc(sizeof(struct eggsfs_fs_info), GFP_KERNEL);
if (!info) { err = -ENOMEM; goto out; }
err = eggsfs_init_shard_socket(&info->sock);
if (err) { goto out_info; }
struct socket* shuckle_sock;
err = eggsfs_create_shuckle_socket(dev_name, &shuckle_sock);
if (err < 0) { goto out_info; }
struct kvec iov;
struct msghdr msg = {NULL};
static_assert(EGGSFS_SHARDS_REQ_SIZE == 0);
char shuckle_req[EGGSFS_SHUCKLE_REQ_HEADER_SIZE];
eggsfs_write_shuckle_req_header(shuckle_req, 0, EGGSFS_SHUCKLE_SHARDS);
int written_so_far;
for (written_so_far = 0; written_so_far < sizeof(shuckle_req);) {
iov.iov_base = shuckle_req + written_so_far;
iov.iov_len = sizeof(shuckle_req) - written_so_far;
int written = kernel_sendmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len);
if (written < 0) { err = written; goto out_sock; }
written_so_far += written;
}
char shards_resp_header[EGGSFS_SHUCKLE_RESP_HEADER_SIZE + 2]; // + 2 = list len
int read_so_far;
for (read_so_far = 0; read_so_far < sizeof(shards_resp_header);) {
iov.iov_base = shards_resp_header + read_so_far;
iov.iov_len = sizeof(shards_resp_header) - read_so_far;
int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
if (read < 0) { err = read; goto out_sock; }
read_so_far += read;
}
u32 shuckle_resp_len;
u8 shuckle_resp_kind;
err = eggsfs_read_shuckle_resp_header(shards_resp_header, &shuckle_resp_len, &shuckle_resp_kind);
if (err < 0) { goto out_sock; }
u16 shard_info_len = get_unaligned_le16(shards_resp_header + sizeof(shards_resp_header) - 2);
if (shard_info_len != 256) {
eggsfs_info_print("expected 256 shard infos, got %d", shard_info_len);
err = -EIO; goto out_sock;
}
if (shuckle_resp_len != 2 + EGGSFS_SHARD_INFO_SIZE*256) {
eggsfs_info_print("expected size of %d, got %d", 2 + EGGSFS_SHARD_INFO_SIZE*256, shuckle_resp_len);
err = -EIO; goto out_sock;
}
int shid;
for (shid = 0; shid < 256; shid++) {
char shard_info_resp[EGGSFS_SHARD_INFO_SIZE];
for (read_so_far = 0; read_so_far < EGGSFS_SHARD_INFO_SIZE;) {
iov.iov_base = shard_info_resp + read_so_far;
iov.iov_len = sizeof(shard_info_resp) - read_so_far;
int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
if (read < 0) { err = read; goto out_sock; }
read_so_far += read;
}
struct eggsfs_bincode_get_ctx ctx = {
.buf = shard_info_resp,
.end = shard_info_resp + sizeof(shard_info_resp),
.err = 0,
};
eggsfs_shard_info_get_start(&ctx, start);
eggsfs_shard_info_get_ip(&ctx, start, shard_ip);
eggsfs_shard_info_get_port(&ctx, shard_ip, shard_port);
eggsfs_shard_info_get_last_seen(&ctx, shard_port, last_seen);
eggsfs_shard_info_get_end(&ctx, last_seen, end);
eggsfs_shard_info_get_finish(&ctx, end);
if (ctx.err != 0) { err = eggsfs_error_to_linux(ctx.err); goto out_sock; }
struct sockaddr_in* addr = &info->shards_addrs[shid];
struct msghdr* hdr = &info->shards_msghdrs[shid];
addr->sin_addr.s_addr = htonl(shard_ip.x);
addr->sin_family = AF_INET;
addr->sin_port = htons(shard_port.x);
eggsfs_debug_print("shard %d has addr %pI4:%d", shid, &addr->sin_addr, ntohs(addr->sin_port));
hdr->msg_name = addr;
hdr->msg_namelen = sizeof(struct sockaddr_in);
hdr->msg_control = NULL;
hdr->msg_controllen = 0;
hdr->msg_flags = 0;
}
static_assert(EGGSFS_CDC_REQ_SIZE == 0);
eggsfs_write_shuckle_req_header(shuckle_req, EGGSFS_SHARDS_REQ_SIZE, EGGSFS_SHUCKLE_CDC);
for (written_so_far = 0; written_so_far < sizeof(shuckle_req);) {
iov.iov_base = shuckle_req + written_so_far;
iov.iov_len = sizeof(shuckle_req) - written_so_far;
int written = kernel_sendmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len);
if (written < 0) { err = written; goto out_sock; }
written_so_far += written;
}
char cdc_resp_header[EGGSFS_SHUCKLE_RESP_HEADER_SIZE];
for (read_so_far = 0; read_so_far < sizeof(cdc_resp_header);) {
iov.iov_base = cdc_resp_header + read_so_far;
iov.iov_len = sizeof(cdc_resp_header) - read_so_far;
int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
if (read < 0) { err = read; goto out_sock; }
read_so_far += read;
}
err = eggsfs_read_shuckle_resp_header(cdc_resp_header, &shuckle_resp_len, &shuckle_resp_kind);
if (err < 0) { goto out_sock; }
if (shuckle_resp_len != EGGSFS_CDC_RESP_SIZE) {
eggsfs_debug_print("expected size of %d, got %d", EGGSFS_CDC_RESP_SIZE, shuckle_resp_len);
err = -EINVAL; goto out_sock;
}
{
char cdc_resp[EGGSFS_CDC_RESP_SIZE];
for (read_so_far = 0; read_so_far < sizeof(cdc_resp);) {
iov.iov_base = (char*)&cdc_resp + read_so_far;
iov.iov_len = sizeof(cdc_resp) - read_so_far;
int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
if (read < 0) { err = read; goto out_sock; }
read_so_far += read;
}
struct eggsfs_bincode_get_ctx ctx = {
.buf = cdc_resp,
.end = cdc_resp + sizeof(cdc_resp),
.err = 0,
};
eggsfs_cdc_resp_get_start(&ctx, start);
eggsfs_cdc_resp_get_ip(&ctx, start, cdc_ip);
eggsfs_cdc_resp_get_port(&ctx, cdc_ip, cdc_port);
eggsfs_cdc_resp_get_last_seen(&ctx, cdc_port, last_seen);
eggsfs_cdc_resp_get_end(&ctx, last_seen, end);
eggsfs_cdc_resp_get_finish(&ctx, end);
if (ctx.err != 0) { err = eggsfs_error_to_linux(ctx.err); goto out_sock; }
struct sockaddr_in* addr = &info->cdc_addr;
struct msghdr* hdr = &info->cdc_msghdr;
addr->sin_addr.s_addr = htonl(cdc_ip.x);
addr->sin_family = AF_INET;
addr->sin_port = htons(cdc_port.x);
eggsfs_debug_print("CDC has addr %pI4:%d", &addr->sin_addr, ntohs(addr->sin_port));
hdr->msg_name = addr;
hdr->msg_namelen = sizeof(struct sockaddr_in);
hdr->msg_control = NULL;
hdr->msg_controllen = 0;
hdr->msg_flags = 0;
}
sock_release(shuckle_sock);
eggsfs_info_print("mount successful");
return info;
out_sock:
sock_release(shuckle_sock);
out_info:
eggsfs_free_fs_info(info);
out:
return ERR_PTR(err);
}
// ->put_super: invoked by generic_shutdown_super() during unmount (only when
// sb->s_root was set, i.e. the mount fully succeeded). Frees the fs info.
static void eggsfs_put_super(struct super_block* sb) {
    eggsfs_debug_print("sb=%p", sb);
    eggsfs_free_fs_info(sb->s_fs_info);
    sb->s_fs_info = NULL; // defensive: nothing should touch it after this
}

static const struct super_operations eggsfs_super_ops = {
    .alloc_inode = eggsfs_inode_alloc,
    .evict_inode = eggsfs_inode_evict,
    .free_inode = eggsfs_inode_free,
    .put_super = eggsfs_put_super,
};
static struct dentry* eggsfs_mount(struct file_system_type* fs_type, int flags, const char* dev_name, void* data) {
int err;
eggsfs_info_print("mounting at %s", dev_name);
eggsfs_debug_print("fs_type=%p flags=%d dev_name=%s data=%p", fs_type, flags, dev_name, data);
struct eggsfs_fs_info* info = eggsfs_init_fs_info(dev_name);
if (IS_ERR(info)) { err = PTR_ERR(info); goto out_err; }
struct super_block* sb = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out_info; }
sb->s_fs_info = info;
sb->s_flags = SB_NOSUID | SB_NODEV | SB_NOEXEC | SB_NOATIME | SB_NODIRATIME;
sb->s_iflags = SB_I_NOEXEC | SB_I_NODEV;
sb->s_op = &eggsfs_super_ops;
sb->s_d_op = &eggsfs_dentry_ops;
struct inode* root = eggsfs_get_inode(sb, EGGSFS_ROOT_INODE);
if (IS_ERR(root)) { err = PTR_ERR(root); goto out_sb; }
struct eggsfs_inode* root_enode = EGGSFS_I(root);
err = eggsfs_do_getattr(root_enode);
if (err) { goto out_sb; }
BUG_ON(
!root_enode->block_policies.len || !root_enode->span_policies.len || !root_enode->target_stripe_size
);
sb->s_root = d_make_root(root);
if (!sb->s_root) { err = -ENOMEM; goto out_sb; }
sb->s_flags |= SB_ACTIVE;
return dget(sb->s_root);
out_sb:
deactivate_locked_super(sb);
out_info:
eggsfs_free_fs_info(info);
out_err:
eggsfs_debug_print("failed err=%d", err);
return ERR_PTR(err);
}
// Unmount hook: anonymous superblock, so kill_anon_super does all the work
// (including calling ->put_super via generic_shutdown_super when mounted).
static void eggsfs_kill_sb(struct super_block* sb) {
    eggsfs_debug_print("sb=%p", sb);
    kill_anon_super(sb);
}

static struct file_system_type eggsfs_fs_type = {
    .owner = THIS_MODULE,
    .name = "eggsfs",
    .mount = eggsfs_mount,
    .kill_sb = eggsfs_kill_sb,
    // TODO is FS_RENAME_DOES_D_MOVE needed?
    .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("eggsfs");

// Registers/unregisters the filesystem type; called from module init/exit.
int __init eggsfs_fs_init(void) {
    return register_filesystem(&eggsfs_fs_type);
}

void __cold eggsfs_fs_exit(void) {
    unregister_filesystem(&eggsfs_fs_type);
}
+25
View File
@@ -0,0 +1,25 @@
#ifndef _EGGSFS_SUPER_H
#define _EGGSFS_SUPER_H

#include <linux/inet.h>

#include "net.h"

// Per-mount state, hung off super_block->s_fs_info. Filled in at mount time
// by querying the shuckle server for all shard addresses and the CDC address.
struct eggsfs_fs_info {
    struct eggsfs_shard_socket sock;
    // NB: ->msg_iter is used by `kernel_sendmsg`, so when you want to use the `struct msghhdr`
    // stored here you should copy it to some local structure, and then use it, otherwise
    // multiple users might step on each others' toes.
    struct sockaddr_in shards_addrs[256]; // one address per shard id
    struct msghdr shards_msghdrs[256];    // pre-filled msghdr pointing at the matching addr
    struct sockaddr_in cdc_addr;
    struct msghdr cdc_msghdr;
};

int __init eggsfs_fs_init(void);
void __cold eggsfs_fs_exit(void);

#endif
Executable
+4
View File
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -eu -o pipefail
# Syncs only the kmod sources (C files, headers, Makefile, python helpers)
# to the `uovo` dev box for building there. -L follows symlinks; the trailing
# --exclude '*' drops everything not matched by an --include.
rsync -vaxAXL --include '*.py' --include '*.c' --include '*.h' --include 'Makefile' --exclude '*' ./ uovo:eggs-kmod/
+106
View File
@@ -0,0 +1,106 @@
#include <linux/sysctl.h>
#include "dir.h"
#include "span.h"
// How many times a metadata request is retried before giving up.
// Tunable at runtime via /proc/sys/fs/eggsfs/max_request_retries (see table below).
int eggsfs_max_request_retries = 3;

// Span-cache reclamation knobs, disabled until reclamation is implemented
// (see the "span cache reclamation" item in the commit message).
#if 0
bool eggsfs_reclaim_span(void) { return false; }

static int eggsfs_drop_spancache_var;

// Write-only sysctl handler: any write drops the whole span cache.
static int eggsfs_drop_spancache_sysctl(struct ctl_table* table, int write, void __user* buffer, size_t* len, loff_t* ppos) {
    int ret;
    ret = proc_dointvec_minmax(table, write, buffer, len, ppos);
    if (ret)
        return ret;
    if (write) {
        int n_spans = 0;
        while (eggsfs_reclaim_span()) { ++n_spans; }
        printk(KERN_INFO "eggsfs: reclaimed %d spans\n", n_spans);
    }
    return 0;
}
#endif
// Declares a read/write sysctl entry for an `unsigned long eggsfs_<name>` global.
#define EGGSFS_CTL_ULONG(_name) \
{ \
.procname = #_name, \
.data = &eggsfs_##_name, \
.maxlen = sizeof(eggsfs_##_name), \
.mode = 0644, \
.proc_handler = proc_doulongvec_minmax, \
}

// Declares a sysctl entry exposing a jiffies variable as milliseconds
// (the proc file is named "<name>_ms").
#define EGGSFS_CTL_INT_TIME(_name) \
{ \
.procname = #_name "_ms", \
.data = &eggsfs_##_name, \
.maxlen = sizeof(eggsfs_##_name), \
.mode = 0644, \
.proc_handler = proc_dointvec_ms_jiffies, \
}
// Leaf entries under /proc/sys/fs/eggsfs/.
static struct ctl_table eggsfs_cb_sysctls[] = {
{
.procname = "max_request_retries",
.data = &eggsfs_max_request_retries,
.maxlen = sizeof(eggsfs_max_request_retries),
.mode = 0644,
.proc_handler = proc_dointvec,
},
EGGSFS_CTL_INT_TIME(dir_refresh_time),
// Span-cache knobs, dormant until reclamation lands (matches the #if 0 above).
#if 0
EGGSFS_CTL_ULONG(spancache_max_size_async),
EGGSFS_CTL_ULONG(spancache_min_avail_mem_async),
EGGSFS_CTL_ULONG(spancache_max_size_sync),
EGGSFS_CTL_ULONG(spancache_min_avail_mem_sync),
{
.procname = "drop_spancache",
.data = &eggsfs_drop_spancache_var,
.maxlen = sizeof(int),
.mode = 0200, // write-only trigger
.proc_handler = eggsfs_drop_spancache_sysctl,
},
#endif
{} // sentinel
};
// Directory scaffolding: fs/ -> eggsfs/ -> the leaf entries above,
// yielding the /proc/sys/fs/eggsfs hierarchy.
static struct ctl_table eggsfs_cb_sysctl_dir[] = {
{
.procname = "eggsfs",
.mode = 0555,
.child = eggsfs_cb_sysctls,
},
{ } // sentinel
};

static struct ctl_table eggsfs_cb_sysctl_root[] = {
{
.procname = "fs",
.mode = 0555,
.child = eggsfs_cb_sysctl_dir,
},
{ } // sentinel
};
static struct ctl_table_header* eggsfs_callback_sysctl_table;

// Registers the /proc/sys/fs/eggsfs hierarchy defined above.
int __init eggsfs_sysctl_init(void) {
    eggsfs_callback_sysctl_table = register_sysctl_table(eggsfs_cb_sysctl_root);
    return eggsfs_callback_sysctl_table == NULL ? -ENOMEM : 0;
}

// Removes the sysctl hierarchy again (module unload).
void __cold eggsfs_sysctl_exit(void) {
    unregister_sysctl_table(eggsfs_callback_sysctl_table);
    eggsfs_callback_sysctl_table = NULL;
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef _EGGSFS_SYSCTL_H
#define _EGGSFS_SYSCTL_H

#include <linux/init.h>

// Registers/unregisters the /proc/sys/fs/eggsfs sysctl entries.
int __init eggsfs_sysctl_init(void);
void __cold eggsfs_sysctl_exit(void);

#endif
+76
View File
@@ -0,0 +1,76 @@
#include "sysfs.h"
#include <linux/kobject.h>
#include <linux/fs.h>
#include "namei.h"
// kset backing /sys/fs/eggsfs/.
static struct kset* eggsfs_kset;

// A sysfs attribute wrapping a per-cpu u64 counter; read-only from userspace.
struct eggsfs_sysfs_counter {
    struct kobj_attribute kobj_attr;
    u64 __percpu * ptr; // the counter being exposed
};
// Sysfs read: sums the per-cpu counter and renders it in decimal.
static ssize_t eggsfs_sysfs_counter_show(struct kobject* kobj, struct kobj_attribute* attr, char* buf) {
    struct eggsfs_sysfs_counter* counter =
        container_of(attr, struct eggsfs_sysfs_counter, kobj_attr);
    return snprintf(buf, PAGE_SIZE, "%llu\n", eggsfs_counter_get(counter->ptr));
}
// Counters are read-only: reject any write.
static ssize_t eggsfs_sysfs_counter_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t count) {
    return -EPERM;
}

// All stat attributes are always visible with their declared mode.
static umode_t eggsfs_stat_visible(struct kobject *kobj, struct attribute *attr, int unused) {
    return attr->mode;
}

#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
{ \
.attr = { .name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
}

// Declares an eggsfs_sysfs_counter exposing the `eggsfs_stat_<name>` per-cpu counter.
#define EGGSFS_SYSFS_COUNTER(_name) \
static struct eggsfs_sysfs_counter eggsfs_sysfs_counter_##_name = { \
.kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, eggsfs_sysfs_counter_show, eggsfs_sysfs_counter_store), \
.ptr = &eggsfs_stat_##_name, \
}

#define EGGSFS_SYSFS_COUNTER_PTR(_name) (&eggsfs_sysfs_counter_##_name.kobj_attr.attr)

EGGSFS_SYSFS_COUNTER(dir_revalidations);

// Everything shown under /sys/fs/eggsfs/stats/.
static struct attribute* eggsfs_stat_attrs[] = {
EGGSFS_SYSFS_COUNTER_PTR(dir_revalidations),
NULL // sentinel
};

static const struct attribute_group eggsfs_stat_attr_group = {
.name = "stats",
.is_visible = eggsfs_stat_visible,
.attrs = eggsfs_stat_attrs,
};
// Creates /sys/fs/eggsfs and its "stats" attribute group.
// Returns 0 on success, negative errno otherwise.
int __init eggsfs_sysfs_init(void) {
    eggsfs_kset = kset_create_and_add("eggsfs", NULL, fs_kobj);
    if (!eggsfs_kset) { return -ENOMEM; }
    int err = sysfs_create_group(&eggsfs_kset->kobj, &eggsfs_stat_attr_group);
    if (err) {
        kset_unregister(eggsfs_kset);
        return err;
    }
    return 0;
}
// Removes /sys/fs/eggsfs (the attribute group goes away with the kset).
void __cold eggsfs_sysfs_exit(void) {
    kset_unregister(eggsfs_kset);
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef _EGGSFS_SYSFS_H
#define _EGGSFS_SYSFS_H

#include <linux/init.h>

// Creates/removes the /sys/fs/eggsfs hierarchy (stats counters).
int __init eggsfs_sysfs_init(void);
void __cold eggsfs_sysfs_exit(void);

#endif
+3
View File
@@ -0,0 +1,3 @@
// Instantiates the tracepoints declared in trace.h exactly once
// (CREATE_TRACE_POINTS must be defined in a single translation unit).
#define CREATE_TRACE_POINTS
#include "trace.h"
+423
View File
@@ -0,0 +1,423 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM eggsfs

// Trace headers are deliberately re-read via TRACE_HEADER_MULTI_READ; the
// guard only prevents plain double inclusion. Fix: the original checked
// `_TRACE_EGGFS_H` (missing an S) but defined `_TRACE_EGGSFS_H`, so the
// guard could never take effect.
#if !defined(_TRACE_EGGSFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EGGSFS_H

#include <linux/tracepoint.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <net/udp.h>
#include <net/sock.h>
#include <linux/socket.h>
#include <linux/pagemap.h>
#include <net/inet_common.h>

#include "inode.h"
#include "rs.h"
// Export the LOOKUP_* flag values to the tracing infrastructure so that
// show_lookup_flags() below can be decoded by userspace tools.
TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
TRACE_DEFINE_ENUM(LOOKUP_PARENT);
TRACE_DEFINE_ENUM(LOOKUP_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_RCU);
TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_OPEN);
TRACE_DEFINE_ENUM(LOOKUP_CREATE);
TRACE_DEFINE_ENUM(LOOKUP_EXCL);
TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
TRACE_DEFINE_ENUM(LOOKUP_ROOT);
TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
TRACE_DEFINE_ENUM(LOOKUP_DOWN);

// Renders a lookup-flags bitmask as "FOLLOW|DIRECTORY|..." in trace output.
#define show_lookup_flags(flags) \
__print_flags(flags, "|", \
{ LOOKUP_FOLLOW, "FOLLOW" }, \
{ LOOKUP_DIRECTORY, "DIRECTORY" }, \
{ LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
{ LOOKUP_PARENT, "PARENT" }, \
{ LOOKUP_REVAL, "REVAL" }, \
{ LOOKUP_RCU, "RCU" }, \
{ LOOKUP_NO_REVAL, "NO_REVAL" }, \
{ LOOKUP_OPEN, "OPEN" }, \
{ LOOKUP_CREATE, "CREATE" }, \
{ LOOKUP_EXCL, "EXCL" }, \
{ LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
{ LOOKUP_JUMPED, "JUMPED" }, \
{ LOOKUP_ROOT, "ROOT" }, \
{ LOOKUP_EMPTY, "EMPTY" }, \
{ LOOKUP_DOWN, "DOWN" })
// Template: event carrying just an inode number.
#define EGGSFS_TRACE_EVENT_inode(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* inode), \
TP_ARGS(inode), \
TP_STRUCT__entry( \
__field(u64, ino) \
), \
TP_fast_assign( \
__entry->ino = inode->i_ino; \
), \
TP_printk("enode=%#llx", __entry->ino) \
)

// Template: inode number plus a return/error code (for *_exit events).
#define EGGSFS_TRACE_EVENT_inode_ret(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* inode, int err), \
TP_ARGS(inode, err), \
TP_STRUCT__entry( \
__field(u64, ino) \
__field(int, err) \
), \
TP_fast_assign( \
__entry->ino = inode->i_ino; \
__entry->err = err; \
), \
TP_printk("enode=%#llx err=%d", __entry->ino, __entry->err) \
)

// Template: parent directory inode plus the dentry name.
#define EGGSFS_TRACE_EVENT_dir_dentry(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* dir, struct dentry* dentry), \
TP_ARGS(dir, dentry), \
TP_STRUCT__entry( \
__field(u64, dir) \
__string(name, dentry->d_name.name) \
), \
TP_fast_assign( \
__entry->dir = dir->i_ino; \
__assign_str(name, dentry->d_name.name); \
), \
TP_printk("dir=%#llx name=%s", __entry->dir, __get_str(name)) \
)

// Template: directory + dentry name + return code.
// NOTE(review): prints "dir=%llx" where the template above uses "dir=%#llx".
#define EGGSFS_TRACE_EVENT_dir_dentry_ret(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* dir, struct dentry* dentry, int err), \
TP_ARGS(dir, dentry, err), \
TP_STRUCT__entry( \
__field(u64, dir) \
__field(int, err) \
__string(name, dentry->d_name.name) \
), \
TP_fast_assign( \
__entry->dir = dir->i_ino; \
__entry->err = err; \
__assign_str(name, dentry->d_name.name); \
), \
TP_printk("dir=%llx name=%s err=%d", __entry->dir, __get_str(name), __entry->err) \
)

// Template: directory + dentry name + target inode (0 for a negative dentry).
#define EGGSFS_TRACE_EVENT_dir_dentry_inode(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* dir, struct dentry* dentry, struct inode* inode), \
TP_ARGS(dir, dentry, inode), \
TP_STRUCT__entry( \
__field(u64, dir) \
__field(u64, ino) \
__string(name, dentry->d_name.name) \
), \
TP_fast_assign( \
__entry->dir = dir->i_ino; \
__entry->ino = inode ? inode->i_ino : 0; \
__assign_str(name, dentry->d_name.name); \
), \
TP_printk("dir=%llx name=%s ino=%llx", __entry->dir, __get_str(name), __entry->ino) \
)

// Template: directory + dentry name + target inode + return code.
#define EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(_name) \
TRACE_EVENT(eggsfs_##_name, \
TP_PROTO(struct inode* dir, struct dentry* dentry, struct inode* inode, int err), \
TP_ARGS(dir, dentry, inode, err), \
TP_STRUCT__entry( \
__field(u64, dir) \
__field(u64, ino) \
__field(int, err) \
__string(name, dentry->d_name.name) \
), \
TP_fast_assign( \
__entry->dir = dir->i_ino; \
__entry->ino = inode ? inode->i_ino : 0; \
__entry->err = err; \
__assign_str(name, dentry->d_name.name); \
), \
TP_printk("dir=%llx name=%s ino=%llx err=%d", __entry->dir, __get_str(name), __entry->ino, __entry->err) \
)
// Instantiations of the templates above, one per VFS hook we trace.

// dcache
EGGSFS_TRACE_EVENT_inode(dcache_delete_inode);
EGGSFS_TRACE_EVENT_inode(dcache_invalidate_dir);
EGGSFS_TRACE_EVENT_dir_dentry(dcache_invalidate_neg_entry);
EGGSFS_TRACE_EVENT_dir_dentry_inode(dcache_delete_entry);
EGGSFS_TRACE_EVENT_dir_dentry_inode(dcache_invalidate_entry);
// inode.c
EGGSFS_TRACE_EVENT_inode(vfs_getattr_enter);
EGGSFS_TRACE_EVENT_inode(vfs_getattr_lock);
EGGSFS_TRACE_EVENT_inode_ret(vfs_getattr_exit);
// dir.c
EGGSFS_TRACE_EVENT_inode(vfs_opendir_enter);
EGGSFS_TRACE_EVENT_inode_ret(vfs_opendir_exit);
EGGSFS_TRACE_EVENT_inode(vfs_closedir_enter);
EGGSFS_TRACE_EVENT_inode_ret(vfs_closedir_exit);
// namei.c
EGGSFS_TRACE_EVENT_dir_dentry(vfs_lookup_enter);
EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_lookup_exit);
EGGSFS_TRACE_EVENT_dir_dentry(vfs_mkdir_enter);
EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_mkdir_exit);
EGGSFS_TRACE_EVENT_dir_dentry_inode(vfs_rmdir_enter);
EGGSFS_TRACE_EVENT_dir_dentry_ret(vfs_rmdir_exit);
EGGSFS_TRACE_EVENT_dir_dentry_inode(vfs_unlink_enter);
EGGSFS_TRACE_EVENT_dir_dentry_ret(vfs_unlink_exit);
// Rename tracing: source and destination directory inodes plus
// parent-name/name strings for both ends of the rename.
// NOTE(review): some __assign_str() lines below have no trailing `;` — this
// relies on the __assign_str macro expansion supplying its own terminator in
// this kernel version; confirm against include/trace/trace_events.h.
TRACE_EVENT(eggsfs_vfs_rename_enter,
TP_PROTO(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry),
TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
TP_STRUCT__entry(
__field(u64, old_dir_ino)
__field(u64, new_dir_ino)
__field(u64, ino)
__string(old_parent, old_dentry->d_parent->d_name.name)
__string(old_name, old_dentry->d_name.name)
__string(new_parent, new_dentry->d_parent->d_name.name)
__string(new_name, new_dentry->d_name.name)
),
TP_fast_assign(
__entry->old_dir_ino = old_dir->i_ino;
__entry->new_dir_ino = new_dir->i_ino;
__entry->ino = old_dentry->d_inode->i_ino;
__assign_str(old_parent, old_dentry->d_parent->d_name.name);
__assign_str(old_name, old_dentry->d_name.name)
__assign_str(new_parent, new_dentry->d_parent->d_name.name);
__assign_str(new_name, new_dentry->d_name.name)
),
TP_printk("ino=%lld old_dir=%lld old_name=%s/%s -> new_dir=%lld new_name=%s/%s", __entry->ino, __entry->old_dir_ino, __get_str(old_parent), __get_str(old_name), __entry->new_dir_ino, __get_str(new_parent), __get_str(new_name))
);

// Same payload as rename_enter, plus the VFS return code.
TRACE_EVENT(eggsfs_vfs_rename_exit,
TP_PROTO(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry, int err),
TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, err),
TP_STRUCT__entry(
__field(u64, old_dir_ino)
__field(u64, new_dir_ino)
__field(u64, ino)
__field(int, err)
__string(old_parent, old_dentry->d_parent->d_name.name)
__string(old_name, old_dentry->d_name.name)
__string(new_parent, new_dentry->d_parent->d_name.name)
__string(new_name, new_dentry->d_name.name)
),
TP_fast_assign(
__entry->old_dir_ino = old_dir->i_ino;
__entry->new_dir_ino = new_dir->i_ino;
__entry->ino = old_dentry->d_inode->i_ino;
__entry->err = err;
__assign_str(old_parent, old_dentry->d_parent->d_name.name);
__assign_str(old_name, old_dentry->d_name.name)
__assign_str(new_parent, new_dentry->d_parent->d_name.name);
__assign_str(new_name, new_dentry->d_name.name)
),
TP_printk("ino=%lld old_dir=%lld old_name=%s/%s -> new_dir=%lld new_name=%s/%s err=%d", __entry->ino, __entry->old_dir_ino, __get_str(old_parent), __get_str(old_name), __entry->new_dir_ino, __get_str(new_parent), __get_str(new_name), __entry->err)
);
// Outgoing metadata request: destination address, request id, payload
// length, target shard (-1 for CDC) and request kind.
TRACE_EVENT(eggsfs_metadata_request_enter,
TP_PROTO(struct msghdr* msg, u64 req_id, u32 len, s16 shard_id, u8 kind),
TP_ARGS(msg, req_id, len, shard_id, kind),
TP_STRUCT__entry(
__array(u8, addr, sizeof(struct sockaddr_in))
__field(u64, req_id)
__field(u32, len)
__field(s16, shard_id) // -1 is used for CDC
__field(u8, kind)
),
TP_fast_assign(
memcpy(__entry->addr, msg->msg_name, sizeof(struct sockaddr_in));
__entry->req_id = req_id;
__entry->len = len;
__entry->shard_id = shard_id;
__entry->kind = kind;
),
TP_printk("dst=%pISp req_id=%llu shard_id=%d kind=%d len=%u", __entry->addr, __entry->req_id, __entry->shard_id, (int)__entry->kind, __entry->len)
);

// Metadata request completion: how many attempts (retries) it took,
// response length, and the final error (0 on success).
TRACE_EVENT(eggsfs_metadata_request_exit,
TP_PROTO(u64 req_id, u32 n_attempts, u32 resp_len, int error),
TP_ARGS(req_id, n_attempts, resp_len, error),
TP_STRUCT__entry(
__field(u64, req_id)
__field(u32, n_attempts)
__field(u32, resp_len)
__field(int, error)
),
TP_fast_assign(
__entry->req_id = req_id;
__entry->n_attempts = n_attempts;
__entry->resp_len = resp_len;
__entry->error = error;
),
TP_printk("req_id=%llu n_attempts=%u resp_len=%u error=%d", __entry->req_id, __entry->n_attempts, __entry->resp_len, __entry->error)
);
// Inode cache lookup (eggsfs_get_inode) entry.
TRACE_EVENT(eggsfs_get_inode_enter,
TP_PROTO(u64 ino),
TP_ARGS(ino),
TP_STRUCT__entry(
__field(u64, ino)
),
TP_fast_assign(
__entry->ino = ino;
),
TP_printk("ino=%lld", __entry->ino)
);

// Lookup result: the inode pointer, whether it was newly allocated,
// and the error (0 on success).
TRACE_EVENT(eggsfs_get_inode_exit,
TP_PROTO(u64 ino, struct inode* inode, bool new, int error),
TP_ARGS(ino, inode, new, error),
TP_STRUCT__entry(
__field(u64, ino)
__field(struct inode*, inode)
__field(bool, new)
__field(int, error)
),
TP_fast_assign(
__entry->ino = ino;
__entry->inode = inode;
__entry->new = new;
__entry->error = error;
),
TP_printk("ino=%lld inode=%p new=%d error=%d", __entry->ino, __entry->inode, __entry->new, __entry->error)
);
// Fired when a dentry turns out not to exist on the server anymore.
TRACE_EVENT(eggsfs_dentry_handle_enoent,
    TP_PROTO(struct dentry* dentry),
    TP_ARGS(dentry),
    TP_STRUCT__entry(
        __field(u64, ino)
        __string(parent, dentry->d_parent->d_name.name)
        __string(name, dentry->d_name.name)
    ),
    TP_fast_assign(
        __entry->ino = dentry->d_inode ? dentry->d_inode->i_ino : 0;
        // Fix: the parent name was assigned to `name` (so `name` was written
        // twice and `parent` was left uninitialized in the ring buffer).
        __assign_str(parent, dentry->d_parent->d_name.name);
        __assign_str(name, dentry->d_name.name);
    ),
    TP_printk("ino=%lld name=%s/%s", __entry->ino, __get_str(parent), __get_str(name))
);
// Start of a block write to a block service.
TRACE_EVENT(eggsfs_block_write_enter,
TP_PROTO(u64 block_service_id, u64 block_id, u32 size),
TP_ARGS(block_service_id, block_id, size),
TP_STRUCT__entry(
__field(u64, block_service_id)
__field(u64, block_id)
__field(u32, size)
),
TP_fast_assign(
__entry->block_service_id = block_service_id;
__entry->block_id = block_id;
__entry->size = size;
),
TP_printk("block_service=%016llx block_id=%016llx size=%u", __entry->block_service_id, __entry->block_id, __entry->size)
);
// Completion of a block write; err is 0 on success, negative errno otherwise.
TRACE_EVENT(eggsfs_block_write_exit,
    TP_PROTO(u64 block_service_id, u64 block_id, int err),
    TP_ARGS(block_service_id, block_id, err),
    TP_STRUCT__entry(
        __field(u64, block_service_id)
        __field(u64, block_id)
        __field(int, err)
    ),
    TP_fast_assign(
        __entry->block_service_id = block_service_id;
        __entry->block_id = block_id;
        __entry->err = err;
    ),
    // Fix: err is a signed errno, so print it with %d (was %u, which would
    // render e.g. -EIO as a huge unsigned number).
    TP_printk("block_service=%016llx block_id=%016llx err=%d", __entry->block_service_id, __entry->block_id, __entry->err)
);
// Start of flushing a span to block services; parity byte is decoded into
// RS(data, parity) via eggsfs_data_blocks/eggsfs_parity_blocks for display.
TRACE_EVENT(eggsfs_span_flush_enter,
TP_PROTO(u64 file_id, u64 offset, u32 size, u8 parity, u8 stripes, u8 storage_class),
TP_ARGS( file_id, offset, size, parity, stripes, storage_class),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, offset)
__field(u32, size)
__field(u8, parity)
__field(u8, stripes)
__field(u8, storage_class)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->offset = offset;
__entry->size = size;
__entry->parity = parity;
__entry->stripes = stripes;
__entry->storage_class = storage_class;
),
TP_printk("file_id=%016llx offset=%llu size=%u parity=RS(%u,%u) stripes=%u, storage_class=%u", __entry->file_id, __entry->offset, __entry->size, eggsfs_data_blocks(__entry->parity), eggsfs_parity_blocks(__entry->parity), (int)__entry->stripes, (int)__entry->storage_class)
);

// Completion of a span flush; err is 0 on success.
TRACE_EVENT(eggsfs_span_flush_exit,
TP_PROTO(u64 file_id, u64 offset, u32 size, int err),
TP_ARGS( file_id, offset, size, err),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, offset)
__field(u32, size)
__field(int, err)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->offset = offset;
__entry->size = size;
__entry->err = err;
),
TP_printk("file_id=%016llx offset=%llu size=%u err=%d", __entry->file_id, __entry->offset, __entry->size, __entry->err)
);

// A span was added to a file.
TRACE_EVENT(eggsfs_span_add,
TP_PROTO(u64 file_id, u64 offset),
TP_ARGS( file_id, offset),
TP_STRUCT__entry(
__field(u64, file_id)
__field(u64, offset)
),
TP_fast_assign(
__entry->file_id = file_id;
__entry->offset = offset;
),
TP_printk("file_id=%016llx offset=%llu", __entry->file_id, __entry->offset)
);

#endif /* _TRACE_EGGSFS_H */

// Standard trace-header footer: makes define_trace.h re-include this file
// to generate the tracepoint definitions.
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
+76
View File
@@ -0,0 +1,76 @@
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Reports a command-line usage error and terminates with exit code 2.
static void bad_usage() {
    fputs("bad usage\n", stderr);
    exit(2);
}
// Parses a non-negative size argument, exiting with a message on bad input.
// (The original used strtoull and compared the ssize_t result to ULLONG_MAX,
// which can never match, so bad input like "12abc" was silently accepted;
// the error messages also said "bad bad_size".)
static long long parse_size(const char* s, const char* flag) {
    char* end = NULL;
    errno = 0;
    long long v = strtoll(s, &end, 0);
    if (errno != 0 || end == s || *end != '\0' || v < 0) {
        fprintf(stderr, "bad %s '%s'\n", flag, s);
        exit(1);
    }
    return v;
}

// Test tool: writes `-size` bytes of zeroes to FILENAME in chunks of
// `-buf-size` bytes (default: everything in one go).
int main(int argc, const char** argv) {
    long long file_size = -1;
    long long buf_size = -1; // if -1, all in one go
    const char* filename = NULL;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-buf-size") == 0) {
            if (i+1 >= argc) { bad_usage(); } i++;
            buf_size = parse_size(argv[i], "-buf-size");
        } else if (strcmp(argv[i], "-size") == 0) {
            if (i+1 >= argc) { bad_usage(); } i++;
            file_size = parse_size(argv[i], "-size");
        } else {
            if (filename != NULL) { bad_usage(); }
            filename = argv[i];
        }
    }
    if (buf_size < 0) { buf_size = file_size; }
    if (file_size < 0 || filename == NULL) { bad_usage(); }
    // A zero buffer size with data to write would loop forever below.
    if (buf_size == 0 && file_size > 0) {
        fprintf(stderr, "-buf-size must be positive\n");
        exit(1);
    }

    printf("writing %lld bytes with bufsize %lld to %s\n", file_size, buf_size, filename);

    int fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666);
    if (fd < 0) {
        perror("open");
        exit(1);
    }
    // calloc rather than malloc: write deterministic zeroes instead of
    // uninitialized heap memory. The +!buf_size avoids a zero-sized
    // allocation (which may legitimately return NULL).
    unsigned char* buffer = calloc(1, buf_size + !buf_size);
    if (buffer == NULL) {
        perror("calloc");
        exit(1);
    }
    while (file_size > 0) {
        ssize_t res = write(fd, buffer, file_size > buf_size ? buf_size : file_size);
        if (res < 0) {
            perror("write");
            exit(1);
        }
        if (res == 0) { // defensive: avoid spinning if write makes no progress
            fprintf(stderr, "write made no progress\n");
            exit(1);
        }
        file_size -= res;
    }
    free(buffer);
    printf("finished writing, will now close\n");
    if (close(fd) < 0) {
        perror("close");
        exit(1);
    }
    printf("done.\n");
    return 0;
}
+7 -5
View File
@@ -3,26 +3,28 @@ set -eu -o pipefail
# NOTE(review): this hunk is rendered from a diff without +/- markers, so it
# interleaves pre-rename lines (go/integrationtest) with their replacements
# (go/eggstests); the eggstests lines and the kmod bincode_tests line are the
# additions of this commit.
echo "$(tput bold)building requisites$(tput sgr0)"
./cpp/build.py alpine rs crc32c # build libs for go
(cd go/msgs && go generate ./...) # build cpp files
(cd go/msgs && go generate ./...)
echo "$(tput bold)go tests$(tput sgr0)"
(cd go && go test ./...)
./cpp/tests.sh
./go/build.py integrationtest
(cd kmod && make bincode_tests && ./bincode_tests)
(cd go/eggstests && go build .)
echo "$(tput bold)integration tests$(tput sgr0)"
set -x
./go/integrationtest/integrationtest -repo-dir $(pwd)
./go/eggstests/eggstests -repo-dir $(pwd)
set +x
echo "$(tput bold)integration tests, sanitized, packet drop$(tput sgr0)"
set -x
./go/integrationtest/integrationtest -repo-dir $(pwd) -build-type sanitized -outgoing-packet-drop 0.1 -short
./go/eggstests/eggstests -repo-dir $(pwd) -build-type sanitized -outgoing-packet-drop 0.1 -short
set +x
echo "$(tput bold)integration tests, valgrind$(tput sgr0)"
set -x
./go/integrationtest/integrationtest -repo-dir $(pwd) -build-type valgrind -short
./go/eggstests/eggstests -repo-dir $(pwd) -build-type valgrind -short
set +x