mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-05-07 12:52:00 -05:00
First version of kernel module
Initial version really by Pawel, but many changes in between. Big outstanding issues: * span cache reclamation (unbounded memory otherwise...) * bad block service detection and workarounds * corrupted blocks detection and workaround Co-authored-by: Paweł Dziepak <pawel.dziepak@xtxmarkets.com>
This commit is contained in:
Vendored
+7
@@ -11,5 +11,12 @@
|
||||
"go/runeggs/runeggs": true,
|
||||
"go/gcdaemon/gcdaemon": true,
|
||||
"go/cli/cli": true,
|
||||
"kmod/*.cmd": true,
|
||||
"kmod/*.o": true,
|
||||
"kmod/*.ko": true,
|
||||
"kmod/*.mod": true,
|
||||
"kmod/*.mod.c": true,
|
||||
"kmod/Module.symvers": true,
|
||||
"kmod/modules.order": true,
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ set -eu -o pipefail
|
||||
|
||||
# build C++, all variants
|
||||
./cpp/build.py alpine
|
||||
./cpp/build.py alpine-debug
|
||||
./cpp/build.py release
|
||||
./cpp/build.py debug
|
||||
./cpp/build.py sanitized
|
||||
|
||||
+2
-2
@@ -59,7 +59,6 @@
|
||||
|
||||
#define EGGSFS_META_LOOKUP 0x1
|
||||
#define EGGSFS_META_STAT_FILE 0x2
|
||||
#define EGGSFS_META_STAT_TRANSIENT_FILE 0x3
|
||||
#define EGGSFS_META_STAT_DIRECTORY 0x4
|
||||
#define EGGSFS_META_READ_DIR 0x5
|
||||
#define EGGSFS_META_CONSTRUCT_FILE 0x6
|
||||
@@ -69,10 +68,11 @@
|
||||
#define EGGSFS_META_SOFT_UNLINK_FILE 0xA
|
||||
#define EGGSFS_META_FILE_SPANS 0xB
|
||||
#define EGGSFS_META_SAME_DIRECTORY_RENAME 0xC
|
||||
#define EGGSFS_META_ADD_INLINE_SPAN 0x10
|
||||
#define EGGSFS_META_STAT_TRANSIENT_FILE 0x3
|
||||
#define EGGSFS_META_SET_DIRECTORY_INFO 0xD
|
||||
#define EGGSFS_META_SNAPSHOT_LOOKUP 0xE
|
||||
#define EGGSFS_META_EXPIRE_TRANSIENT_FILE 0xF
|
||||
#define EGGSFS_META_ADD_INLINE_SPAN 0x10
|
||||
#define EGGSFS_META_VISIT_DIRECTORIES 0x70
|
||||
#define EGGSFS_META_VISIT_FILES 0x71
|
||||
#define EGGSFS_META_VISIT_TRANSIENT_FILES 0x72
|
||||
|
||||
+7
-7
@@ -16,8 +16,8 @@ if(NOT CMAKE_BUILD_TYPE)
|
||||
)
|
||||
endif()
|
||||
|
||||
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|sanitized|valgrind)$"))
|
||||
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, got ${CMAKE_BUILD_TYPE}")
|
||||
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|alpine-debug|sanitized|valgrind)$"))
|
||||
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, alpine-debug got ${CMAKE_BUILD_TYPE}")
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
@@ -33,13 +33,13 @@ add_compile_options("$<$<CONFIG:valgrind>:-march=haswell;-maes;-mgfni>")
|
||||
add_compile_options("$<$<NOT:$<CONFIG:valgrind>>:-march=skylake;-mgfni>")
|
||||
|
||||
# performance/debug stuff
|
||||
add_compile_options("$<$<NOT:$<CONFIG:debug>>:-O3>")
|
||||
add_compile_options("$<$<CONFIG:debug>:-O;-DEGGS_DEBUG>")
|
||||
add_compile_options("$<$<NOT:$<CONFIG:debug,alpine-debug>>:-O3>")
|
||||
add_compile_options("$<$<CONFIG:debug,alpine-debug>:-O;-DEGGS_DEBUG>")
|
||||
|
||||
# We build the release build statically in Alpine
|
||||
add_compile_options("$<$<CONFIG:alpine>:-DEGGS_ALPINE>")
|
||||
add_link_options("$<$<CONFIG:alpine>:-static>")
|
||||
add_link_options("$<$<NOT:$<CONFIG:alpine>>:-no-pie>")
|
||||
add_compile_options("$<$<CONFIG:alpine,alpine-debug>:-DEGGS_ALPINE>")
|
||||
add_link_options("$<$<CONFIG:alpine,alpine-debug>:-static>")
|
||||
add_link_options("$<$<NOT:$<CONFIG:alpine,alpine-debug>>:-no-pie>")
|
||||
|
||||
# sanitizer options
|
||||
set(SANITIZE_OPTIONS "-fsanitize=undefined,address,integer,function;-fno-sanitize-recover=all;-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/ubsan-ignorelist")
|
||||
|
||||
+3
-3
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print(f'Usage: {sys.argv[0]} release|alpine|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
|
||||
print(f'Usage: {sys.argv[0]} release|alpine|alpine-debug|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
|
||||
sys.exit(2)
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
@@ -19,9 +19,9 @@ repo_dir = cpp_dir.parent
|
||||
build_dir = cpp_dir / 'build' / build_type
|
||||
build_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if build_type == 'alpine' and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
|
||||
if build_type in ('alpine', 'alpine-debug') and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
|
||||
subprocess.run(
|
||||
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', 'alpine'] + sys.argv[2:],
|
||||
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', build_type] + sys.argv[2:],
|
||||
check=True,
|
||||
)
|
||||
else:
|
||||
|
||||
+1
-1
@@ -264,7 +264,6 @@ struct MakeDirectoryStateMachine {
|
||||
void createDirectoryInode() {
|
||||
auto& shardReq = env.needsShard(MAKE_DIRECTORY_CREATE_DIR, state.dirId().shard()).setCreateDirectoryInode();
|
||||
shardReq.id = state.dirId();
|
||||
shardReq.info = req.info;
|
||||
shardReq.ownerId = req.ownerId;
|
||||
}
|
||||
|
||||
@@ -738,6 +737,7 @@ struct SoftUnlinkDirectoryStateMachine {
|
||||
shardReq.name = req.name;
|
||||
shardReq.targetId = req.targetId;
|
||||
shardReq.wasMoved = false;
|
||||
shardReq.creationTime = req.creationTime;
|
||||
}
|
||||
|
||||
void afterRollback(EggsError err, const ShardRespContainer* resp) {
|
||||
|
||||
@@ -3,26 +3,6 @@
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const BincodeBytesRef& x) {
|
||||
return goLangBytesFmt(out, x.data(), x.size());
|
||||
/*
|
||||
out << "b\"";
|
||||
uint8_t len = x.size();
|
||||
const uint8_t* data = (const uint8_t*)x.data();
|
||||
for (int i = 0; i < len; i++) {
|
||||
uint8_t ch = data[i];
|
||||
if (isprint(ch)) {
|
||||
out << ch;
|
||||
} else if (ch == 0) {
|
||||
out << "\\0";
|
||||
} else {
|
||||
const char cfill = out.fill();
|
||||
out << std::hex << std::setfill('0');
|
||||
out << "\\x" << std::setw(2) << (int)ch;
|
||||
out << std::setfill(cfill) << std::dec;
|
||||
}
|
||||
}
|
||||
out << "\"";
|
||||
return out;
|
||||
*/
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const BincodeBytes& x) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <netinet/in.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <iomanip>
|
||||
|
||||
// Throwing in static initialization is nasty, and there is no useful stacktrace
|
||||
// Also use direct syscalls to write the error as iostream might not be initialized
|
||||
@@ -48,4 +49,27 @@ std::ostream& goLangBytesFmt(std::ostream& out, const char* str, size_t len) {
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes) {
|
||||
return goLangBytesFmt(out, bytes.str, bytes.len);
|
||||
}
|
||||
|
||||
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* data, size_t len) {
|
||||
out << "\"";
|
||||
for (int i = 0; i < len; i++) {
|
||||
uint8_t ch = data[i];
|
||||
if (isprint(ch)) {
|
||||
out << ch;
|
||||
} else if (ch == 0) {
|
||||
out << "\\0";
|
||||
} else {
|
||||
const char cfill = out.fill();
|
||||
out << std::hex << std::setfill('0');
|
||||
out << "\\x" << std::setw(2) << (int)ch;
|
||||
out << std::setfill(cfill) << std::dec;
|
||||
}
|
||||
}
|
||||
out << "\"";
|
||||
return out;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes) {
|
||||
return goLangQuotedStringFmt(out, bytes.str, bytes.len);
|
||||
}
|
||||
+12
-1
@@ -52,4 +52,15 @@ struct GoLangBytesFmt {
|
||||
GoLangBytesFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
|
||||
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
|
||||
|
||||
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* str, size_t len);
|
||||
|
||||
struct GoLangQuotedStringFmt {
|
||||
const char* str;
|
||||
size_t len;
|
||||
|
||||
GoLangQuotedStringFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes);
|
||||
+987
-987
File diff suppressed because it is too large
Load Diff
+546
-546
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
add_library(crc32c crc32c.h crc32c.cpp iscsi.hpp)
|
||||
add_library(crc32c crc32c.h crc32c.c iscsi.h)
|
||||
|
||||
add_executable(crc32c-tables tables.cpp iscsi.hpp)
|
||||
add_executable(crc32c-tables tables.cpp iscsi.h)
|
||||
|
||||
add_executable(crc32c-tests tests.cpp)
|
||||
target_link_libraries(crc32c-tests PRIVATE crc32c)
|
||||
|
||||
@@ -1,11 +1,29 @@
|
||||
#ifndef __KERNEL__
|
||||
|
||||
#include "crc32c.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define kernel_static
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint32_t u32;
|
||||
|
||||
#else
|
||||
|
||||
#define kernel_static static
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __clang__
|
||||
__attribute__((no_sanitize("integer")))
|
||||
static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_blocks) {
|
||||
#endif
|
||||
#ifdef __KERNEL__
|
||||
__attribute__((target("crc32,pclmul")))
|
||||
#endif
|
||||
static u32 crc32_fusion_kernel(u32 acc_a, const char* buf, size_t n_blocks) {
|
||||
size_t stride = n_blocks * 24 + 8;
|
||||
// Four chunks:
|
||||
// Chunk A: 0 through stride
|
||||
@@ -18,8 +36,8 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
|
||||
__m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
|
||||
__m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
|
||||
__m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
|
||||
uint32_t acc_b = 0;
|
||||
uint32_t acc_c = 0;
|
||||
u32 acc_b = 0;
|
||||
u32 acc_c = 0;
|
||||
// Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
|
||||
// k1 == magic(4*128+32-1)
|
||||
// k2 == magic(4*128-32-1)
|
||||
@@ -100,16 +118,16 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
|
||||
stack_a = ~stack_a;
|
||||
stack_b = ~stack_b;
|
||||
stack_c = ~stack_c;
|
||||
uint32_t magic_a = ((uint32_t)0x80000000) >> (bits_a & 31); bits_a >>= 5;
|
||||
uint32_t magic_b = ((uint32_t)0x80000000) >> (bits_b & 31); bits_b >>= 5;
|
||||
uint32_t magic_c = ((uint32_t)0x80000000) >> (bits_c & 31); bits_c >>= 5;
|
||||
u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
|
||||
u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
|
||||
u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
|
||||
bits_a -= bits_b;
|
||||
bits_b -= bits_c;
|
||||
for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
|
||||
for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
|
||||
for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
|
||||
for (;;) {
|
||||
uint32_t low = stack_a & 1;
|
||||
u32 low = stack_a & 1;
|
||||
if (!(stack_a >>= 1)) break;
|
||||
__m128i x = _mm_cvtsi32_si128(magic_a);
|
||||
uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
|
||||
@@ -130,13 +148,18 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
|
||||
// Apply missing <<32 and fold down to 32-bits.
|
||||
uint32_t crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
|
||||
u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
|
||||
crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
|
||||
return crc;
|
||||
}
|
||||
|
||||
#ifdef __clang__
|
||||
__attribute__((no_sanitize("integer")))
|
||||
uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
|
||||
#endif
|
||||
#ifdef __KERNEL__
|
||||
__attribute__((target("crc32")))
|
||||
#endif
|
||||
kernel_static u32 crc32c(u32 crc, const char* buf, size_t length) {
|
||||
crc = ~crc; // preset to -1 (distinguish leading zeros)
|
||||
if (length >= 31) {
|
||||
size_t n_blocks = (length - 16) / 136;
|
||||
@@ -149,7 +172,7 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
|
||||
const char* kernel_start = kernel_end - kernel_length;
|
||||
length -= kernel_start - buf;
|
||||
for (; buf != kernel_start; ++buf) {
|
||||
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
|
||||
crc = _mm_crc32_u8(crc, *(const u8*)buf);
|
||||
}
|
||||
if (n_blocks) {
|
||||
length -= kernel_length;
|
||||
@@ -161,17 +184,34 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
|
||||
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
|
||||
}
|
||||
for (; length; --length, ++buf) {
|
||||
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
|
||||
crc = _mm_crc32_u8(crc, *(const u8*)buf);
|
||||
}
|
||||
return ~crc; // post-invert -- distinguish scaled multiples
|
||||
}
|
||||
|
||||
#include "iscsi.hpp"
|
||||
#ifdef __clang__
|
||||
__attribute__((no_sanitize("integer")))
|
||||
#endif
|
||||
#ifdef __KERNEL__
|
||||
__attribute__((target("crc32")))
|
||||
#endif
|
||||
kernel_static u32 crc32c_simple(u32 crc, const char* buf, size_t length) {
|
||||
crc = ~crc; // preset to -1 (distinguish leading zeros)
|
||||
for (; length >= 8; length -= 8, buf += 8) {
|
||||
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
|
||||
}
|
||||
for (; length; --length, ++buf) {
|
||||
crc = _mm_crc32_u8(crc, *(const u8*)buf);
|
||||
}
|
||||
return ~crc; // post-invert -- distinguish scaled multiples
|
||||
}
|
||||
|
||||
#include "iscsi.h"
|
||||
|
||||
// In the comments below, multiplication is meant to be modulo ISCSI_POLY.
|
||||
|
||||
// Stores x^2^0, ..., x^2^31. Can be generated with `mult_mod_p`, see tables.cpp.
|
||||
static uint32_t CRC_POWER_TABLE[31] = {
|
||||
static u32 CRC_POWER_TABLE[31] = {
|
||||
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
|
||||
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
|
||||
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a,
|
||||
@@ -182,7 +222,7 @@ static uint32_t CRC_POWER_TABLE[31] = {
|
||||
};
|
||||
|
||||
// Stores x^-(2^0), ..., x^-(2^31). Can be generated with `mult_mod_p`, see tables.cpp.
|
||||
static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
|
||||
static u32 CRC_INVERSE_POWER_TABLE[31] = {
|
||||
0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
|
||||
0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96,
|
||||
0xcedac2cc, 0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916,
|
||||
@@ -193,7 +233,7 @@ static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
|
||||
};
|
||||
|
||||
// Return x^(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
|
||||
static uint32_t x2n_mod_p(size_t n, uint32_t k) {
|
||||
static u32 x2n_mod_p(size_t n, u32 k) {
|
||||
// CRC_POWER_TABLE has all powers of two can multiply combinations
|
||||
// of these to achieve any power.
|
||||
|
||||
@@ -210,7 +250,7 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
|
||||
// We walk along the p_is above and keep adding to the result.
|
||||
//
|
||||
// Note that 2^2^(31 + k) = 2^2^k TODO clarify
|
||||
uint32_t p = 1u << 31;
|
||||
u32 p = 1u << 31;
|
||||
for (; n != 0; n >>= 1, k++) {
|
||||
if (n & 1) {
|
||||
// p(x) = p(x) * 2^(k % 31)
|
||||
@@ -221,8 +261,8 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
|
||||
}
|
||||
|
||||
// Return x^-(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
|
||||
static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
|
||||
uint32_t p = 1u << 31;
|
||||
static u32 x2n_mod_p_inv(size_t n, u32 k) {
|
||||
u32 p = 1u << 31;
|
||||
for (; n != 0; n >>= 1, k++) {
|
||||
if (n & 1) {
|
||||
p = crc32c_mult_mod_p(CRC_INVERSE_POWER_TABLE[k & 0x1F], p);
|
||||
@@ -231,7 +271,7 @@ static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
|
||||
return p;
|
||||
}
|
||||
|
||||
uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
|
||||
kernel_static u32 crc32c_zero_extend(u32 crc, ssize_t zeros) {
|
||||
if (zeros > 0) {
|
||||
return ~crc32c_mult_mod_p(x2n_mod_p(zeros, 3), ~crc);
|
||||
} else {
|
||||
@@ -239,17 +279,17 @@ uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t crc32c_append(uint32_t crc_a, uint32_t crc_b, size_t len_b) {
|
||||
kernel_static u32 crc32c_append(u32 crc_a, u32 crc_b, size_t len_b) {
|
||||
// We need to extend crc_a with len_b*8 zeros (len_b is the number
|
||||
// of bytes) (without the inversion).
|
||||
// This amounts to performing `crc_a * x^(len_b*8)`.
|
||||
return crc32c_mult_mod_p(x2n_mod_p(len_b, 3), crc_a) ^ crc_b;
|
||||
}
|
||||
|
||||
uint32_t crc32c_xor(uint32_t crc_a, uint32_t crc_b, size_t len) {
|
||||
kernel_static u32 crc32c_xor(u32 crc_a, u32 crc_b, size_t len) {
|
||||
// We need to to extend crc_a with the crc of len*8 bits.
|
||||
// We could do this in 32 steps rather than 32+32 with a dedicated
|
||||
// table, but probably doesn't matter.
|
||||
uint32_t crc_0 = ~crc32c_mult_mod_p(~(uint32_t)0, x2n_mod_p(len, 3));
|
||||
u32 crc_0 = ~crc32c_mult_mod_p(~(u32)0, x2n_mod_p(len, 3));
|
||||
return crc_a ^ crc_b ^ crc_0;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
#define ISCSI_POLY 0x82F63B78u
|
||||
|
||||
// Return a(x) multiplied by b(x) modulo ISCSI_POLY, For speed, this requires
|
||||
// that a(x) not be zero.
|
||||
static u32 crc32c_mult_mod_p(u32 a, u32 b) {
|
||||
// m goes from x^0 to x^31
|
||||
u32 m = 1u << 31;
|
||||
u32 p = 0;
|
||||
for (;;) {
|
||||
// If a(x) contains x^n, add b(x)*x^n
|
||||
if (a & m) {
|
||||
p ^= b;
|
||||
// Exit when there are no higher bits in a(x)
|
||||
if ((a & (m - 1)) == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Go from x^n to x^(n+1)
|
||||
m >>= 1;
|
||||
// Go from b(x)*x^n to b(x)*x^(n+1)
|
||||
b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
@@ -30,6 +30,7 @@ int main() {
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
auto s1 = randString(1 + wyhash64(&rand)%100);
|
||||
uint32_t crc1 = crc32c(0, (const char*)s1.data(), s1.size());
|
||||
ASSERT(crc1 == crc32c_append(0, crc1, s1.size()));
|
||||
auto s2 = randString(1 + wyhash64(&rand)%100);
|
||||
uint32_t crc2 = crc32c(0, (const char*)s2.data(), s2.size());
|
||||
std::vector<uint8_t> s = s1;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_library(rs rs.h rs.cpp gf.hpp gf_tables.cpp)
|
||||
add_library(rs rs.h rs.cpp gf_tables.c)
|
||||
|
||||
add_executable(rs-tests tests.cpp)
|
||||
target_link_libraries(rs-tests PRIVATE rs)
|
||||
|
||||
-110
@@ -1,110 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
extern const uint8_t rs_gf_inv_table[256];
|
||||
extern const uint8_t rs_gf_log_table[256];
|
||||
extern const uint8_t rs_gf_exp_table[256];
|
||||
|
||||
inline uint8_t gf_inv(uint8_t x) {
|
||||
return rs_gf_inv_table[x];
|
||||
}
|
||||
|
||||
inline uint8_t gf_mul(uint8_t x, uint8_t y) {
|
||||
if (x == 0 || y == 0) {
|
||||
return 0;
|
||||
}
|
||||
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
|
||||
return rs_gf_exp_table[i > 254 ? i - 255 : i];
|
||||
}
|
||||
|
||||
inline void gf_mul_expand_factor(uint8_t x, uint8_t* expanded_x) {
|
||||
for (int i = 0; i < 16; i++) {
|
||||
expanded_x[i] = gf_mul(i, x);
|
||||
}
|
||||
for (int i = 0; i < 16; i++) {
|
||||
expanded_x[16 + i] = gf_mul(i << 4, x);
|
||||
}
|
||||
}
|
||||
|
||||
inline uint8_t gf_mul_expanded(uint8_t x, const uint8_t* expanded_y) {
|
||||
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
|
||||
}
|
||||
|
||||
inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
|
||||
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
|
||||
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
|
||||
|
||||
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
|
||||
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
|
||||
|
||||
return _mm256_xor_si256(
|
||||
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
|
||||
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
|
||||
);
|
||||
}
|
||||
|
||||
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
|
||||
// just Gaussian elimination.
|
||||
//
|
||||
// TODO this is in row-major, it'd be nice to have it in column-major
|
||||
// to save have the final operation to be more natural in rs_recover.
|
||||
__attribute__((noinline))
|
||||
static bool rs_gf_invert_matrix(uint8_t* in_mat, uint8_t* out_mat, const int n) {
|
||||
int i, j, k;
|
||||
uint8_t temp;
|
||||
|
||||
// Set out_mat[] to the identity matrix
|
||||
memset(out_mat, 0, n*n);
|
||||
for (i = 0; i < n; i++) {
|
||||
out_mat[i * n + i] = 1;
|
||||
}
|
||||
|
||||
// Inverse
|
||||
for (i = 0; i < n; i++) {
|
||||
// Check for 0 in pivot element
|
||||
if (in_mat[i * n + i] == 0) {
|
||||
// Find a row with non-zero in current column and swap
|
||||
for (j = i + 1; j < n; j++) {
|
||||
if (in_mat[j * n + i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (j == n) { // Couldn't find means it's singular
|
||||
return false;
|
||||
}
|
||||
|
||||
for (k = 0; k < n; k++) { // Swap rows i,j
|
||||
temp = in_mat[i * n + k];
|
||||
in_mat[i * n + k] = in_mat[j * n + k];
|
||||
in_mat[j * n + k] = temp;
|
||||
|
||||
temp = out_mat[i * n + k];
|
||||
out_mat[i * n + k] = out_mat[j * n + k];
|
||||
out_mat[j * n + k] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
|
||||
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
|
||||
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
|
||||
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
|
||||
}
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
if (j == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
temp = in_mat[j * n + i];
|
||||
for (k = 0; k < n; k++) {
|
||||
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
|
||||
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -1,7 +1,9 @@
|
||||
// generated with gf_tables.py
|
||||
#ifndef __KERNEL__
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
extern const uint8_t rs_gf_exp_table[256] = {
|
||||
const uint8_t rs_gf_exp_table[256] = {
|
||||
0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
|
||||
0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
|
||||
0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
|
||||
@@ -20,7 +22,7 @@ extern const uint8_t rs_gf_exp_table[256] = {
|
||||
0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01,
|
||||
};
|
||||
|
||||
extern const uint8_t rs_gf_log_table[256] = {
|
||||
const uint8_t rs_gf_log_table[256] = {
|
||||
0x00, 0xff, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
|
||||
0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
|
||||
0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
|
||||
@@ -39,7 +41,7 @@ extern const uint8_t rs_gf_log_table[256] = {
|
||||
0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
|
||||
};
|
||||
|
||||
extern const uint8_t rs_gf_inv_table[256] = {
|
||||
const uint8_t rs_gf_inv_table[256] = {
|
||||
0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7,
|
||||
0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2,
|
||||
0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
|
||||
+76
-337
@@ -6,108 +6,10 @@
|
||||
#include <array>
|
||||
|
||||
#include "rs.h"
|
||||
#include "gf.hpp"
|
||||
|
||||
#define die(...) do { fprintf(stderr, __VA_ARGS__); raise(SIGABRT); } while(false)
|
||||
|
||||
static void* malloc_or_die(size_t size, const char* what) {
|
||||
void* ptr = malloc(size);
|
||||
if (ptr == nullptr) {
|
||||
die(what);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
struct rs {
|
||||
uint8_t parity;
|
||||
// uint8_t[D*B], in column-major.
|
||||
uint8_t* matrix;
|
||||
// uint8_t[D*P][32], in column-major. These are the lookup tables
|
||||
// to perform multiplication quickly.
|
||||
uint8_t* expanded_matrix;
|
||||
};
|
||||
|
||||
static struct rs* rs_cached[256] = {
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
};
|
||||
|
||||
// Note that we pervasively assume that the first column of the parity columns
|
||||
// is 1s, which causes the first parity to be the XORs of the data. So you can't
|
||||
// really change how the matrix is generated.
|
||||
static void rs_cauchy_matrix(struct rs* r) {
|
||||
int D = rs_data_blocks(r->parity);
|
||||
int B = rs_blocks(r->parity);
|
||||
uint8_t* matrix = r->matrix;
|
||||
memset(matrix, 0, D*B);
|
||||
// Identity in the d*d upper half
|
||||
for (int i = 0; i < D; i++) {
|
||||
matrix[D*i + i] = 1;
|
||||
}
|
||||
// Fill in the rest using cauchy
|
||||
for (int col = D; col < B; col++) {
|
||||
for (int row = 0; row < D; row++) {
|
||||
matrix[col*D + row] = gf_inv(col ^ row);
|
||||
}
|
||||
}
|
||||
// Scale the columns
|
||||
for (int col = D; col < B; col++) {
|
||||
uint8_t factor = gf_inv(matrix[col*D]);
|
||||
for (int row = 0; row < D; row++) {
|
||||
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
|
||||
}
|
||||
}
|
||||
// Scale the rows
|
||||
for (int row = 1; row < D; row++) {
|
||||
uint8_t factor = gf_inv(matrix[D*D + row]);
|
||||
for (int col = D; col < B; col++) {
|
||||
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct rs* rs_new(uint8_t parity) {
|
||||
int B = rs_blocks(parity);
|
||||
int D = rs_data_blocks(parity);
|
||||
int P = rs_parity_blocks(parity);
|
||||
struct rs* r = (struct rs*)malloc_or_die(sizeof(struct rs) + B*D + D*P*32, "rs_new\n");
|
||||
r->parity = parity;
|
||||
r->matrix = (uint8_t*)(r + 1);
|
||||
r->expanded_matrix = r->matrix + B*D;
|
||||
rs_cauchy_matrix(r);
|
||||
for (int p = 0; p < P; p++) {
|
||||
for (int d = 0; d < D; d++) {
|
||||
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static void rs_delete(struct rs* r) {
|
||||
free(r);
|
||||
}
|
||||
|
||||
static std::array<uint32_t, 4> rs_cpuidex(uint32_t function_id, uint32_t subfunction_id) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
|
||||
return {a, b, c, d};
|
||||
}
|
||||
|
||||
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
|
||||
#define rs_malloc malloc
|
||||
#define rs_free free
|
||||
|
||||
// See `valgrind.h`
|
||||
static uint64_t rs_valgrind_client_request(uint64_t defaultResult, uint64_t reqID, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) {
|
||||
@@ -129,20 +31,48 @@ static bool rs_detect_valgrind() {
|
||||
return rs_valgrind_client_request(0, 0x1001, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
bool rs_has_cpu_level(rs_cpu_level level) {
|
||||
const auto cpuid7 = (rs_cpuidex(0, 0)[0] >= 7) ? rs_cpuidex(7, 0) : std::array<uint32_t, 4>{0, 0, 0, 0};
|
||||
switch (level) {
|
||||
case RS_CPU_SCALAR:
|
||||
return true;
|
||||
case RS_CPU_AVX2:
|
||||
return cpuid7[1] & (1<<5);
|
||||
case RS_CPU_GFNI:
|
||||
return cpuid7[2] & (1<<8) && !rs_detect_valgrind();
|
||||
default:
|
||||
die("bad CPU level %d\n", level);
|
||||
}
|
||||
// This will emit vbroadcastb
|
||||
__attribute__((no_sanitize("integer")))
|
||||
static inline __m256i broadcast_u8(uint8_t x) {
|
||||
return _mm256_set_epi8(
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x
|
||||
);
|
||||
}
|
||||
|
||||
#include "rs_core.c"
|
||||
|
||||
uint8_t rs_parity(struct rs* r) {
|
||||
return r->parity;
|
||||
}
|
||||
|
||||
static struct rs* rs_cached[256] = {
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
|
||||
};
|
||||
|
||||
bool rs_has_cpu_level(rs_cpu_level level) {
|
||||
return rs_has_cpu_level_core(level);
|
||||
}
|
||||
|
||||
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
|
||||
|
||||
__attribute__((constructor))
|
||||
void rs_detect_cpu_level() {
|
||||
if (rs_has_cpu_level(RS_CPU_GFNI)) {
|
||||
@@ -172,124 +102,46 @@ struct rs* rs_get(uint8_t parity) {
|
||||
}
|
||||
struct rs* r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
|
||||
if (__builtin_expect(r == nullptr, 0)) {
|
||||
r = rs_new(parity);
|
||||
r = rs_new_core(parity);
|
||||
if (r == nullptr) {
|
||||
die("could not allocate RS data");
|
||||
}
|
||||
struct rs* expected = nullptr;
|
||||
if (!__atomic_compare_exchange_n(&rs_cached[parity], &expected, r, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
|
||||
// somebody else got to it first
|
||||
rs_delete(r);
|
||||
rs_delete_core(r);
|
||||
r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
uint8_t rs_parity(struct rs* r) {
|
||||
return r->parity;
|
||||
template<int D, int P> __attribute__((noinline))
|
||||
static void rs_compute_parity_scalar_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
rs_compute_parity_scalar(D, P, r, size, data, parity);
|
||||
}
|
||||
|
||||
// This will emit vbroadcastb
|
||||
__attribute__((no_sanitize("integer")))
|
||||
inline __m256i broadcast_u8(uint8_t x) {
|
||||
return _mm256_set_epi8(
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x,
|
||||
x, x, x, x, x, x, x, x
|
||||
);
|
||||
template<int D, int P> __attribute__((noinline))
|
||||
static void rs_compute_parity_avx2_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
rs_compute_parity_avx2(D, P, r, size, data, parity);
|
||||
}
|
||||
|
||||
template<int D, int P>
|
||||
static void rs_compute_parity_single(struct rs* r, uint64_t i, const uint8_t** data, uint8_t** parity) {
|
||||
parity[0][i] = 0;
|
||||
for (int d = 0; d < D; d++) {
|
||||
parity[0][i] ^= data[d][i];
|
||||
}
|
||||
for (int p = 1; p < P; p++) {
|
||||
const uint8_t* factor = &r->expanded_matrix[D*32*p];
|
||||
parity[p][i] = 0;
|
||||
for (int d = 0; d < D; d++, factor += 32) {
|
||||
parity[p][i] ^= gf_mul_expanded(data[d][i], factor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int D, int P>
|
||||
__attribute__((noinline))
|
||||
void rs_compute_parity_scalar(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
// parity = r->matrix * data
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
rs_compute_parity_single<D, P>(r, i, data, parity);
|
||||
}
|
||||
}
|
||||
|
||||
template<int D, int P>
|
||||
__attribute__((noinline))
|
||||
void rs_compute_parity_avx2(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
__m256i low_nibble_mask = broadcast_u8(0x0f);
|
||||
size_t avx_leftover = size % 32;
|
||||
size_t avx_size = size-avx_leftover;
|
||||
for (uint64_t i = 0; i < avx_size; i += 32) {
|
||||
{
|
||||
__m256i parity_0 = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
|
||||
}
|
||||
for (int p = 1; p < P; p++) {
|
||||
__m256i parity_p = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
|
||||
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]);
|
||||
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask));
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
|
||||
}
|
||||
}
|
||||
for (uint64_t i = avx_size; i < size; i++) {
|
||||
rs_compute_parity_single<D, P>(r, i, data, parity);
|
||||
}
|
||||
}
|
||||
|
||||
template<int D, int P>
|
||||
__attribute__((noinline))
|
||||
void rs_compute_parity_gfni(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
size_t avx_leftover = size % 32;
|
||||
size_t avx_size = size-avx_leftover;
|
||||
for (uint64_t i = 0; i < avx_size; i += 32) {
|
||||
{
|
||||
__m256i parity_0 = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
|
||||
}
|
||||
for (int p = 1; p < P; p++) {
|
||||
__m256i parity_p = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
|
||||
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]);
|
||||
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor));
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
|
||||
}
|
||||
}
|
||||
for (uint64_t i = avx_size; i < size; i++) {
|
||||
rs_compute_parity_single<D, P>(r, i, data, parity);
|
||||
}
|
||||
template<int D, int P> __attribute__((noinline))
|
||||
static void rs_compute_parity_gfni_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
rs_compute_parity_gfni(D, P, r, size, data, parity);
|
||||
}
|
||||
|
||||
template<int D, int P>
|
||||
static void rs_compute_parity_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
|
||||
switch (rs_cpu_level l = rs_get_cpu_level()) {
|
||||
case RS_CPU_SCALAR:
|
||||
rs_compute_parity_scalar<D, P>(r, size, data, parity);
|
||||
rs_compute_parity_scalar_tmpl<D, P>(r, size, data, parity);
|
||||
break;
|
||||
case RS_CPU_AVX2:
|
||||
rs_compute_parity_avx2<D, P>(r, size, data, parity);
|
||||
rs_compute_parity_avx2_tmpl<D, P>(r, size, data, parity);
|
||||
break;
|
||||
case RS_CPU_GFNI:
|
||||
rs_compute_parity_gfni<D, P>(r, size, data, parity);
|
||||
rs_compute_parity_gfni_tmpl<D, P>(r, size, data, parity);
|
||||
break;
|
||||
default:
|
||||
die("bad cpu_level %d\n", l);
|
||||
@@ -301,101 +153,32 @@ void rs_compute_parity(struct rs* r, uint64_t size, const uint8_t** data, uint8_
|
||||
rs_compute_parity_funcs[r->parity](r, size, data, parity);
|
||||
}
|
||||
|
||||
template<int D>
|
||||
static void rs_recover_matmul_single(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
|
||||
want[i] = 0;
|
||||
for (int j = 0; j < D; j++) {
|
||||
want[i] ^= gf_mul(have_to_want[j], have[j][i]);
|
||||
}
|
||||
template<int D> __attribute__((noinline))
|
||||
static void rs_recover_matmul_scalar_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
|
||||
rs_recover_matmul_scalar(D, size, have, want, mat);
|
||||
}
|
||||
|
||||
template<int D>
|
||||
static void rs_recover_matmul_single_expanded(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want_expanded) {
|
||||
want[i] = 0;
|
||||
for (int j = 0; j < D; j++) {
|
||||
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]);
|
||||
}
|
||||
template<int D> __attribute__((noinline))
|
||||
static void rs_recover_matmul_avx2_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
|
||||
rs_recover_matmul_avx2(D, size, have, want, mat);
|
||||
}
|
||||
|
||||
template<int D>
|
||||
__attribute__((noinline))
|
||||
static void rs_recover_matmul_scalar(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
|
||||
uint8_t have_to_want_expanded[D*32];
|
||||
for (int i = 0; i < D; i++) {
|
||||
gf_mul_expand_factor(have_to_want[i], &have_to_want_expanded[i*32]);
|
||||
}
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
rs_recover_matmul_single_expanded<D>(i, have, want, have_to_want_expanded);
|
||||
}
|
||||
}
|
||||
|
||||
template<int D>
|
||||
__attribute__((noinline))
|
||||
static void rs_recover_matmul_avx2(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
|
||||
__m256i have_to_want_expanded[D];
|
||||
for (int i = 0; i < D; i++) {
|
||||
gf_mul_expand_factor(have_to_want[i], (uint8_t*)&have_to_want_expanded[i]);
|
||||
}
|
||||
__m256i low_nibble_mask = broadcast_u8(0x0f);
|
||||
size_t avx_leftover = size % 32;
|
||||
size_t avx_size = size-avx_leftover;
|
||||
for (uint64_t i = 0; i < avx_size; i += 32) {
|
||||
__m256i want_i = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
want_i = _mm256_xor_si256(
|
||||
want_i,
|
||||
gf_mul_expanded_avx2(
|
||||
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
|
||||
have_to_want_expanded[d],
|
||||
low_nibble_mask
|
||||
)
|
||||
);
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(want + i), want_i);
|
||||
}
|
||||
for (uint64_t i = avx_size; i < size; i++) {
|
||||
rs_recover_matmul_single_expanded<D>(i, have, want, (const uint8_t*)have_to_want_expanded);
|
||||
}
|
||||
}
|
||||
|
||||
template<int D>
|
||||
__attribute__((noinline))
|
||||
static void rs_recover_matmul_gfni(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
|
||||
__m256i have_to_want_avx[D];
|
||||
for (int i = 0; i < D; i++) {
|
||||
have_to_want_avx[i] = broadcast_u8(have_to_want[i]);
|
||||
}
|
||||
size_t avx_leftover = size % 32;
|
||||
size_t avx_size = size-avx_leftover;
|
||||
for (uint64_t i = 0; i < avx_size; i += 32) {
|
||||
__m256i want_i = _mm256_setzero_si256();
|
||||
for (int d = 0; d < D; d++) {
|
||||
want_i = _mm256_xor_si256(
|
||||
want_i,
|
||||
_mm256_gf2p8mul_epi8(
|
||||
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
|
||||
have_to_want_avx[d]
|
||||
)
|
||||
);
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)(want + i), want_i);
|
||||
}
|
||||
for (uint64_t i = avx_size; i < size; i++) {
|
||||
rs_recover_matmul_single<D>(i, have, want, have_to_want);
|
||||
}
|
||||
template<int D> __attribute__((noinline))
|
||||
static void rs_recover_matmul_gfni_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
|
||||
rs_recover_matmul_gfni(D, size, have, want, mat);
|
||||
}
|
||||
|
||||
template<int D>
|
||||
static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
|
||||
switch (rs_cpu_level l = rs_get_cpu_level()) {
|
||||
case RS_CPU_SCALAR:
|
||||
rs_recover_matmul_scalar<D>(size, have, want, mat);
|
||||
rs_recover_matmul_scalar_tmpl<D>(size, have, want, mat);
|
||||
break;
|
||||
case RS_CPU_AVX2:
|
||||
rs_recover_matmul_avx2<D>(size, have, want, mat);
|
||||
rs_recover_matmul_avx2_tmpl<D>(size, have, want, mat);
|
||||
break;
|
||||
case RS_CPU_GFNI:
|
||||
rs_recover_matmul_gfni<D>(size, have, want, mat);
|
||||
rs_recover_matmul_gfni_tmpl<D>(size, have, want, mat);
|
||||
break;
|
||||
default:
|
||||
die("bad cpu_level %d\n", l);
|
||||
@@ -403,7 +186,6 @@ static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t*
|
||||
}
|
||||
|
||||
static void (*rs_recover_matmul_funcs[16])(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat);
|
||||
|
||||
void rs_recover(
|
||||
struct rs* r,
|
||||
uint64_t size,
|
||||
@@ -412,55 +194,12 @@ void rs_recover(
|
||||
uint8_t want_block,
|
||||
uint8_t* want
|
||||
) {
|
||||
int D = rs_data_blocks(r->parity);
|
||||
int B = rs_blocks(r->parity);
|
||||
// Create some space
|
||||
uint8_t* scratch = (uint8_t*)malloc(D*D + D*D);
|
||||
uint8_t* mat_1 = scratch;
|
||||
uint8_t* mat_2 = scratch + D*D;
|
||||
// Preliminary checks
|
||||
for (int i = 0; i < D; i++) {
|
||||
if (have_blocks[i] >= B) {
|
||||
die("have_blocks[%d]=%d >= %d\n", i, have_blocks[i], B);
|
||||
rs_recover_core(
|
||||
r, size, have_blocks, have, want_block, want,
|
||||
[](int D, uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
|
||||
rs_recover_matmul_funcs[D](size, have, want, mat);
|
||||
}
|
||||
if (have_blocks[i] == want_block) {
|
||||
die("have_blocks[%d]=%d == want_block=%d\n", i, have_blocks[i], want_block);
|
||||
}
|
||||
if (i > 0 && have_blocks[i] <= have_blocks[i-1]) {
|
||||
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", i, have_blocks[i], i, have_blocks[i-1]);
|
||||
}
|
||||
}
|
||||
// below in the dimensionality annotation we paper over transposes
|
||||
// [DxD] matrix going from the data blocks to the blocks we currently have
|
||||
uint8_t* data_to_have = mat_1;
|
||||
for (int i = 0, have_cursor = 0; i < B; i++) {
|
||||
if (have_cursor >= D || have_blocks[have_cursor] != i) {
|
||||
continue;
|
||||
}
|
||||
memcpy(data_to_have + have_cursor*D, r->matrix + i*D, D);
|
||||
have_cursor++;
|
||||
}
|
||||
// [DxD] matrix going from what we have to the original data blocks
|
||||
uint8_t* have_to_data = mat_2;
|
||||
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
|
||||
die("unexpected singular matrix\n");
|
||||
}
|
||||
data_to_have = nullptr;
|
||||
// [Dx1] matrix going from the data blocks to the block we want
|
||||
uint8_t* data_to_want = &r->matrix[want_block*D];
|
||||
// have_to_want = data_to_want * have_to_data
|
||||
// [Dx1] matrix going from `blocks` to the block we're into
|
||||
uint8_t* have_to_want = mat_1;
|
||||
for (int i = 0; i < D; i++) {
|
||||
have_to_want[i] = 0;
|
||||
for (int j = 0; j < D; j++) {
|
||||
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
|
||||
}
|
||||
}
|
||||
// want = have_to_want * have
|
||||
rs_recover_matmul_funcs[D](size, have, want, have_to_want);
|
||||
// We're done.
|
||||
free(scratch);
|
||||
);
|
||||
}
|
||||
|
||||
__attribute__((constructor))
|
||||
|
||||
@@ -0,0 +1,451 @@
|
||||
#ifndef __KERNEL__
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint32_t u32;
|
||||
typedef uint64_t u64;
|
||||
|
||||
#endif
|
||||
|
||||
extern const u8 rs_gf_inv_table[256];
|
||||
extern const u8 rs_gf_log_table[256];
|
||||
extern const u8 rs_gf_exp_table[256];
|
||||
|
||||
static inline u8 gf_inv(u8 x) {
|
||||
return rs_gf_inv_table[x];
|
||||
}
|
||||
|
||||
static inline u8 gf_mul(u8 x, u8 y) {
|
||||
if (x == 0 || y == 0) {
|
||||
return 0;
|
||||
}
|
||||
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
|
||||
return rs_gf_exp_table[i > 254 ? i - 255 : i];
|
||||
}
|
||||
|
||||
static inline void gf_mul_expand_factor(u8 x, u8* expanded_x) {
|
||||
int i;
|
||||
for (i = 0; i < 16; i++) {
|
||||
expanded_x[i] = gf_mul(i, x);
|
||||
}
|
||||
for (i = 0; i < 16; i++) {
|
||||
expanded_x[16 + i] = gf_mul(i << 4, x);
|
||||
}
|
||||
}
|
||||
|
||||
static inline u8 gf_mul_expanded(u8 x, const u8* expanded_y) {
|
||||
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
|
||||
}
|
||||
|
||||
__attribute__((target("avx2")))
|
||||
static inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
|
||||
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
|
||||
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
|
||||
|
||||
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
|
||||
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
|
||||
|
||||
return _mm256_xor_si256(
|
||||
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
|
||||
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
|
||||
);
|
||||
}
|
||||
|
||||
static inline u8 rs_data_blocks_core(u8 parity) {
|
||||
return parity & 0x0F;
|
||||
}
|
||||
|
||||
static inline u8 rs_parity_blocks_core(u8 parity) {
|
||||
return parity >> 4;
|
||||
}
|
||||
|
||||
static inline u8 rs_blocks_core(u8 parity) {
|
||||
return rs_data_blocks_core(parity) + rs_parity_blocks_core(parity);
|
||||
}
|
||||
|
||||
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
|
||||
// just Gaussian elimination.
|
||||
//
|
||||
// TODO this is in row-major, it'd be nice to have it in column-major
|
||||
// to save have the final operation to be more natural in rs_recover.
|
||||
static bool rs_gf_invert_matrix(u8* in_mat, u8* out_mat, const int n) {
|
||||
int i, j, k;
|
||||
u8 temp;
|
||||
|
||||
// Set out_mat[] to the identity matrix
|
||||
memset(out_mat, 0, n*n);
|
||||
for (i = 0; i < n; i++) {
|
||||
out_mat[i * n + i] = 1;
|
||||
}
|
||||
|
||||
// Inverse
|
||||
for (i = 0; i < n; i++) {
|
||||
// Check for 0 in pivot element
|
||||
if (in_mat[i * n + i] == 0) {
|
||||
// Find a row with non-zero in current column and swap
|
||||
for (j = i + 1; j < n; j++) {
|
||||
if (in_mat[j * n + i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (j == n) { // Couldn't find means it's singular
|
||||
return false;
|
||||
}
|
||||
|
||||
for (k = 0; k < n; k++) { // Swap rows i,j
|
||||
temp = in_mat[i * n + k];
|
||||
in_mat[i * n + k] = in_mat[j * n + k];
|
||||
in_mat[j * n + k] = temp;
|
||||
|
||||
temp = out_mat[i * n + k];
|
||||
out_mat[i * n + k] = out_mat[j * n + k];
|
||||
out_mat[j * n + k] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
|
||||
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
|
||||
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
|
||||
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
|
||||
}
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
if (j == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
temp = in_mat[j * n + i];
|
||||
for (k = 0; k < n; k++) {
|
||||
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
|
||||
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
struct rs {
|
||||
u8 parity;
|
||||
// u8[D*B], in column-major.
|
||||
u8* matrix;
|
||||
// u8[D*P][32], in column-major. These are the lookup tables
|
||||
// to perform multiplication quickly.
|
||||
u8* expanded_matrix;
|
||||
};
|
||||
|
||||
// Note that we pervasively assume that the first column of the parity columns
|
||||
// is 1s, which causes the first parity to be the XORs of the data. So you can't
|
||||
// really change how the matrix is generated.
|
||||
static void rs_cauchy_matrix(struct rs* r) {
|
||||
int D = rs_data_blocks_core(r->parity);
|
||||
int B = rs_blocks_core(r->parity);
|
||||
u8* matrix = r->matrix;
|
||||
memset(matrix, 0, D*B);
|
||||
// Identity in the d*d upper half
|
||||
int i;
|
||||
for (i = 0; i < D; i++) {
|
||||
matrix[D*i + i] = 1;
|
||||
}
|
||||
// Fill in the rest using cauchy
|
||||
int col, row;
|
||||
for (col = D; col < B; col++) {
|
||||
for (row = 0; row < D; row++) {
|
||||
matrix[col*D + row] = gf_inv(col ^ row);
|
||||
}
|
||||
}
|
||||
// Scale the columns
|
||||
for (col = D; col < B; col++) {
|
||||
u8 factor = gf_inv(matrix[col*D]);
|
||||
for (row = 0; row < D; row++) {
|
||||
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
|
||||
}
|
||||
}
|
||||
// Scale the rows
|
||||
for (row = 1; row < D; row++) {
|
||||
u8 factor = gf_inv(matrix[D*D + row]);
|
||||
for (col = D; col < B; col++) {
|
||||
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out must be at leas size 4
|
||||
static void rs_cpuidex(u32 function_id, u32 subfunction_id, u32* out) {
|
||||
u32 a, b, c, d;
|
||||
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
|
||||
u32 _0_0[4];
|
||||
rs_cpuidex(0, 0, _0_0);
|
||||
u32 _7_0[4];
|
||||
if (_0_0[0] >= 7) {
|
||||
rs_cpuidex(7, 0, _7_0);
|
||||
} else {
|
||||
memset(_7_0, 0, sizeof(_7_0));
|
||||
}
|
||||
switch (level) {
|
||||
case RS_CPU_SCALAR:
|
||||
return true;
|
||||
case RS_CPU_AVX2:
|
||||
return _7_0[1] & (1<<5);
|
||||
case RS_CPU_GFNI:
|
||||
return _7_0[2] & (1<<8) && !rs_detect_valgrind();
|
||||
default:
|
||||
die("bad CPU level %d\n", level);
|
||||
}
|
||||
}
|
||||
|
||||
#define rs_compute_parity_single(D, P, r, i, data, parity) do { \
|
||||
parity[0][i] = 0; \
|
||||
int d; \
|
||||
int p; \
|
||||
for (d = 0; d < D; d++) { \
|
||||
parity[0][i] ^= data[d][i]; \
|
||||
} \
|
||||
for (p = 1; p < P; p++) { \
|
||||
const u8* factor = &r->expanded_matrix[D*32*p]; \
|
||||
parity[p][i] = 0; \
|
||||
for (d = 0; d < D; d++, factor += 32) { \
|
||||
parity[p][i] ^= gf_mul_expanded(data[d][i], factor); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define rs_compute_parity_scalar(D, P, r, size, data, parity) do { \
|
||||
/* parity = r->matrix * data */ \
|
||||
u64 i; \
|
||||
for (i = 0; i < size; i++) { \
|
||||
rs_compute_parity_single(D, P, r, i, data, parity); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define rs_compute_parity_avx2(D, P, r, size, data, parity) do { \
|
||||
__m256i low_nibble_mask = broadcast_u8(0x0f); \
|
||||
size_t avx_leftover = size % 32; \
|
||||
size_t avx_size = size-avx_leftover; \
|
||||
u32 i; \
|
||||
int p; \
|
||||
int d; \
|
||||
for (i = 0; i < avx_size; i += 32) { \
|
||||
{ \
|
||||
__m256i parity_0 = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
|
||||
} \
|
||||
for (p = 1; p < P; p++) { \
|
||||
__m256i parity_p = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
|
||||
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]); \
|
||||
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask)); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
|
||||
} \
|
||||
} \
|
||||
for (i = avx_size; i < size; i++) { \
|
||||
rs_compute_parity_single(D, P, r, i, data, parity); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define rs_compute_parity_gfni(D, P, r, size, data, parity) do { \
|
||||
size_t avx_leftover = size % 32; \
|
||||
size_t avx_size = size-avx_leftover; \
|
||||
u64 i; \
|
||||
int p; \
|
||||
int d; \
|
||||
for (i = 0; i < avx_size; i += 32) { \
|
||||
{ \
|
||||
__m256i parity_0 = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
|
||||
} \
|
||||
for (p = 1; p < P; p++) { \
|
||||
__m256i parity_p = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
|
||||
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]); \
|
||||
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor)); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
|
||||
} \
|
||||
} \
|
||||
for (i = avx_size; i < size; i++) { \
|
||||
rs_compute_parity_single(D, P, r, i, data, parity); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define rs_recover_matmul_single(D, i, have, want, have_to_want) do { \
|
||||
want[i] = 0; \
|
||||
int j; \
|
||||
for (j = 0; j < D; j++) { \
|
||||
want[i] ^= gf_mul(have_to_want[j], have[j][i]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded) do { \
|
||||
want[i] = 0; \
|
||||
int j; \
|
||||
for (j = 0; j < D; j++) { \
|
||||
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define rs_recover_matmul_scalar(D, size, have, want, have_to_want) do { \
|
||||
u8 have_to_want_expanded[D*32]; \
|
||||
int d; \
|
||||
for (d = 0; d < D; d++) { \
|
||||
gf_mul_expand_factor(have_to_want[d], &have_to_want_expanded[d*32]); \
|
||||
} \
|
||||
size_t i; \
|
||||
for (i = 0; i < size; i++) { \
|
||||
rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded); \
|
||||
} \
|
||||
} while (0) \
|
||||
|
||||
#define rs_recover_matmul_avx2(D, size, have, want, have_to_want) do { \
|
||||
__m256i have_to_want_expanded[D]; \
|
||||
int d; \
|
||||
for (d = 0; d < D; d++) { \
|
||||
gf_mul_expand_factor(have_to_want[d], ((u8*)&have_to_want_expanded[d])); \
|
||||
} \
|
||||
__m256i low_nibble_mask = broadcast_u8(0x0f); \
|
||||
size_t avx_leftover = size % 32; \
|
||||
size_t avx_size = size-avx_leftover; \
|
||||
u64 i; \
|
||||
for (i = 0; i < avx_size; i += 32) { \
|
||||
__m256i want_i = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
want_i = _mm256_xor_si256( \
|
||||
want_i, \
|
||||
gf_mul_expanded_avx2( \
|
||||
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
|
||||
have_to_want_expanded[d], \
|
||||
low_nibble_mask \
|
||||
) \
|
||||
); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
|
||||
} \
|
||||
for (i = avx_size; i < size; i++) { \
|
||||
rs_recover_matmul_single_expanded(D, i, have, want, ((const u8*)have_to_want_expanded)); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define rs_recover_matmul_gfni(D, size, have, want, have_to_want) do { \
|
||||
__m256i have_to_want_avx[D]; \
|
||||
int d; \
|
||||
for (d = 0; d < D; d++) { \
|
||||
have_to_want_avx[d] = broadcast_u8(have_to_want[d]); \
|
||||
} \
|
||||
size_t avx_leftover = size % 32; \
|
||||
size_t avx_size = size-avx_leftover; \
|
||||
u64 i; \
|
||||
for (i = 0; i < avx_size; i += 32) { \
|
||||
__m256i want_i = _mm256_setzero_si256(); \
|
||||
for (d = 0; d < D; d++) { \
|
||||
want_i = _mm256_xor_si256( \
|
||||
want_i, \
|
||||
_mm256_gf2p8mul_epi8( \
|
||||
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
|
||||
have_to_want_avx[d] \
|
||||
) \
|
||||
); \
|
||||
} \
|
||||
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
|
||||
} \
|
||||
for (i = avx_size; i < size; i++) { \
|
||||
rs_recover_matmul_single(D, i, have, want, have_to_want); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static struct rs* rs_new_core(u8 parity) {
|
||||
int B = rs_blocks_core(parity);
|
||||
int D = rs_data_blocks_core(parity);
|
||||
int P = rs_parity_blocks_core(parity);
|
||||
struct rs* r = (struct rs*)rs_malloc(sizeof(struct rs) + B*D + D*P*32);
|
||||
if (r == NULL) { return NULL; }
|
||||
r->parity = parity;
|
||||
r->matrix = (u8*)(r + 1);
|
||||
r->expanded_matrix = r->matrix + B*D;
|
||||
rs_cauchy_matrix(r);
|
||||
int p;
|
||||
int d;
|
||||
for (p = 0; p < P; p++) {
|
||||
for (d = 0; d < D; d++) {
|
||||
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static void rs_delete_core(struct rs* r) {
|
||||
rs_free(r);
|
||||
}
|
||||
|
||||
static void rs_recover_core(
|
||||
struct rs* r,
|
||||
u64 size,
|
||||
const u8* have_blocks,
|
||||
const u8** have,
|
||||
u8 want_block,
|
||||
u8* want,
|
||||
void (*recover_func)(int D, u64 size, const u8** have, u8* want, const u8* mat)
|
||||
) {
|
||||
int D = rs_data_blocks_core(r->parity);
|
||||
int B = rs_blocks_core(r->parity);
|
||||
// Create some space
|
||||
u8* scratch = (u8*)rs_malloc(D*D + D*D);
|
||||
u8* mat_1 = scratch;
|
||||
u8* mat_2 = scratch + D*D;
|
||||
// Preliminary checks
|
||||
int i, j, d, b;
|
||||
for (d = 0; d < D; d++) {
|
||||
if (have_blocks[d] >= B) {
|
||||
die("have_blocks[%d]=%d >= %d\n", d, have_blocks[d], B);
|
||||
}
|
||||
if (have_blocks[d] == want_block) {
|
||||
die("have_blocks[%d]=%d == want_block=%d\n", d, have_blocks[d], want_block);
|
||||
}
|
||||
if (d > 0 && have_blocks[d] <= have_blocks[d-1]) {
|
||||
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", d, have_blocks[d], d, have_blocks[d-1]);
|
||||
}
|
||||
}
|
||||
// below in the dimensionality annotation we paper over transposes
|
||||
// [DxD] matrix going from the data blocks to the blocks we currently have
|
||||
u8* data_to_have = mat_1;
|
||||
int have_cursor;
|
||||
for (b = 0, have_cursor = 0; b < B; b++) {
|
||||
if (have_cursor >= D || have_blocks[have_cursor] != b) {
|
||||
continue;
|
||||
}
|
||||
memcpy(data_to_have + have_cursor*D, r->matrix + b*D, D);
|
||||
have_cursor++;
|
||||
}
|
||||
// [DxD] matrix going from what we have to the original data blocks
|
||||
u8* have_to_data = mat_2;
|
||||
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
|
||||
die("unexpected singular matrix\n");
|
||||
}
|
||||
data_to_have = NULL;
|
||||
// [Dx1] matrix going from the data blocks to the block we want
|
||||
u8* data_to_want = &r->matrix[want_block*D];
|
||||
// have_to_want = data_to_want * have_to_data
|
||||
// [Dx1] matrix going from `blocks` to the block we're into
|
||||
u8* have_to_want = mat_1;
|
||||
for (i = 0; i < D; i++) {
|
||||
have_to_want[i] = 0;
|
||||
for (j = 0; j < D; j++) {
|
||||
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
|
||||
}
|
||||
}
|
||||
// want = have_to_want * have
|
||||
recover_func(D, size, have, want, have_to_want);
|
||||
// We're done.
|
||||
rs_free(scratch);
|
||||
}
|
||||
+36
-4
@@ -156,6 +156,8 @@
|
||||
//
|
||||
// TODO fill in results
|
||||
|
||||
static constexpr uint64_t EGGSFS_PAGE_SIZE = 4096;
|
||||
|
||||
static auto wrappedSnapshot(rocksdb::DB* db) {
|
||||
auto deleter = [db](const rocksdb::Snapshot* snapshot) {
|
||||
db->ReleaseSnapshot(snapshot);
|
||||
@@ -726,7 +728,11 @@ struct ShardDBImpl {
|
||||
beginKey().setOffset(req.byteOffset);
|
||||
{
|
||||
std::unique_ptr<rocksdb::Iterator> it(_db->NewIterator(options, _spansCf));
|
||||
for (it->SeekForPrev(beginKey.toSlice()); it->Valid(); it->Next()) {
|
||||
for (
|
||||
it->SeekForPrev(beginKey.toSlice());
|
||||
it->Valid() && (req.limit == 0 || resp.spans.els.size() < req.limit);
|
||||
it->Next()
|
||||
) {
|
||||
auto key = ExternalValue<SpanKey>::FromSlice(it->key());
|
||||
if (key().fileId() != req.fileId) {
|
||||
break;
|
||||
@@ -1258,6 +1264,11 @@ struct ShardDBImpl {
|
||||
LOG_DEBUG(_env, "inline span has bad storage class %s", req.storageClass);
|
||||
return EggsError::BAD_SPAN_BODY;
|
||||
}
|
||||
|
||||
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0) {
|
||||
LOG_DEBUG(_env, "req.byteOffset=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, EGGSFS_PAGE_SIZE);
|
||||
return EggsError::BAD_SPAN_BODY;
|
||||
}
|
||||
|
||||
uint32_t expectedCrc = crc32c(0, req.body.data(), req.body.size());
|
||||
expectedCrc = crc32c_zero_extend(expectedCrc, req.size - req.body.size());
|
||||
@@ -1293,10 +1304,14 @@ struct ShardDBImpl {
|
||||
LOG_DEBUG(_env, "bad storage class %s for blocks span", (int)req.storageClass);
|
||||
return EggsError::BAD_SPAN_BODY;
|
||||
}
|
||||
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0 || req.cellSize%EGGSFS_PAGE_SIZE != 0) {
|
||||
LOG_DEBUG(_env, "req.byteOffset=%s or cellSize=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, req.cellSize, EGGSFS_PAGE_SIZE);
|
||||
return EggsError::BAD_SPAN_BODY;
|
||||
}
|
||||
if (!_checkSpanBody(req)) {
|
||||
return EggsError::BAD_SPAN_BODY;
|
||||
}
|
||||
|
||||
|
||||
// start filling in entry
|
||||
entry.fileId = req.fileId;
|
||||
entry.byteOffset = req.byteOffset;
|
||||
@@ -1888,6 +1903,7 @@ struct ShardDBImpl {
|
||||
return EggsError::MISMATCHING_TARGET;
|
||||
}
|
||||
if (edgeBody().creationTime() != creationTime) {
|
||||
LOG_DEBUG(_env, "expected time %s, got %s", edgeBody().creationTime(), creationTime);
|
||||
return EggsError::MISMATCHING_CREATION_TIME;
|
||||
}
|
||||
if (edgeBody().targetIdWithLocked().extra()) { // locked
|
||||
@@ -1992,6 +2008,7 @@ struct ShardDBImpl {
|
||||
}
|
||||
ExternalValue<CurrentEdgeBody> edge(edgeValue);
|
||||
if (edge().creationTime() != entry.creationTime) {
|
||||
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
|
||||
return EggsError::MISMATCHING_CREATION_TIME;
|
||||
}
|
||||
if (edge().locked()) {
|
||||
@@ -2046,6 +2063,7 @@ struct ShardDBImpl {
|
||||
}
|
||||
ExternalValue<CurrentEdgeBody> edge(edgeValue);
|
||||
if (edge().creationTime() != entry.creationTime) {
|
||||
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
|
||||
return EggsError::MISMATCHING_CREATION_TIME;
|
||||
}
|
||||
if (!edge().locked()) {
|
||||
@@ -2732,7 +2750,13 @@ struct ShardDBImpl {
|
||||
const auto& cache = _blockServicesCache.at(blockServiceId.u64);
|
||||
auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf));
|
||||
|
||||
return proof.proof == expectedProof;
|
||||
bool good = proof.proof == expectedProof;
|
||||
|
||||
if (!good) {
|
||||
LOG_DEBUG(_env, "bad block write proof, expected %s, got %s", BincodeFixedBytes<8>(expectedProof), proof);
|
||||
}
|
||||
|
||||
return good;
|
||||
}
|
||||
|
||||
std::array<uint8_t, 8> _blockEraseCertificate(uint32_t blockSize, const BlockBody block, const AES128Key& secretKey) {
|
||||
@@ -3335,8 +3359,16 @@ DirectoryInfo defaultDirectoryInfo() {
|
||||
hddBlocks.storageClass = storageClassByName("HDD");
|
||||
addSegment(BLOCK_POLICY_TAG, blockPolicy);
|
||||
|
||||
// Span policy: up to 10MiB: RS(4,4), up to 100MiB (max span size): RS(10,4)
|
||||
// Span policy:
|
||||
// * up to 64KiB: RS(1,4). This mirroring span simplifies things in the kernel (so that we
|
||||
// we never have cell sizes that are not multiple of span sizes). We still set things
|
||||
// up so that we can lose 4 copies and still be fine.
|
||||
// * up to 10MiB: RS(4,4).
|
||||
// * up to 100MiB (max span size): RS(10,4).
|
||||
SpanPolicy spanPolicy;
|
||||
auto& tinySpans = spanPolicy.entries.els.emplace_back();
|
||||
tinySpans.maxSize = 1 << 16;
|
||||
tinySpans.parity = Parity(1, 4);
|
||||
auto& smallSpans = spanPolicy.entries.els.emplace_back();
|
||||
smallSpans.maxSize = 10 << 20;
|
||||
smallSpans.parity = Parity(4, 4);
|
||||
|
||||
+436
-65
@@ -310,28 +310,381 @@ func generateGo(errors []string, shardReqResps []reqRespType, cdcReqResps []reqR
|
||||
return out.Bytes()
|
||||
}
|
||||
|
||||
func generateC(errors []string, shardReqResps []reqRespType, cdcReqResps []reqRespType, extras []reflect.Type) []byte {
|
||||
out := new(bytes.Buffer)
|
||||
func generateKmodMsgKind(w io.Writer, what string, reqResps []reqRespType) {
|
||||
for _, reqResp := range reqResps {
|
||||
fmt.Fprintf(w, "#define EGGSFS_%s_%s 0x%X\n", what, reqRespEnum(reqResp), reqResp.kind)
|
||||
}
|
||||
fmt.Fprintf(w, "\n")
|
||||
}
|
||||
|
||||
fmt.Fprintln(out, "// Automatically generated with go run bincodegen.")
|
||||
fmt.Fprintln(out, "// Run `go generate ./...` from the go/ directory to regenerate it.")
|
||||
func kmodFieldName(s string) string {
|
||||
re := regexp.MustCompile(`(.)([A-Z])`)
|
||||
return strings.ToLower(string(re.ReplaceAll([]byte(s), []byte("${1}_${2}"))))
|
||||
}
|
||||
|
||||
func kmodStructName(typ reflect.Type) string {
|
||||
re := regexp.MustCompile(`(.)([A-Z])`)
|
||||
s := strings.ToLower(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
|
||||
return fmt.Sprintf("eggsfs_%s", s)
|
||||
}
|
||||
|
||||
func kmodStaticSizeName(typ reflect.Type) string {
|
||||
re := regexp.MustCompile(`(.)([A-Z])`)
|
||||
s := strings.ToUpper(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
|
||||
return fmt.Sprintf("EGGSFS_%s_SIZE", s)
|
||||
}
|
||||
|
||||
func kmodMaxSizeName(typ reflect.Type) string {
|
||||
re := regexp.MustCompile(`(.)([A-Z])`)
|
||||
s := strings.ToUpper(string(re.ReplaceAll([]byte(typ.Name()), []byte("${1}_${2}"))))
|
||||
return fmt.Sprintf("EGGSFS_%s_MAX_SIZE", s)
|
||||
}
|
||||
|
||||
func generateKmodStructOpaque(w io.Writer, typ reflect.Type, suffix string) {
|
||||
if typ.Kind() != reflect.Struct {
|
||||
panic(fmt.Errorf("unexpected non-struct type %v", typ))
|
||||
}
|
||||
fmt.Fprintf(w, "struct %s_%s;\n", kmodStructName(typ), suffix)
|
||||
}
|
||||
|
||||
func generateKmodStruct(w io.Writer, typ reflect.Type, fldName string, fld string) {
|
||||
if typ.Kind() != reflect.Struct {
|
||||
panic(fmt.Errorf("unexpected non-struct type %v", typ))
|
||||
}
|
||||
fmt.Fprintf(w, "struct %s_%s { %s; };\n", kmodStructName(typ), fldName, fld)
|
||||
}
|
||||
|
||||
func generateKmodSize(staticSizes map[string]int, maxSizes map[string]int, w io.Writer, typ reflect.Type) {
|
||||
if typ.Kind() != reflect.Struct {
|
||||
panic(fmt.Errorf("unexpected non-struct type %v", typ))
|
||||
}
|
||||
updateSize := func(size *int, update int) {
|
||||
if update < 0 {
|
||||
*size = -1
|
||||
} else if *size >= 0 {
|
||||
*size += update
|
||||
}
|
||||
}
|
||||
staticSize := 0
|
||||
maxSize := 0
|
||||
for i := 0; i < typ.NumField(); i++ {
|
||||
fld := typ.Field(i)
|
||||
fldK := fld.Type.Kind()
|
||||
if fldK == reflect.Struct {
|
||||
fldStaticSize, found := staticSizes[kmodStaticSizeName(fld.Type)]
|
||||
if found {
|
||||
updateSize(&staticSize, fldStaticSize)
|
||||
updateSize(&maxSize, fldStaticSize)
|
||||
} else {
|
||||
updateSize(&staticSize, -1)
|
||||
fldMaxSize, found := staticSizes[kmodMaxSizeName(fld.Type)]
|
||||
if found {
|
||||
updateSize(&maxSize, fldMaxSize)
|
||||
} else {
|
||||
updateSize(&maxSize, -1)
|
||||
}
|
||||
}
|
||||
} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 || fldK == reflect.Array {
|
||||
updateSize(&staticSize, int(fld.Type.Size()))
|
||||
updateSize(&maxSize, int(fld.Type.Size()))
|
||||
} else if fldK == reflect.Slice || fldK == reflect.String {
|
||||
updateSize(&staticSize, -1)
|
||||
elem := sliceTypeElem(fld.Type)
|
||||
if elem.Kind() == reflect.Uint8 {
|
||||
updateSize(&maxSize, 256) // len + body
|
||||
} else {
|
||||
updateSize(&maxSize, -1)
|
||||
}
|
||||
} else {
|
||||
panic(fmt.Errorf("unexpected kind %v", fldK))
|
||||
}
|
||||
}
|
||||
if staticSize >= 0 {
|
||||
sizeName := kmodStaticSizeName(typ)
|
||||
staticSizes[sizeName] = staticSize
|
||||
fmt.Fprintf(w, "#define %s %v\n", sizeName, staticSize)
|
||||
} else if maxSize >= 0 {
|
||||
sizeName := kmodMaxSizeName(typ)
|
||||
maxSizes[sizeName] = maxSize
|
||||
fmt.Fprintf(w, "#define %s %v\n", sizeName, maxSize)
|
||||
}
|
||||
}
|
||||
|
||||
func kmodType(t reflect.Type) string {
|
||||
switch t.Kind() {
|
||||
case reflect.Uint8:
|
||||
return "u8"
|
||||
case reflect.Uint16:
|
||||
return "u16"
|
||||
case reflect.Uint32:
|
||||
return "u32"
|
||||
case reflect.Uint64:
|
||||
return "u64"
|
||||
case reflect.Bool:
|
||||
return "bool"
|
||||
case reflect.Struct:
|
||||
return fmt.Sprintf("%s*", kmodStructName(t))
|
||||
case reflect.Slice, reflect.String:
|
||||
elem := sliceTypeElem(t)
|
||||
if elem.Kind() == reflect.Uint8 {
|
||||
return "eggsfs_bytes*"
|
||||
} else {
|
||||
return kmodType(elem)
|
||||
}
|
||||
default:
|
||||
panic(fmt.Sprintf("unsupported type with kind %v", t.Kind()))
|
||||
}
|
||||
}
|
||||
|
||||
func generateKmodGet(staticSizes map[string]int, h io.Writer, typ reflect.Type) {
|
||||
generateKmodStructOpaque(h, typ, "start")
|
||||
fmt.Fprintf(h, "#define %s_get_start(ctx, start) struct %s_start* start = NULL\n\n", kmodStructName(typ), kmodStructName(typ))
|
||||
prevType := fmt.Sprintf("struct %s_start*", kmodStructName(typ))
|
||||
for i := 0; i < typ.NumField(); i++ {
|
||||
fld := typ.Field(i)
|
||||
fldK := fld.Type.Kind()
|
||||
fldTyp := fld.Type
|
||||
endianness := "le"
|
||||
if fldK == reflect.Array {
|
||||
sz := fld.Type.Size()
|
||||
endianness = "be"
|
||||
if sz == 4 {
|
||||
fldK = reflect.Uint32
|
||||
fldTyp = reflect.TypeOf(uint32(0))
|
||||
} else if sz == 8 {
|
||||
fldK = reflect.Uint64
|
||||
fldTyp = reflect.TypeOf(uint64(0))
|
||||
} else {
|
||||
panic(fmt.Errorf("bad array size %v", fld.Type.Size()))
|
||||
}
|
||||
}
|
||||
fldName := kmodFieldName(fld.Name)
|
||||
fldKTyp := kmodType(fldTyp)
|
||||
if fldK == reflect.Struct {
|
||||
fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }; \\\n", prevType)
|
||||
fmt.Fprintf(h, " struct %s_start* next = NULL\n\n", kmodStructName(fldTyp))
|
||||
prevType = fmt.Sprintf("struct %s_end*", kmodStructName(fldTyp))
|
||||
} else if fldK == reflect.Slice || fldK == reflect.String {
|
||||
elemTyp := sliceTypeElem(fldTyp)
|
||||
if elemTyp.Kind() == reflect.Uint8 {
|
||||
generateKmodStruct(h, typ, fldName, "struct eggsfs_bincode_bytes str")
|
||||
nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
|
||||
fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
|
||||
fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < 1)) { \n")
|
||||
fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
|
||||
fmt.Fprintf(h, " } else { \n")
|
||||
fmt.Fprintf(h, " next->str.len = *(u8*)(ctx->buf); \n")
|
||||
fmt.Fprintf(h, " ctx->buf++; \n")
|
||||
fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < next->str.len)) { \n")
|
||||
fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
|
||||
fmt.Fprintf(h, " } else { \n")
|
||||
fmt.Fprintf(h, " next->str.buf = ctx->buf; \n")
|
||||
fmt.Fprintf(h, " ctx->buf += next->str.len; \n")
|
||||
fmt.Fprintf(h, " } \n")
|
||||
fmt.Fprintf(h, " } \n")
|
||||
fmt.Fprintf(h, " }\n")
|
||||
fmt.Fprintf(h, "}\n")
|
||||
fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
|
||||
prevType = nextType
|
||||
} else {
|
||||
generateKmodStruct(h, typ, fldName, "u16 len")
|
||||
nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
|
||||
fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
|
||||
fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < 2)) { \n")
|
||||
fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
|
||||
fmt.Fprintf(h, " } else { \n")
|
||||
fmt.Fprintf(h, " next->len = get_unaligned_le16(ctx->buf); \n")
|
||||
fmt.Fprintf(h, " ctx->buf += 2; \n")
|
||||
fmt.Fprintf(h, " } \n")
|
||||
fmt.Fprintf(h, " } else { \n")
|
||||
fmt.Fprintf(h, " next->len = 0; \n") // so that loops can work even in the case of errors
|
||||
fmt.Fprintf(h, " } \n")
|
||||
fmt.Fprintf(h, "} \n")
|
||||
fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
|
||||
prevType = nextType
|
||||
}
|
||||
} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 {
|
||||
generateKmodStruct(h, typ, fldName, fmt.Sprintf("%s x", fldKTyp))
|
||||
nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, "static inline void _%s_get_%s(struct eggsfs_bincode_get_ctx* ctx, %s* prev, %s* next) { \n", kmodStructName(typ), fldName, prevType, nextType)
|
||||
fmt.Fprintf(h, " if (likely(ctx->err == 0)) { \n")
|
||||
fmt.Fprintf(h, " if (unlikely(ctx->end - ctx->buf < %d)) { \n", fldTyp.Size())
|
||||
fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
|
||||
fmt.Fprintf(h, " } else { \n")
|
||||
switch fldK {
|
||||
case reflect.Bool:
|
||||
fmt.Fprintf(h, " next->x = *(bool*)(ctx->buf); \n")
|
||||
case reflect.Uint8:
|
||||
fmt.Fprintf(h, " next->x = *(u8*)(ctx->buf); \n")
|
||||
case reflect.Uint16:
|
||||
fmt.Fprintf(h, " next->x = get_unaligned_%s16(ctx->buf); \n", endianness)
|
||||
case reflect.Uint32:
|
||||
fmt.Fprintf(h, " next->x = get_unaligned_%s32(ctx->buf); \n", endianness)
|
||||
case reflect.Uint64:
|
||||
fmt.Fprintf(h, " next->x = get_unaligned_%s64(ctx->buf); \n", endianness)
|
||||
}
|
||||
fmt.Fprintf(h, " ctx->buf += %d; \n", fldTyp.Size())
|
||||
fmt.Fprintf(h, " } \n")
|
||||
fmt.Fprintf(h, " }\n")
|
||||
fmt.Fprintf(h, "}\n")
|
||||
fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_get_%s(ctx, &(prev), &(next))\n\n", kmodStructName(typ), fldName)
|
||||
prevType = nextType
|
||||
} else {
|
||||
panic(fmt.Errorf("unexpected kind %v", fldK))
|
||||
}
|
||||
}
|
||||
generateKmodStructOpaque(h, typ, "end")
|
||||
fmt.Fprintf(h, "#define %s_get_end(ctx, prev, next) \\\n", kmodStructName(typ))
|
||||
fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }\\\n", prevType)
|
||||
fmt.Fprintf(h, " struct %s_end* next = NULL\n\n", kmodStructName(typ))
|
||||
fmt.Fprintf(h, "static inline void %s_get_finish(struct eggsfs_bincode_get_ctx* ctx, struct %s_end* end) {\n", kmodStructName(typ), kmodStructName(typ))
|
||||
fmt.Fprintf(h, " if (unlikely(ctx->buf != ctx->end)) { \n")
|
||||
fmt.Fprintf(h, " ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE; \n")
|
||||
fmt.Fprintf(h, " }\n")
|
||||
fmt.Fprintf(h, "}\n\n")
|
||||
}
|
||||
|
||||
func generateKmodPut(staticSizes map[string]int, h io.Writer, typ reflect.Type) {
|
||||
fmt.Fprintf(h, "#define %s_put_start(ctx, start) struct %s_start* start = NULL\n\n", kmodStructName(typ), kmodStructName(typ))
|
||||
prevType := fmt.Sprintf("struct %s_start*", kmodStructName(typ))
|
||||
for i := 0; i < typ.NumField(); i++ {
|
||||
fld := typ.Field(i)
|
||||
fldK := fld.Type.Kind()
|
||||
fldTyp := fld.Type
|
||||
endianness := "le"
|
||||
if fldK == reflect.Array {
|
||||
sz := fld.Type.Size()
|
||||
endianness = "be"
|
||||
if sz == 4 {
|
||||
fldK = reflect.Uint32
|
||||
fldTyp = reflect.TypeOf(uint32(0))
|
||||
} else if sz == 8 {
|
||||
fldK = reflect.Uint64
|
||||
fldTyp = reflect.TypeOf(uint64(0))
|
||||
} else {
|
||||
panic(fmt.Errorf("bad array size %v", fld.Type.Size()))
|
||||
}
|
||||
}
|
||||
fldName := kmodFieldName(fld.Name)
|
||||
fldKTyp := kmodType(fldTyp)
|
||||
if fldK == reflect.Struct {
|
||||
// fmt.Fprintf(h, "#define %s_get_%s(ctx, prev, next) \\\n", kmodStructName(typ), fldName)
|
||||
// fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }; \\\n", prevType)
|
||||
// fmt.Fprintf(h, " struct %s_start* next = NULL\n\n", kmodStructName(fldTyp))
|
||||
// prevType = fmt.Sprintf("struct %s_end*", kmodStructName(fldTyp))
|
||||
} else if fldK == reflect.Slice || fldK == reflect.String {
|
||||
elemTyp := sliceTypeElem(fldTyp)
|
||||
nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
|
||||
if elemTyp.Kind() == reflect.Uint8 {
|
||||
fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, const char* str, int str_len) {\n", kmodStructName(typ), fldName, prevType, nextType)
|
||||
fmt.Fprintf(h, " next = NULL;\n")
|
||||
fmt.Fprintf(h, " BUG_ON(str_len < 0 || str_len > 255);\n")
|
||||
fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < (1 + str_len));\n")
|
||||
fmt.Fprintf(h, " *(u8*)(ctx->cursor) = str_len;\n")
|
||||
fmt.Fprintf(h, " memcpy(ctx->cursor + 1, str, str_len);\n")
|
||||
fmt.Fprintf(h, " ctx->cursor += 1 + str_len;\n")
|
||||
fmt.Fprintf(h, "}\n")
|
||||
fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, str, str_len) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), str, str_len)\n\n", kmodStructName(typ), fldName)
|
||||
} else {
|
||||
fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, int len) {\n", kmodStructName(typ), fldName, prevType, nextType)
|
||||
fmt.Fprintf(h, " next = NULL;\n")
|
||||
fmt.Fprintf(h, " BUG_ON(len < 0 || len >= 1<<16);\n")
|
||||
fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < 2);\n")
|
||||
fmt.Fprintf(h, " put_unaligned_le16(len, ctx->cursor);\n")
|
||||
fmt.Fprintf(h, " ctx->cursor += 2;\n")
|
||||
fmt.Fprintf(h, "}\n")
|
||||
fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, len) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), len)\n\n", kmodStructName(typ), fldName)
|
||||
}
|
||||
prevType = nextType
|
||||
} else if fldK == reflect.Bool || fldK == reflect.Uint8 || fldK == reflect.Uint16 || fldK == reflect.Uint32 || fldK == reflect.Uint64 {
|
||||
nextType := fmt.Sprintf("struct %s_%s", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, "static inline void _%s_put_%s(struct eggsfs_bincode_put_ctx* ctx, %s* prev, %s* next, %s x) {\n", kmodStructName(typ), fldName, prevType, nextType, fldKTyp)
|
||||
fmt.Fprintf(h, " next = NULL;\n")
|
||||
fmt.Fprintf(h, " BUG_ON(ctx->end - ctx->cursor < %d);\n", fldTyp.Size())
|
||||
switch fldK {
|
||||
case reflect.Bool:
|
||||
fmt.Fprintf(h, " *(bool*)(ctx->cursor) = x;\n")
|
||||
case reflect.Uint8:
|
||||
fmt.Fprintf(h, " *(u8*)(ctx->cursor) = x;\n")
|
||||
case reflect.Uint16:
|
||||
fmt.Fprintf(h, " put_unaligned_%s16(x, ctx->cursor);\n", endianness)
|
||||
case reflect.Uint32:
|
||||
fmt.Fprintf(h, " put_unaligned_%s32(x, ctx->cursor);\n", endianness)
|
||||
case reflect.Uint64:
|
||||
fmt.Fprintf(h, " put_unaligned_%s64(x, ctx->cursor);\n", endianness)
|
||||
}
|
||||
fmt.Fprintf(h, " ctx->cursor += %d;\n", fldTyp.Size())
|
||||
fmt.Fprintf(h, "}\n")
|
||||
fmt.Fprintf(h, "#define %s_put_%s(ctx, prev, next, x) \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " struct %s_%s next; \\\n", kmodStructName(typ), fldName)
|
||||
fmt.Fprintf(h, " _%s_put_%s(ctx, &(prev), &(next), x)\n\n", kmodStructName(typ), fldName)
|
||||
prevType = nextType
|
||||
} else {
|
||||
panic(fmt.Errorf("unexpected kind %v", fldK))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(h, "#define %s_put_end(ctx, prev, next) \\\n", kmodStructName(typ))
|
||||
fmt.Fprintf(h, " { %s* __dummy __attribute__((unused)) = &(prev); }\\\n", prevType)
|
||||
fmt.Fprintf(h, " struct %s_end* next __attribute__((unused)) = NULL\n\n", kmodStructName(typ))
|
||||
}
|
||||
|
||||
func generateKmod(errors []string, shardReqResps []reqRespType, cdcReqResps []reqRespType, shuckleReqResps []reqRespType, blocksReqResps []reqRespType, extras []reflect.Type) []byte {
|
||||
hOut := new(bytes.Buffer)
|
||||
|
||||
fmt.Fprintln(hOut, "// Automatically generated with go run bincodegen.")
|
||||
fmt.Fprintln(hOut, "// Run `go generate ./...` from the go/ directory to regenerate it.")
|
||||
fmt.Fprintln(hOut)
|
||||
|
||||
for i, err := range errors {
|
||||
fmt.Fprintf(out, "#define EGGSFS_ERR_%s %d\n", err, errCodeOffset+i)
|
||||
fmt.Fprintf(hOut, "#define EGGSFS_ERR_%s %d\n", err, errCodeOffset+i)
|
||||
}
|
||||
fmt.Fprintf(out, "\n")
|
||||
fmt.Fprintf(hOut, "\n")
|
||||
|
||||
for _, reqResp := range shardReqResps {
|
||||
fmt.Fprintf(out, "#define EGGSFS_META_%s 0x%X\n", reqRespEnum(reqResp), reqResp.kind)
|
||||
generateKmodMsgKind(hOut, "SHARD", shardReqResps)
|
||||
generateKmodMsgKind(hOut, "CDC", cdcReqResps)
|
||||
generateKmodMsgKind(hOut, "SHUCKLE", shuckleReqResps)
|
||||
generateKmodMsgKind(hOut, "BLOCKS", blocksReqResps)
|
||||
|
||||
fmt.Fprintln(hOut)
|
||||
|
||||
staticSizes := make(map[string]int)
|
||||
maxSizes := make(map[string]int)
|
||||
for _, typ := range extras {
|
||||
generateKmodSize(staticSizes, maxSizes, hOut, typ)
|
||||
generateKmodGet(staticSizes, hOut, typ)
|
||||
generateKmodPut(staticSizes, hOut, typ)
|
||||
}
|
||||
fmt.Fprintf(out, "\n")
|
||||
|
||||
for _, reqResp := range cdcReqResps {
|
||||
fmt.Fprintf(out, "#define EGGSFS_CDC_%s 0x%X\n", reqRespEnum(reqResp), reqResp.kind)
|
||||
generateReqResps := func(reqResps []reqRespType) {
|
||||
for _, reqResp := range reqResps {
|
||||
generateKmodSize(staticSizes, maxSizes, hOut, reqResp.req)
|
||||
generateKmodGet(staticSizes, hOut, reqResp.req)
|
||||
generateKmodPut(staticSizes, hOut, reqResp.req)
|
||||
generateKmodSize(staticSizes, maxSizes, hOut, reqResp.resp)
|
||||
generateKmodGet(staticSizes, hOut, reqResp.resp)
|
||||
generateKmodPut(staticSizes, hOut, reqResp.resp)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(out, "\n")
|
||||
|
||||
return out.Bytes()
|
||||
generateReqResps(shardReqResps)
|
||||
generateReqResps(cdcReqResps)
|
||||
generateReqResps(shuckleReqResps)
|
||||
generateReqResps(blocksReqResps)
|
||||
|
||||
return hOut.Bytes()
|
||||
}
|
||||
|
||||
func cppType(t reflect.Type) string {
|
||||
@@ -549,6 +902,8 @@ func generateCppSingle(hpp io.Writer, cpp io.Writer, t reflect.Type) {
|
||||
fld := t.Field(i)
|
||||
if cppType(fld.Type) == "uint8_t" {
|
||||
fmt.Fprintf(cpp, " << \"%s=\" << (int)x.%s", fld.Name, cppFieldName(fld))
|
||||
} else if fld.Type.Kind() == reflect.String {
|
||||
fmt.Fprintf(cpp, " << \"%s=\" << GoLangQuotedStringFmt(x.%s.data(), x.%s.size())", fld.Name, cppFieldName(fld), cppFieldName(fld))
|
||||
} else {
|
||||
fmt.Fprintf(cpp, " << \"%s=\" << x.%s", fld.Name, cppFieldName(fld))
|
||||
}
|
||||
@@ -899,7 +1254,7 @@ func main() {
|
||||
"BLOCK_TOO_BIG",
|
||||
}
|
||||
|
||||
shardReqResps := []reqRespType{
|
||||
kernelShardReqResps := []reqRespType{
|
||||
{
|
||||
0x01,
|
||||
reflect.TypeOf(msgs.LookupReq{}),
|
||||
@@ -910,11 +1265,6 @@ func main() {
|
||||
reflect.TypeOf(msgs.StatFileReq{}),
|
||||
reflect.TypeOf(msgs.StatFileResp{}),
|
||||
},
|
||||
{
|
||||
0x03,
|
||||
reflect.TypeOf(msgs.StatTransientFileReq{}),
|
||||
reflect.TypeOf(msgs.StatTransientFileResp{}),
|
||||
},
|
||||
{
|
||||
0x04,
|
||||
reflect.TypeOf(msgs.StatDirectoryReq{}),
|
||||
@@ -961,25 +1311,33 @@ func main() {
|
||||
reflect.TypeOf(msgs.SameDirectoryRenameResp{}),
|
||||
},
|
||||
{
|
||||
0x0D,
|
||||
reflect.TypeOf(msgs.SetDirectoryInfoReq{}),
|
||||
reflect.TypeOf(msgs.SetDirectoryInfoResp{}),
|
||||
0x10,
|
||||
reflect.TypeOf(msgs.AddInlineSpanReq{}),
|
||||
reflect.TypeOf(msgs.AddInlineSpanResp{}),
|
||||
},
|
||||
{
|
||||
0x0E,
|
||||
reflect.TypeOf(msgs.SnapshotLookupReq{}),
|
||||
reflect.TypeOf(msgs.SnapshotLookupResp{}),
|
||||
},
|
||||
}
|
||||
|
||||
shardReqResps := append(kernelShardReqResps, []reqRespType{
|
||||
{
|
||||
0x03,
|
||||
reflect.TypeOf(msgs.StatTransientFileReq{}),
|
||||
reflect.TypeOf(msgs.StatTransientFileResp{}),
|
||||
},
|
||||
{
|
||||
0x0D,
|
||||
reflect.TypeOf(msgs.SetDirectoryInfoReq{}),
|
||||
reflect.TypeOf(msgs.SetDirectoryInfoResp{}),
|
||||
},
|
||||
{
|
||||
0x0F,
|
||||
reflect.TypeOf(msgs.ExpireTransientFileReq{}),
|
||||
reflect.TypeOf(msgs.ExpireTransientFileResp{}),
|
||||
},
|
||||
{
|
||||
0x10,
|
||||
reflect.TypeOf(msgs.AddInlineSpanReq{}),
|
||||
reflect.TypeOf(msgs.AddInlineSpanResp{}),
|
||||
},
|
||||
// PRIVATE OPERATIONS -- These are safe operations, but we don't want the FS client itself
|
||||
// to perform them. TODO make privileged?
|
||||
{
|
||||
@@ -1078,9 +1436,9 @@ func main() {
|
||||
reflect.TypeOf(msgs.MakeFileTransientReq{}),
|
||||
reflect.TypeOf(msgs.MakeFileTransientResp{}),
|
||||
},
|
||||
}
|
||||
}...)
|
||||
|
||||
cdcReqResps := []reqRespType{
|
||||
kernelCdcReqResps := []reqRespType{
|
||||
{
|
||||
0x01,
|
||||
reflect.TypeOf(msgs.MakeDirectoryReq{}),
|
||||
@@ -1101,6 +1459,9 @@ func main() {
|
||||
reflect.TypeOf(msgs.RenameDirectoryReq{}),
|
||||
reflect.TypeOf(msgs.RenameDirectoryResp{}),
|
||||
},
|
||||
}
|
||||
|
||||
cdcReqResps := append(kernelCdcReqResps, []reqRespType{
|
||||
{
|
||||
0x05,
|
||||
reflect.TypeOf(msgs.HardUnlinkDirectoryReq{}),
|
||||
@@ -1111,9 +1472,22 @@ func main() {
|
||||
reflect.TypeOf(msgs.CrossShardHardUnlinkFileReq{}),
|
||||
reflect.TypeOf(msgs.CrossShardHardUnlinkFileResp{}),
|
||||
},
|
||||
}...)
|
||||
|
||||
kernelShuckleReqResps := []reqRespType{
|
||||
{
|
||||
0x03,
|
||||
reflect.TypeOf(msgs.ShardsReq{}),
|
||||
reflect.TypeOf(msgs.ShardsResp{}),
|
||||
},
|
||||
{
|
||||
0x07,
|
||||
reflect.TypeOf(msgs.CdcReq{}),
|
||||
reflect.TypeOf(msgs.CdcResp{}),
|
||||
},
|
||||
}
|
||||
|
||||
shuckleReqResps := []reqRespType{
|
||||
shuckleReqResps := append(kernelShuckleReqResps, []reqRespType{
|
||||
{
|
||||
0x01,
|
||||
reflect.TypeOf(msgs.BlockServicesForShardReq{}),
|
||||
@@ -1124,11 +1498,6 @@ func main() {
|
||||
reflect.TypeOf(msgs.RegisterBlockServicesReq{}),
|
||||
reflect.TypeOf(msgs.RegisterBlockServicesResp{}),
|
||||
},
|
||||
{
|
||||
0x03,
|
||||
reflect.TypeOf(msgs.ShardsReq{}),
|
||||
reflect.TypeOf(msgs.ShardsResp{}),
|
||||
},
|
||||
{
|
||||
0x04,
|
||||
reflect.TypeOf(msgs.RegisterShardReq{}),
|
||||
@@ -1144,19 +1513,9 @@ func main() {
|
||||
reflect.TypeOf(msgs.RegisterCdcReq{}),
|
||||
reflect.TypeOf(msgs.RegisterCdcResp{}),
|
||||
},
|
||||
{
|
||||
0x07,
|
||||
reflect.TypeOf(msgs.CdcReq{}),
|
||||
reflect.TypeOf(msgs.CdcResp{}),
|
||||
},
|
||||
}
|
||||
}...)
|
||||
|
||||
blocksReqResps := []reqRespType{
|
||||
{
|
||||
0x01,
|
||||
reflect.TypeOf(msgs.EraseBlockReq{}),
|
||||
reflect.TypeOf(msgs.EraseBlockResp{}),
|
||||
},
|
||||
kernelBlocksReqResps := []reqRespType{
|
||||
{
|
||||
0x02,
|
||||
reflect.TypeOf(msgs.FetchBlockReq{}),
|
||||
@@ -1174,32 +1533,43 @@ func main() {
|
||||
},
|
||||
}
|
||||
|
||||
extras := []reflect.Type{
|
||||
reflect.TypeOf(msgs.TransientFile{}),
|
||||
reflect.TypeOf(msgs.FetchedBlock{}),
|
||||
blocksReqResps := append(kernelBlocksReqResps, []reqRespType{
|
||||
{
|
||||
0x01,
|
||||
reflect.TypeOf(msgs.EraseBlockReq{}),
|
||||
reflect.TypeOf(msgs.EraseBlockResp{}),
|
||||
},
|
||||
}...)
|
||||
|
||||
kernelExtras := []reflect.Type{
|
||||
reflect.TypeOf(msgs.DirectoryInfoEntry{}),
|
||||
reflect.TypeOf(msgs.DirectoryInfo{}),
|
||||
reflect.TypeOf(msgs.CurrentEdge{}),
|
||||
reflect.TypeOf(msgs.Edge{}),
|
||||
reflect.TypeOf(msgs.BlockInfo{}),
|
||||
reflect.TypeOf(msgs.BlockProof{}),
|
||||
reflect.TypeOf(msgs.BlockService{}),
|
||||
reflect.TypeOf(msgs.ShardInfo{}),
|
||||
reflect.TypeOf(msgs.BlockPolicyEntry{}),
|
||||
reflect.TypeOf(msgs.SpanPolicyEntry{}),
|
||||
reflect.TypeOf(msgs.StripePolicy{}),
|
||||
reflect.TypeOf(msgs.SnapshotLookupEdge{}),
|
||||
reflect.TypeOf(msgs.FetchedBlock{}),
|
||||
reflect.TypeOf(msgs.FetchedSpanHeader{}),
|
||||
reflect.TypeOf(msgs.FetchedInlineSpan{}),
|
||||
reflect.TypeOf(msgs.FetchedBlocksSpan{}),
|
||||
reflect.TypeOf(msgs.BlockProof{}),
|
||||
reflect.TypeOf(msgs.DirectoryInfoEntry{}),
|
||||
reflect.TypeOf(msgs.DirectoryInfo{}),
|
||||
reflect.TypeOf(msgs.BlockService{}),
|
||||
}
|
||||
|
||||
extras := append(kernelExtras, []reflect.Type{
|
||||
reflect.TypeOf(msgs.TransientFile{}),
|
||||
reflect.TypeOf(msgs.Edge{}),
|
||||
reflect.TypeOf(msgs.FullReadDirCursor{}),
|
||||
reflect.TypeOf(msgs.EntryNewBlockInfo{}),
|
||||
reflect.TypeOf(msgs.SnapshotLookupEdge{}),
|
||||
reflect.TypeOf(msgs.BlockServiceInfo{}),
|
||||
reflect.TypeOf(msgs.ShardInfo{}),
|
||||
reflect.TypeOf(msgs.RegisterShardInfo{}),
|
||||
reflect.TypeOf(msgs.SpanPolicyEntry{}),
|
||||
reflect.TypeOf(msgs.SpanPolicy{}),
|
||||
reflect.TypeOf(msgs.BlockPolicyEntry{}),
|
||||
reflect.TypeOf(msgs.BlockPolicy{}),
|
||||
reflect.TypeOf(msgs.SnapshotPolicy{}),
|
||||
reflect.TypeOf(msgs.StripePolicy{}),
|
||||
}
|
||||
}...)
|
||||
|
||||
goCode := generateGo(errors, shardReqResps, cdcReqResps, shuckleReqResps, blocksReqResps, extras)
|
||||
goOutFileName := fmt.Sprintf("%s/msgs_bincode.go", cwd)
|
||||
@@ -1210,13 +1580,14 @@ func main() {
|
||||
defer goOutFile.Close()
|
||||
goOutFile.Write(goCode)
|
||||
|
||||
cOutFilename := fmt.Sprintf("%s/../../c/eggs_msgs.h", cwd)
|
||||
cOutFile, err := os.Create(cOutFilename)
|
||||
kmodHOutFilename := fmt.Sprintf("%s/../../kmod/bincodegen.h", cwd)
|
||||
kmodHOutFile, err := os.Create(kmodHOutFilename)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer cOutFile.Close()
|
||||
cOutFile.Write(generateC(errors, shardReqResps, cdcReqResps, extras))
|
||||
defer kmodHOutFile.Close()
|
||||
kmodHBytes := generateKmod(errors, kernelShardReqResps, kernelCdcReqResps, kernelShuckleReqResps, kernelBlocksReqResps, kernelExtras)
|
||||
kmodHOutFile.Write(kmodHBytes)
|
||||
|
||||
hppOutFilename := fmt.Sprintf("%s/../../cpp/core/MsgsGen.hpp", cwd)
|
||||
hppOutFile, err := os.Create(hppOutFilename)
|
||||
|
||||
@@ -149,13 +149,8 @@ func registerPeriodically(
|
||||
}
|
||||
|
||||
func checkEraseCertificate(log *lib.Logger, blockServiceId msgs.BlockServiceId, cipher cipher.Block, req *msgs.EraseBlockReq) msgs.ErrCode {
|
||||
// compute mac
|
||||
w := bytes.NewBuffer([]byte{})
|
||||
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
|
||||
w.Write([]byte{'e'})
|
||||
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
|
||||
expectedMac := lib.CBCMAC(cipher, w.Bytes())
|
||||
if expectedMac != req.Certificate {
|
||||
expectedMac, good := lib.CheckBlockEraseCertificate(blockServiceId, cipher, req)
|
||||
if !good {
|
||||
log.RaiseAlert(fmt.Errorf("bad MAC, got %v, expected %v", req.Certificate, expectedMac))
|
||||
return msgs.BAD_CERTIFICATE
|
||||
}
|
||||
@@ -221,17 +216,10 @@ func sendFetchBlock(log *lib.Logger, blockServiceId msgs.BlockServiceId, basePat
|
||||
}
|
||||
|
||||
func checkWriteCertificate(log *lib.Logger, cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) msgs.ErrCode {
|
||||
// Compute mac
|
||||
w := bytes.NewBuffer([]byte{})
|
||||
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
|
||||
w.Write([]byte{'w'})
|
||||
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
|
||||
binary.Write(w, binary.LittleEndian, uint32(req.Crc))
|
||||
binary.Write(w, binary.LittleEndian, uint32(req.Size))
|
||||
expectedMac := lib.CBCMAC(cipher, w.Bytes())
|
||||
if expectedMac != req.Certificate {
|
||||
log.Debug("mac computed for %v %v %v %v", uint64(blockServiceId), uint64(req.BlockId), uint32(req.Crc), uint32(req.Size))
|
||||
log.RaiseAlert(fmt.Errorf("bad MAC for %v, got %v, expected %v", w.Bytes(), req.Certificate, expectedMac))
|
||||
expectedMac, good := lib.CheckBlockWriteCertificate(cipher, blockServiceId, req)
|
||||
if !good {
|
||||
log.Debug("mac computed for %v %v %v %v", blockServiceId, req.BlockId, req.Crc, req.Size)
|
||||
log.RaiseAlert(fmt.Errorf("bad MAC, got %v, expected %v", req.Certificate, expectedMac))
|
||||
return msgs.BAD_CERTIFICATE
|
||||
}
|
||||
return 0
|
||||
@@ -349,6 +337,7 @@ NextRequest:
|
||||
continue
|
||||
}
|
||||
log.Debug("servicing request of type %T from %v", req, conn.RemoteAddr())
|
||||
log.Trace("req %+v", req)
|
||||
switch whichReq := req.(type) {
|
||||
case *msgs.EraseBlockReq:
|
||||
if err := checkEraseCertificate(log, blockServiceId, blockService.cipher, whichReq); err != 0 {
|
||||
@@ -647,6 +636,7 @@ func main() {
|
||||
defer func() { handleRecover(log, terminateChan, recover()) }()
|
||||
for {
|
||||
conn, err := listener1.Accept()
|
||||
log.Trace("new conn %+v", conn)
|
||||
if err != nil {
|
||||
terminateChan <- err
|
||||
return
|
||||
@@ -662,6 +652,7 @@ func main() {
|
||||
defer func() { handleRecover(log, terminateChan, recover()) }()
|
||||
for {
|
||||
conn, err := listener2.Accept()
|
||||
log.Trace("new conn %+v", conn)
|
||||
if err != nil {
|
||||
terminateChan <- err
|
||||
return
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"xtx/eggsfs/crc32c"
|
||||
"xtx/eggsfs/lib"
|
||||
"xtx/eggsfs/msgs"
|
||||
)
|
||||
@@ -341,6 +344,44 @@ func main() {
|
||||
run: cpOutofRun,
|
||||
}
|
||||
|
||||
blockReqCmd := flag.NewFlagSet("write-block-req", flag.ExitOnError)
|
||||
blockReqBlockId := blockReqCmd.Uint64("b", 0, "Block id")
|
||||
blockReqBlockService := blockReqCmd.Uint64("bs", 0, "Block service")
|
||||
blockReqFile := blockReqCmd.String("file", "", "")
|
||||
blockReqRun := func() {
|
||||
resp, err := lib.ShuckleRequest(log, *shuckleAddress, &msgs.AllBlockServicesReq{})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
blockServices := resp.(*msgs.AllBlockServicesResp)
|
||||
var blockServiceInfo msgs.BlockServiceInfo
|
||||
for _, bsInfo := range blockServices.BlockServices {
|
||||
if bsInfo.Id == msgs.BlockServiceId(*blockReqBlockService) {
|
||||
blockServiceInfo = bsInfo
|
||||
break
|
||||
}
|
||||
}
|
||||
cipher, err := aes.NewCipher(blockServiceInfo.SecretKey[:])
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fileContents, err := ioutil.ReadFile(*blockReqFile)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
req := msgs.WriteBlockReq{
|
||||
BlockId: msgs.BlockId(*blockReqBlockId),
|
||||
Crc: msgs.Crc(crc32c.Sum(0, fileContents)),
|
||||
Size: uint32(len(fileContents)),
|
||||
}
|
||||
req.Certificate = lib.BlockWriteCertificate(cipher, blockServiceInfo.Id, &req)
|
||||
log.Info("request: %+v", req)
|
||||
}
|
||||
commands["write-block-req"] = commandSpec{
|
||||
flags: blockReqCmd,
|
||||
run: blockReqRun,
|
||||
}
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if flag.NArg() < 1 {
|
||||
|
||||
+64
-47
@@ -153,6 +153,50 @@ type eggsNode struct {
|
||||
id msgs.InodeId
|
||||
}
|
||||
|
||||
func getattr(id msgs.InodeId, out *fuse.Attr) syscall.Errno {
|
||||
log.Debug("getattr inode=%v", id)
|
||||
|
||||
out.Ino = uint64(id)
|
||||
out.Mode = inodeTypeToMode(id.Type())
|
||||
if id.Type() == msgs.DIRECTORY {
|
||||
resp := msgs.StatDirectoryResp{}
|
||||
if err := shardRequest(id.Shard(), &msgs.StatDirectoryReq{Id: id}, &resp); err != 0 {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
fileStatCacheMu.RLock()
|
||||
cached, found := fileStatCache[id]
|
||||
fileStatCacheMu.RUnlock()
|
||||
|
||||
if !found {
|
||||
resp := msgs.StatFileResp{}
|
||||
if err := shardRequest(id.Shard(), &msgs.StatFileReq{Id: id}, &resp); err != 0 {
|
||||
return err
|
||||
}
|
||||
cached.mtime = resp.Mtime
|
||||
cached.size = resp.Size
|
||||
fileStatCacheMu.Lock()
|
||||
fileStatCache[id] = cached
|
||||
fileStatCacheMu.Unlock()
|
||||
}
|
||||
|
||||
log.Debug("getattr size=%v", cached.size)
|
||||
out.Size = cached.size
|
||||
mtime := msgs.EGGS_EPOCH + uint64(cached.mtime)
|
||||
mtimesec := mtime / 1000000000
|
||||
mtimens := uint32(mtime % 1000000000)
|
||||
out.Ctime = mtimesec
|
||||
out.Ctimensec = mtimens
|
||||
out.Mtime = mtimesec
|
||||
out.Mtimensec = mtimens
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (n *eggsNode) Getattr(ctx context.Context, f fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
|
||||
return getattr(n.id, &out.Attr)
|
||||
}
|
||||
|
||||
func (n *eggsNode) Lookup(
|
||||
ctx context.Context, name string, out *fuse.EntryOut,
|
||||
) (*fs.Inode, syscall.Errno) {
|
||||
@@ -172,6 +216,9 @@ func (n *eggsNode) Lookup(
|
||||
default:
|
||||
panic(fmt.Errorf("bad type %v", resp.TargetId.Type()))
|
||||
}
|
||||
if err := getattr(resp.TargetId, &out.Attr); err != 0 {
|
||||
return nil, err
|
||||
}
|
||||
return n.NewInode(ctx, &eggsNode{id: resp.TargetId}, fs.StableAttr{Ino: uint64(resp.TargetId), Mode: mode}), 0
|
||||
}
|
||||
|
||||
@@ -316,61 +363,29 @@ func (n *eggsNode) Mkdir(
|
||||
return n.NewInode(ctx, &eggsNode{id: resp.Id}, fs.StableAttr{Ino: uint64(resp.Id), Mode: syscall.S_IFDIR}), 0
|
||||
}
|
||||
|
||||
func (n *eggsNode) Getattr(ctx context.Context, f fs.FileHandle, out *fuse.AttrOut) syscall.Errno {
|
||||
log.Debug("getattr inode=%v", n.id)
|
||||
|
||||
out.Ino = uint64(n.id)
|
||||
out.Mode = inodeTypeToMode(n.id.Type())
|
||||
if n.id.Type() == msgs.DIRECTORY {
|
||||
resp := msgs.StatDirectoryResp{}
|
||||
if err := shardRequest(n.id.Shard(), &msgs.StatDirectoryReq{Id: n.id}, &resp); err != 0 {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
fileStatCacheMu.RLock()
|
||||
cached, found := fileStatCache[n.id]
|
||||
fileStatCacheMu.RUnlock()
|
||||
|
||||
if !found {
|
||||
resp := msgs.StatFileResp{}
|
||||
if err := shardRequest(n.id.Shard(), &msgs.StatFileReq{Id: n.id}, &resp); err != 0 {
|
||||
return err
|
||||
}
|
||||
cached.mtime = resp.Mtime
|
||||
cached.size = resp.Size
|
||||
fileStatCacheMu.Lock()
|
||||
fileStatCache[n.id] = cached
|
||||
fileStatCacheMu.Unlock()
|
||||
}
|
||||
|
||||
out.Size = cached.size
|
||||
mtime := msgs.EGGS_EPOCH + uint64(cached.mtime)
|
||||
mtimesec := mtime / 1000000000
|
||||
mtimens := uint32(mtime % 1000000000)
|
||||
out.Ctime = mtimesec
|
||||
out.Ctimensec = mtimens
|
||||
out.Mtime = mtimesec
|
||||
out.Mtimensec = mtimens
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (f *transientFile) writeSpan() syscall.Errno {
|
||||
if f.data == nil {
|
||||
return 0 // happens when dup is called on file
|
||||
}
|
||||
spanSize := uint32(len(*f.data))
|
||||
maxSize := int(f.spanPolicy.Entries[len(f.spanPolicy.Entries)-1].MaxSize)
|
||||
boundary := maxSize
|
||||
if len(*f.data) < maxSize {
|
||||
boundary = len(*f.data)
|
||||
}
|
||||
spanData := (*f.data)[:boundary]
|
||||
leftover := append([]byte{}, (*f.data)[boundary:]...)
|
||||
spanSize := uint32(len(spanData))
|
||||
if spanSize == 0 {
|
||||
return 0
|
||||
}
|
||||
var err error
|
||||
*f.data, err = client.CreateSpan(log, []msgs.BlockServiceId{}, f.spanPolicy, f.blockPolicy, f.stripePolicy, f.id, f.cookie, f.written, spanSize, *f.data)
|
||||
*f.data, err = client.CreateSpan(log, []msgs.BlockServiceId{}, f.spanPolicy, f.blockPolicy, f.stripePolicy, f.id, f.cookie, f.written, spanSize, spanData)
|
||||
if err != nil {
|
||||
f.valid = false
|
||||
return eggsErrToErrno(err)
|
||||
}
|
||||
f.written += uint64(spanSize)
|
||||
*f.data = (*f.data)[:0]
|
||||
*f.data = append((*f.data)[:0], leftover...)
|
||||
|
||||
return 0
|
||||
}
|
||||
@@ -514,8 +529,8 @@ type openFile struct {
|
||||
id msgs.InodeId
|
||||
size uint64 // total size of file
|
||||
|
||||
spanReader lib.TaintableReadCloser // the span we're currently reading from
|
||||
readOffset int64 // the offset we're currently reading at, in the span reader above
|
||||
spanReader lib.PuttableReadCloser // the span we're currently reading from
|
||||
readOffset int64 // the offset we're currently reading at, in the span reader above
|
||||
|
||||
// We store the last successful span resp, to avoid re-issuing it if we can
|
||||
spans *msgs.FileSpansResp
|
||||
@@ -619,7 +634,6 @@ func (of *openFile) readInternal(dest []byte, off int64) (int64, syscall.Errno)
|
||||
if off != of.readOffset {
|
||||
log.Debug("mismatching offset (%v vs %v), will reset span", off, of.readOffset)
|
||||
if of.spanReader != nil {
|
||||
of.spanReader.Taint()
|
||||
of.spanReader.Close()
|
||||
of.spanReader = nil
|
||||
}
|
||||
@@ -650,7 +664,6 @@ func (of *openFile) readInternal(dest []byte, off int64) (int64, syscall.Errno)
|
||||
|
||||
func (of *openFile) Flush(ctx context.Context) syscall.Errno {
|
||||
if of.spanReader != nil {
|
||||
of.spanReader.Taint()
|
||||
of.spanReader.Close()
|
||||
}
|
||||
return 0
|
||||
@@ -865,7 +878,11 @@ func main() {
|
||||
root := eggsNode{
|
||||
id: msgs.ROOT_DIR_INODE_ID,
|
||||
}
|
||||
server, err := fs.Mount(mountPoint, &root, &fs.Options{Logger: logger})
|
||||
fuseOptions := &fs.Options{
|
||||
Logger: logger,
|
||||
}
|
||||
// fuseOptions.Debug = *trace
|
||||
server, err := fs.Mount(mountPoint, &root, fuseOptions)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Could not mount: %v", err)
|
||||
os.Exit(1)
|
||||
|
||||
+18
-4
@@ -34,6 +34,7 @@ func main() {
|
||||
shuckleHttpPort := flag.Uint("shuckle-http-port", 10000, "")
|
||||
startingPort := flag.Uint("start-port", 10002, "The services will be assigned port in this order, CDC, shard_000, ..., shard_255, bs_0, ..., bs_n. If 0, ports will be chosen randomly.")
|
||||
repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).")
|
||||
binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.")
|
||||
flag.Parse()
|
||||
noRunawayArgs()
|
||||
|
||||
@@ -92,10 +93,23 @@ func main() {
|
||||
}
|
||||
log := lib.NewLogger(level, logOut)
|
||||
|
||||
fmt.Printf("building shard/cdc/blockservice/shuckle\n")
|
||||
|
||||
cppExes := managedprocess.BuildCppExes(log, *repoDir, *buildType)
|
||||
goExes := managedprocess.BuildGoExes(log, *repoDir)
|
||||
var cppExes *managedprocess.CppExes
|
||||
var goExes *managedprocess.GoExes
|
||||
if *binariesDir != "" {
|
||||
cppExes = &managedprocess.CppExes{
|
||||
ShardExe: path.Join(*binariesDir, "eggsshard"),
|
||||
CDCExe: path.Join(*binariesDir, "eggscdc"),
|
||||
}
|
||||
goExes = &managedprocess.GoExes{
|
||||
ShuckleExe: path.Join(*binariesDir, "eggsshuckle"),
|
||||
BlocksExe: path.Join(*binariesDir, "eggsblocks"),
|
||||
FuseExe: path.Join(*binariesDir, "eggsfuse"),
|
||||
}
|
||||
} else {
|
||||
fmt.Printf("building shard/cdc/blockservice/shuckle\n")
|
||||
cppExes = managedprocess.BuildCppExes(log, *repoDir, *buildType)
|
||||
goExes = managedprocess.BuildGoExes(log, *repoDir)
|
||||
}
|
||||
|
||||
terminateChan := make(chan any, 1)
|
||||
|
||||
|
||||
@@ -164,35 +164,40 @@ func handleRequest(log *lib.Logger, s *state, conn *net.TCPConn) {
|
||||
conn.SetLinger(0) // poor man error handling for now
|
||||
defer conn.Close()
|
||||
|
||||
req, err := lib.ReadShuckleRequest(log, conn)
|
||||
if err != nil {
|
||||
log.RaiseAlert(fmt.Errorf("could not decode request: %w", err))
|
||||
return
|
||||
}
|
||||
log.Debug("handling request %T %+v", req, req)
|
||||
var resp msgs.ShuckleResponse
|
||||
for {
|
||||
req, err := lib.ReadShuckleRequest(log, conn)
|
||||
if err == io.EOF {
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
log.RaiseAlert(fmt.Errorf("could not decode request: %w", err))
|
||||
return
|
||||
}
|
||||
log.Debug("handling request %T %+v", req, req)
|
||||
var resp msgs.ShuckleResponse
|
||||
|
||||
switch whichReq := req.(type) {
|
||||
case *msgs.BlockServicesForShardReq:
|
||||
resp = handleBlockServicesForShard(log, s, conn, whichReq)
|
||||
case *msgs.RegisterBlockServicesReq:
|
||||
resp = handleRegisterBlockServices(log, s, conn, whichReq)
|
||||
case *msgs.ShardsReq:
|
||||
resp = handleShards(log, s, conn, whichReq)
|
||||
case *msgs.RegisterShardReq:
|
||||
resp = handleRegisterShard(log, s, conn, whichReq)
|
||||
case *msgs.AllBlockServicesReq:
|
||||
resp = handleAllBlockServicesReq(log, s, conn, whichReq)
|
||||
case *msgs.CdcReq:
|
||||
resp = handleCdcReq(log, s, conn, whichReq)
|
||||
case *msgs.RegisterCdcReq:
|
||||
resp = handleRegisterCdcReq(log, s, conn, whichReq)
|
||||
default:
|
||||
log.RaiseAlert(fmt.Errorf("bad req type %T", req))
|
||||
}
|
||||
log.Debug("sending back response %T", resp)
|
||||
if err := lib.WriteShuckleResponse(log, conn, resp); err != nil {
|
||||
log.RaiseAlert(fmt.Errorf("could not send response: %w", err))
|
||||
switch whichReq := req.(type) {
|
||||
case *msgs.BlockServicesForShardReq:
|
||||
resp = handleBlockServicesForShard(log, s, conn, whichReq)
|
||||
case *msgs.RegisterBlockServicesReq:
|
||||
resp = handleRegisterBlockServices(log, s, conn, whichReq)
|
||||
case *msgs.ShardsReq:
|
||||
resp = handleShards(log, s, conn, whichReq)
|
||||
case *msgs.RegisterShardReq:
|
||||
resp = handleRegisterShard(log, s, conn, whichReq)
|
||||
case *msgs.AllBlockServicesReq:
|
||||
resp = handleAllBlockServicesReq(log, s, conn, whichReq)
|
||||
case *msgs.CdcReq:
|
||||
resp = handleCdcReq(log, s, conn, whichReq)
|
||||
case *msgs.RegisterCdcReq:
|
||||
resp = handleRegisterCdcReq(log, s, conn, whichReq)
|
||||
default:
|
||||
log.RaiseAlert(fmt.Errorf("bad req type %T", req))
|
||||
}
|
||||
log.Debug("sending back response %T", resp)
|
||||
if err := lib.WriteShuckleResponse(log, conn, resp); err != nil {
|
||||
log.RaiseAlert(fmt.Errorf("could not send response: %w", err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -436,7 +441,7 @@ func handleIndex(ll *lib.Logger, state *state, w http.ResponseWriter, r *http.Re
|
||||
for _, bs := range state.blockServices {
|
||||
data.BlockServices = append(data.BlockServices, indexBlockService{
|
||||
Id: bs.Id,
|
||||
Addr1: fmt.Sprintf("%v:%v", net.IP(bs.Ip1[:]), bs.Port2),
|
||||
Addr1: fmt.Sprintf("%v:%v", net.IP(bs.Ip1[:]), bs.Port1),
|
||||
Addr2: fmt.Sprintf("%v:%v", net.IP(bs.Ip2[:]), bs.Port2),
|
||||
StorageClass: bs.StorageClass,
|
||||
FailureDomain: string(bs.FailureDomain[:bytes.Index(bs.FailureDomain[:], []byte{0})]),
|
||||
|
||||
+236
-61
@@ -5,12 +5,13 @@ import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"math"
|
||||
"math/rand"
|
||||
"os"
|
||||
"path"
|
||||
"runtime/debug"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"xtx/eggsfs/lib"
|
||||
"xtx/eggsfs/msgs"
|
||||
"xtx/eggsfs/wyhash"
|
||||
@@ -25,6 +26,7 @@ type fsTestOpts struct {
|
||||
inlineFileProb float64
|
||||
maxFileSize int
|
||||
spanSize int
|
||||
checkThreads int
|
||||
}
|
||||
|
||||
type fsTestHarness[Id comparable] interface {
|
||||
@@ -35,14 +37,14 @@ type fsTestHarness[Id comparable] interface {
|
||||
checkFileData(log *lib.Logger, id Id, size uint64, dataSeed uint64)
|
||||
// files, directories
|
||||
readDirectory(log *lib.Logger, dir Id) ([]string, []string)
|
||||
removeFile(log *lib.Logger, dir Id, name string)
|
||||
removeDirectory(log *lib.Logger, dir Id, name string)
|
||||
}
|
||||
|
||||
type apiFsTestHarness struct {
|
||||
client *lib.Client
|
||||
dirInfoCache *lib.DirInfoCache
|
||||
// buffer we use both to create and to read files.
|
||||
fileContentsBuf []byte
|
||||
readBufPool *lib.ReadSpanBufPool
|
||||
readBufPool *lib.ReadSpanBufPool
|
||||
}
|
||||
|
||||
func (c *apiFsTestHarness) createDirectory(log *lib.Logger, owner msgs.InodeId, name string) (id msgs.InodeId, creationTime msgs.EggsTime) {
|
||||
@@ -106,7 +108,7 @@ func (c *apiFsTestHarness) rename(
|
||||
func (c *apiFsTestHarness) createFile(
|
||||
log *lib.Logger, owner msgs.InodeId, spanSize uint32, name string, size uint64, dataSeed uint64,
|
||||
) (msgs.InodeId, msgs.EggsTime) {
|
||||
return createFile(log, c.client, c.dirInfoCache, owner, spanSize, name, size, dataSeed, &c.fileContentsBuf)
|
||||
return createFile(log, c.client, c.dirInfoCache, owner, spanSize, name, size, dataSeed, c.readBufPool)
|
||||
}
|
||||
|
||||
func (c *apiFsTestHarness) readDirectory(log *lib.Logger, dir msgs.InodeId) (files []string, dirs []string) {
|
||||
@@ -121,21 +123,21 @@ func (c *apiFsTestHarness) readDirectory(log *lib.Logger, dir msgs.InodeId) (fil
|
||||
return files, dirs
|
||||
}
|
||||
|
||||
func checkFileData(actualData []byte, expectedData []byte) {
|
||||
func checkFileData(id any, from int, to int, actualData []byte, expectedData []byte) {
|
||||
if !bytes.Equal(actualData, expectedData) {
|
||||
dir, err := os.MkdirTemp("", "eggs-fstest-files.")
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("mismatching data, could not create temp directory"))
|
||||
panic(fmt.Errorf("mismatching data (%v,%v) for file %v, could not create temp directory", from, to, id))
|
||||
}
|
||||
expectedPath := path.Join(dir, "expected")
|
||||
actualPath := path.Join(dir, "actual")
|
||||
if err := os.WriteFile(expectedPath, expectedData, 0644); err != nil {
|
||||
panic(fmt.Errorf("mismatching data, could not create data file"))
|
||||
panic(fmt.Errorf("mismatching data (%v,%v), could not create data file", from, to))
|
||||
}
|
||||
if err := os.WriteFile(actualPath, actualData, 0644); err != nil {
|
||||
panic(fmt.Errorf("mismatching data, could not create data file"))
|
||||
panic(fmt.Errorf("mismatching data (%v,%v), could not create data file", from, to))
|
||||
}
|
||||
panic(fmt.Errorf("mismatching data, expected data is in %v, found data is in %v", expectedPath, actualPath))
|
||||
panic(fmt.Errorf("mismatching data (%v,%v), expected data is in %v, found data is in %v", from, to, expectedPath, actualPath))
|
||||
}
|
||||
|
||||
}
|
||||
@@ -156,19 +158,39 @@ func ensureLen(buf []byte, l int) []byte {
|
||||
}
|
||||
|
||||
func (c *apiFsTestHarness) checkFileData(log *lib.Logger, id msgs.InodeId, size uint64, dataSeed uint64) {
|
||||
fileData := readFile(log, c.readBufPool, c.client, id, &c.fileContentsBuf)
|
||||
fullSize := int(size)
|
||||
c.fileContentsBuf = ensureLen(c.fileContentsBuf, fullSize)
|
||||
expectedData := c.fileContentsBuf[:int(size)]
|
||||
wyhash.New(dataSeed).Read(expectedData[:int(size)])
|
||||
checkFileData(fileData, expectedData)
|
||||
actualData := c.readBufPool.Get(int(size))
|
||||
defer c.readBufPool.Put(actualData)
|
||||
*actualData = readFile(log, c.readBufPool, c.client, id, *actualData)
|
||||
expectedData := c.readBufPool.Get(int(size))
|
||||
defer c.readBufPool.Put(expectedData)
|
||||
wyhash.New(dataSeed).Read(*expectedData)
|
||||
checkFileData(id, 0, int(size), *actualData, *expectedData)
|
||||
}
|
||||
|
||||
func (c *apiFsTestHarness) removeFile(log *lib.Logger, ownerId msgs.InodeId, name string) {
|
||||
lookupResp := msgs.LookupResp{}
|
||||
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.LookupReq{DirId: ownerId, Name: name}, &lookupResp); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.SoftUnlinkFileReq{OwnerId: ownerId, FileId: lookupResp.TargetId, Name: name, CreationTime: lookupResp.CreationTime}, &msgs.SoftUnlinkFileResp{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *apiFsTestHarness) removeDirectory(log *lib.Logger, ownerId msgs.InodeId, name string) {
|
||||
lookupResp := msgs.LookupResp{}
|
||||
if err := c.client.ShardRequest(log, ownerId.Shard(), &msgs.LookupReq{DirId: ownerId, Name: name}, &lookupResp); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := c.client.CDCRequest(log, &msgs.SoftUnlinkDirectoryReq{OwnerId: ownerId, TargetId: lookupResp.TargetId, Name: name, CreationTime: lookupResp.CreationTime}, &msgs.SoftUnlinkDirectoryResp{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
var _ = (fsTestHarness[msgs.InodeId])((*apiFsTestHarness)(nil))
|
||||
|
||||
type posixFsTestHarness struct {
|
||||
expectedDataBuf []byte
|
||||
actualDataBuf []byte
|
||||
bufPool *lib.ReadSpanBufPool
|
||||
}
|
||||
|
||||
func (*posixFsTestHarness) createDirectory(log *lib.Logger, owner string, name string) (fullPath string, creationTime msgs.EggsTime) {
|
||||
@@ -203,11 +225,34 @@ func (*posixFsTestHarness) rename(
|
||||
func (c *posixFsTestHarness) createFile(
|
||||
log *lib.Logger, dirFullPath string, spanSize uint32, name string, size uint64, dataSeed uint64,
|
||||
) (fileFullPath string, t msgs.EggsTime) {
|
||||
c.actualDataBuf = ensureLen(c.actualDataBuf, int(size))
|
||||
wyhash.New(dataSeed).Read(c.actualDataBuf[:size])
|
||||
actualDataBuf := c.bufPool.Get(int(size))
|
||||
defer c.bufPool.Put(actualDataBuf)
|
||||
rand := wyhash.New(dataSeed)
|
||||
rand.Read(*actualDataBuf)
|
||||
fileFullPath = path.Join(dirFullPath, name)
|
||||
log.LogStack(1, lib.DEBUG, "posix create file %v", fileFullPath)
|
||||
if err := os.WriteFile(fileFullPath, c.actualDataBuf, 0644); err != nil {
|
||||
f, err := os.Create(fileFullPath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
log.LogStack(1, lib.DEBUG, "posix create file %v (%v size)", fileFullPath, size)
|
||||
if size > 0 {
|
||||
// write in randomly sized chunks
|
||||
chunks := int(rand.Uint32()%10) + 1
|
||||
offsets := make([]int, chunks+1)
|
||||
offsets[0] = 0
|
||||
for i := 1; i < chunks; i++ {
|
||||
offsets[i] = int(rand.Uint64() % size)
|
||||
}
|
||||
offsets[chunks] = int(size)
|
||||
sort.Ints(offsets)
|
||||
for i := 0; i < chunks; i++ {
|
||||
log.Trace("writing from %v to %v", offsets[i], offsets[i+1])
|
||||
if _, err := f.Write((*actualDataBuf)[offsets[i]:offsets[i+1]]); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return fileFullPath, 0
|
||||
@@ -215,12 +260,12 @@ func (c *posixFsTestHarness) createFile(
|
||||
|
||||
func (c *posixFsTestHarness) readDirectory(log *lib.Logger, dirFullPath string) (files []string, dirs []string) {
|
||||
log.LogStack(1, lib.DEBUG, "posix readdir for %v", dirFullPath)
|
||||
fileInfo, err := ioutil.ReadDir(dirFullPath)
|
||||
fileInfo, err := os.ReadDir(dirFullPath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
for _, fi := range fileInfo {
|
||||
if fi.Mode().IsDir() {
|
||||
if fi.IsDir() {
|
||||
dirs = append(dirs, fi.Name())
|
||||
} else {
|
||||
files = append(files, fi.Name())
|
||||
@@ -230,21 +275,21 @@ func (c *posixFsTestHarness) readDirectory(log *lib.Logger, dirFullPath string)
|
||||
}
|
||||
|
||||
func (c *posixFsTestHarness) checkFileData(log *lib.Logger, fullFilePath string, size uint64, dataSeed uint64) {
|
||||
log.Debug("checking data for file %v", fullFilePath)
|
||||
fullSize := int(size)
|
||||
c.expectedDataBuf = ensureLen(c.expectedDataBuf, fullSize)
|
||||
wyhash.New(dataSeed).Read(c.expectedDataBuf[:int(size)])
|
||||
c.actualDataBuf = ensureLen(c.actualDataBuf, fullSize)
|
||||
expectedData := c.bufPool.Get(fullSize)
|
||||
defer c.bufPool.Put(expectedData)
|
||||
rand := wyhash.New(dataSeed)
|
||||
rand.Read(*expectedData)
|
||||
actualData := c.bufPool.Get(fullSize)
|
||||
defer c.bufPool.Put(actualData)
|
||||
f, err := os.Open(fullFilePath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer f.Close()
|
||||
// First we check the whole thing
|
||||
if _, err := io.ReadFull(f, c.actualDataBuf); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
checkFileData(c.actualDataBuf, c.expectedDataBuf)
|
||||
// then we start doing random reads around
|
||||
log.Debug("checking for file %v of expected len %v", fullFilePath, fullSize)
|
||||
// First do some random reads, hopefully stimulating span caches in some interesting way
|
||||
if fullSize > 1 {
|
||||
for i := 0; i < 10; i++ {
|
||||
offset := int(rand.Uint64() % uint64(fullSize-1))
|
||||
@@ -253,14 +298,31 @@ func (c *posixFsTestHarness) checkFileData(log *lib.Logger, fullFilePath string,
|
||||
if _, err := f.Seek(int64(offset), 0); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
expectedPartialData := c.expectedDataBuf[offset : offset+size]
|
||||
actualPartialData := c.actualDataBuf[offset : offset+size]
|
||||
expectedPartialData := (*expectedData)[offset : offset+size]
|
||||
actualPartialData := (*actualData)[offset : offset+size]
|
||||
if _, err := io.ReadFull(f, actualPartialData); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
checkFileData(actualPartialData, expectedPartialData)
|
||||
checkFileData(fullFilePath, offset, offset+size, actualPartialData, expectedPartialData)
|
||||
}
|
||||
}
|
||||
// Then we check the whole thing
|
||||
if _, err := f.Seek(0, 0); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
_, err = io.ReadFull(f, *actualData)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
checkFileData(fullFilePath, 0, fullSize, *actualData, *expectedData)
|
||||
}
|
||||
|
||||
func (c *posixFsTestHarness) removeFile(log *lib.Logger, ownerId string, name string) {
|
||||
os.Remove(path.Join(ownerId, name))
|
||||
}
|
||||
|
||||
func (c *posixFsTestHarness) removeDirectory(log *lib.Logger, ownerId string, name string) {
|
||||
os.Remove(path.Join(ownerId, name))
|
||||
}
|
||||
|
||||
var _ = (fsTestHarness[string])((*posixFsTestHarness)(nil))
|
||||
@@ -461,12 +523,12 @@ func (d *fsTestDir[Id]) check(log *lib.Logger, harness fsTestHarness[Id]) {
|
||||
panic(fmt.Errorf("bad number of edges -- got %v + %v, expected %v + %v", len(files), len(dirs), len(d.children.files), len(d.children.files)))
|
||||
}
|
||||
for _, fileName := range files {
|
||||
log.Debug("checking file %v", fileName)
|
||||
name, err := strconv.Atoi(fileName)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
file, present := d.children.files[name]
|
||||
log.Debug("checking file %v (size %v)", fileName, file.body.size)
|
||||
if !present {
|
||||
panic(fmt.Errorf("file %v not found", name))
|
||||
}
|
||||
@@ -491,6 +553,27 @@ func (d *fsTestDir[Id]) check(log *lib.Logger, harness fsTestHarness[Id]) {
|
||||
}
|
||||
}
|
||||
|
||||
func (d *fsTestDir[Id]) clean(log *lib.Logger, harness fsTestHarness[Id]) {
|
||||
files, dirs := harness.readDirectory(log, d.id)
|
||||
for _, fileName := range files {
|
||||
log.Debug("removing file %v", fileName)
|
||||
harness.removeFile(log, d.id, fileName)
|
||||
}
|
||||
for _, dirName := range dirs {
|
||||
log.Debug("cleaning dir %v", dirName)
|
||||
name, err := strconv.Atoi(dirName)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
dir, present := d.children.directories[name]
|
||||
if !present {
|
||||
panic(fmt.Errorf("directory %v not found", name))
|
||||
}
|
||||
dir.body.clean(log, harness)
|
||||
harness.removeDirectory(log, d.id, dirName)
|
||||
}
|
||||
}
|
||||
|
||||
// Just the first block service id we can find
|
||||
func findBlockServiceToPurge(log *lib.Logger, client *lib.Client) msgs.BlockServiceId {
|
||||
filesReq := msgs.VisitFilesReq{}
|
||||
@@ -519,15 +602,15 @@ func findBlockServiceToPurge(log *lib.Logger, client *lib.Client) msgs.BlockServ
|
||||
|
||||
func fsTestInternal[Id comparable](
|
||||
log *lib.Logger,
|
||||
state *fsTestState[Id],
|
||||
shuckleAddress string,
|
||||
opts *fsTestOpts,
|
||||
counters *lib.ClientCounters,
|
||||
harness fsTestHarness[Id],
|
||||
rootId Id,
|
||||
) {
|
||||
state := fsTestState[Id]{
|
||||
totalDirs: 1, // root dir
|
||||
rootDir: *newFsTestDir(rootId),
|
||||
if opts.checkThreads == 0 {
|
||||
panic(fmt.Errorf("must specify at least one check thread"))
|
||||
}
|
||||
branching := int(math.Log(float64(opts.numDirs)) / math.Log(float64(opts.depth)))
|
||||
rand := wyhash.New(42)
|
||||
@@ -582,30 +665,110 @@ func fsTestInternal[Id comparable](
|
||||
state.makeFile(log, harness, opts, rand, dir, state.totalDirs+state.totalFiles)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func fsTestInternalCheck[Id comparable](
|
||||
log *lib.Logger,
|
||||
state *fsTestState[Id],
|
||||
shuckleAddress string,
|
||||
opts *fsTestOpts,
|
||||
counters *lib.ClientCounters,
|
||||
harness fsTestHarness[Id],
|
||||
rootId Id,
|
||||
) {
|
||||
// finally, check that our view of the world is the real view of the world
|
||||
log.Info("checking directories/files")
|
||||
errsChans := make([](chan any), opts.checkThreads)
|
||||
for i := 0; i < opts.checkThreads; i++ {
|
||||
errChan := make(chan any)
|
||||
errsChans[i] = errChan
|
||||
go func() {
|
||||
defer func() {
|
||||
err := recover()
|
||||
if err != nil {
|
||||
log.Info("stacktrace for %v:", err)
|
||||
for _, line := range strings.Split(string(debug.Stack()), "\n") {
|
||||
log.Info(line)
|
||||
}
|
||||
}
|
||||
errChan <- err
|
||||
|
||||
}()
|
||||
state.rootDir.check(log, harness)
|
||||
}()
|
||||
}
|
||||
for i := 0; i < opts.checkThreads; i++ {
|
||||
err := <-errsChans[i]
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("checking thread %v failed: %v", i, err))
|
||||
}
|
||||
}
|
||||
state.rootDir.check(log, harness)
|
||||
// Now, try to migrate away from one block service, to stimulate that code path
|
||||
// in tests somewhere.
|
||||
{
|
||||
client, err := lib.NewClient(log, shuckleAddress, nil, counters, nil)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer client.Close()
|
||||
blockServiceToPurge := findBlockServiceToPurge(log, client)
|
||||
log.Info("will migrate block service %v", blockServiceToPurge)
|
||||
migrateStats := lib.MigrateStats{}
|
||||
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, blockServiceToPurge)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("could not migrate: %w", err))
|
||||
}
|
||||
if migrateStats.MigratedBlocks == 0 {
|
||||
panic(fmt.Errorf("migrate didn't migrate any blocks"))
|
||||
/*
|
||||
{
|
||||
client, err := lib.NewClient(log, shuckleAddress, nil, counters, nil)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer client.Close()
|
||||
blockServiceToPurge := findBlockServiceToPurge(log, client)
|
||||
log.Info("will migrate block service %v", blockServiceToPurge)
|
||||
migrateStats := lib.MigrateStats{}
|
||||
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, blockServiceToPurge)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("could not migrate: %w", err))
|
||||
}
|
||||
if migrateStats.MigratedBlocks == 0 {
|
||||
panic(fmt.Errorf("migrate didn't migrate any blocks"))
|
||||
}
|
||||
}
|
||||
// And check the state again, don't bother with multiple threads thoush
|
||||
state.rootDir.check(log, harness)
|
||||
*/
|
||||
// Now, remove everything -- the cleanup would do this anyway, but we want to stimulate
|
||||
// the removal paths in the filesystem tests.
|
||||
state.rootDir.clean(log, harness)
|
||||
}
|
||||
|
||||
func replaceMountPoint(mountPoint string, mountPointFuse string, fp string) string {
|
||||
return mountPointFuse + fp[len(mountPoint):]
|
||||
}
|
||||
|
||||
func convertDirToFuse(
|
||||
dir *fsTestDir[string],
|
||||
mountPoint string,
|
||||
mountPointFuse string,
|
||||
) *fsTestDir[string] {
|
||||
newDir := fsTestDir[string]{
|
||||
id: replaceMountPoint(mountPoint, mountPointFuse, dir.id),
|
||||
children: fsTestChildren[string]{
|
||||
files: make(map[int]fsTestChild[fsTestFile[string]]),
|
||||
directories: make(map[int]fsTestChild[fsTestDir[string]]),
|
||||
},
|
||||
}
|
||||
// And check the state again
|
||||
state.rootDir.check(log, harness)
|
||||
for i, file := range dir.children.files {
|
||||
newFile := file
|
||||
newFile.body.id = replaceMountPoint(mountPoint, mountPointFuse, newFile.body.id)
|
||||
newDir.children.files[i] = newFile
|
||||
}
|
||||
for i, childDir := range dir.children.directories {
|
||||
newChildDir := childDir
|
||||
newChildDir.body = *convertDirToFuse(&childDir.body, mountPoint, mountPointFuse)
|
||||
newDir.children.directories[i] = newChildDir
|
||||
}
|
||||
return &newDir
|
||||
}
|
||||
|
||||
func convertToFuse(
|
||||
state *fsTestState[string],
|
||||
mountPoint string,
|
||||
mountPointFuse string,
|
||||
) *fsTestState[string] {
|
||||
newState := *state
|
||||
newState.rootDir = *convertDirToFuse(&state.rootDir, mountPoint, mountPointFuse)
|
||||
return &newState
|
||||
}
|
||||
|
||||
func fsTest(
|
||||
@@ -626,9 +789,21 @@ func fsTest(
|
||||
dirInfoCache: lib.NewDirInfoCache(),
|
||||
readBufPool: lib.NewReadSpanBufPool(),
|
||||
}
|
||||
fsTestInternal[msgs.InodeId](log, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
|
||||
state := fsTestState[msgs.InodeId]{
|
||||
totalDirs: 1, // root dir
|
||||
rootDir: *newFsTestDir(msgs.ROOT_DIR_INODE_ID),
|
||||
}
|
||||
fsTestInternal[msgs.InodeId](log, &state, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
|
||||
fsTestInternalCheck[msgs.InodeId](log, &state, shuckleAddress, opts, counters, harness, msgs.ROOT_DIR_INODE_ID)
|
||||
} else {
|
||||
harness := &posixFsTestHarness{}
|
||||
fsTestInternal[string](log, shuckleAddress, opts, counters, harness, realFs)
|
||||
harness := &posixFsTestHarness{
|
||||
bufPool: lib.NewReadSpanBufPool(),
|
||||
}
|
||||
state := fsTestState[string]{
|
||||
totalDirs: 1, // root dir
|
||||
rootDir: *newFsTestDir(realFs),
|
||||
}
|
||||
fsTestInternal[string](log, &state, shuckleAddress, opts, counters, harness, realFs)
|
||||
fsTestInternalCheck[string](log, &state, shuckleAddress, opts, counters, harness, realFs)
|
||||
}
|
||||
}
|
||||
|
||||
+191
-20
@@ -4,17 +4,20 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"runtime/pprof"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"xtx/eggsfs/lib"
|
||||
"xtx/eggsfs/managedprocess"
|
||||
"xtx/eggsfs/msgs"
|
||||
"xtx/eggsfs/rs"
|
||||
|
||||
"golang.org/x/exp/constraints"
|
||||
)
|
||||
@@ -123,12 +126,65 @@ func runTest(
|
||||
}
|
||||
}
|
||||
|
||||
func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fuseMountPoint string, short bool, filter *regexp.Regexp) {
|
||||
defer func() { handleRecover(log, terminateChan, recover()) }()
|
||||
func getKmodDirRefreshTime() uint64 {
|
||||
bs, err := os.ReadFile("/proc/sys/fs/eggsfs/dir_refresh_time_ms")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
ms, err := strconv.ParseUint(strings.TrimSpace(string(bs)), 10, 64)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
|
||||
// We want to immediately clean up everything when we run the GC manually
|
||||
func setKmodDirRefreshTime(ms uint64) {
|
||||
out, err := exec.Command("sudo", "sh", "-c", fmt.Sprintf("echo %v > /proc/sys/fs/eggsfs/dir_refresh_time_ms", ms)).CombinedOutput()
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("could not mount filesystem (%w): %s", err, out))
|
||||
}
|
||||
}
|
||||
|
||||
type cfgOverrides map[string]string
|
||||
|
||||
func (i *cfgOverrides) Set(value string) error {
|
||||
parts := strings.Split(value, "=")
|
||||
if len(parts) != 2 {
|
||||
return fmt.Errorf("invalid cfg override %q", value)
|
||||
}
|
||||
(*i)[parts[0]] = parts[1]
|
||||
return nil
|
||||
}
|
||||
|
||||
func (i *cfgOverrides) String() string {
|
||||
pairs := []string{}
|
||||
for k, v := range *i {
|
||||
pairs = append(pairs, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
return strings.Join(pairs, ",")
|
||||
}
|
||||
|
||||
func (i *cfgOverrides) int(k string, def int) int {
|
||||
s, present := (*i)[k]
|
||||
if !present {
|
||||
return def
|
||||
}
|
||||
n, err := strconv.Atoi(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func runTests(terminateChan chan any, log *lib.Logger, overrides *cfgOverrides, shuckleAddress string, mountPoint string, fuseMountPoint string, kmod bool, short bool, filter *regexp.Regexp) {
|
||||
defer func() { handleRecover(log, terminateChan, recover()) }()
|
||||
client, err := lib.NewClient(log, shuckleAddress, nil, nil, nil)
|
||||
defer client.Close()
|
||||
|
||||
defaultSpanPolicy := &msgs.SpanPolicy{}
|
||||
defaultStripePolicy := &msgs.StripePolicy{}
|
||||
{
|
||||
client, err := lib.NewClient(log, shuckleAddress, nil, nil, nil)
|
||||
// We want to immediately clean up everything when we run the GC manually
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
@@ -138,6 +194,44 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, snapshotPolicy); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
dirInfoCache := lib.NewDirInfoCache()
|
||||
_, err = client.ResolveDirectoryInfoEntry(log, dirInfoCache, msgs.ROOT_DIR_INODE_ID, defaultSpanPolicy)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
// We also want to stimulate many spans while not writing huge files.
|
||||
// This is the same as the default but the second and third policy are
|
||||
/// 500KiB and 1MiB
|
||||
spanPolicy := &msgs.SpanPolicy{
|
||||
Entries: []msgs.SpanPolicyEntry{
|
||||
{
|
||||
MaxSize: 1 << 16,
|
||||
Parity: rs.MkParity(1, 4),
|
||||
},
|
||||
{
|
||||
MaxSize: 500 << 10,
|
||||
Parity: rs.MkParity(4, 4),
|
||||
},
|
||||
{
|
||||
MaxSize: 1 << 20,
|
||||
Parity: rs.MkParity(10, 4),
|
||||
},
|
||||
},
|
||||
}
|
||||
// Also reduce stripe size to get many stripes
|
||||
_, err = client.ResolveDirectoryInfoEntry(log, dirInfoCache, msgs.ROOT_DIR_INODE_ID, defaultStripePolicy)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, spanPolicy); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
stripePolicy := &msgs.StripePolicy{
|
||||
TargetStripeSize: 4096 * 5,
|
||||
}
|
||||
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, stripePolicy); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
fileHistoryOpts := fileHistoryTestOpts{
|
||||
@@ -162,21 +256,25 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
)
|
||||
|
||||
fsTestOpts := fsTestOpts{
|
||||
numDirs: 1 * 1000, // we need at least 256 directories, to have at least one dir per shard
|
||||
numFiles: 20 * 1000, // around 20 files per dir
|
||||
depth: 4,
|
||||
// 0.03 * 20*1000 * 5MiB = ~3GiB of file data
|
||||
emptyFileProb: 0.8,
|
||||
inlineFileProb: 0.17,
|
||||
maxFileSize: 10 << 20, // 10MiB
|
||||
spanSize: 1 << 20, // 1MiB
|
||||
depth: 4,
|
||||
maxFileSize: overrides.int("fsTest.maxFileSize", 10<<20), // 10MiB
|
||||
spanSize: 1 << 20, // 1MiB
|
||||
checkThreads: overrides.int("fsTest.checkThreads", 5),
|
||||
}
|
||||
|
||||
// 0.03 * 20*1000 * 5MiB = ~3GiB of file data
|
||||
if short {
|
||||
fsTestOpts.numDirs = 200
|
||||
fsTestOpts.numFiles = 10 * 200
|
||||
fsTestOpts.numDirs = overrides.int("fsTest.numDirs", 200)
|
||||
fsTestOpts.numFiles = overrides.int("fsTest.numFiles", 10*200)
|
||||
fsTestOpts.emptyFileProb = 0.1
|
||||
fsTestOpts.inlineFileProb = 0.3
|
||||
} else {
|
||||
fsTestOpts.numDirs = overrides.int("fsTest.numDirs", 1*1000) // we need at least 256 directories, to have at least one dir per shard
|
||||
fsTestOpts.numFiles = overrides.int("fsTest.numFiles", 20*1000) // around 20 files per dir
|
||||
fsTestOpts.emptyFileProb = 0.8
|
||||
fsTestOpts.inlineFileProb = 0.17
|
||||
}
|
||||
|
||||
runTest(
|
||||
log,
|
||||
shuckleAddress,
|
||||
@@ -188,17 +286,65 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
},
|
||||
)
|
||||
|
||||
if kmod {
|
||||
originalDirRefreshTime := getKmodDirRefreshTime()
|
||||
defer setKmodDirRefreshTime(originalDirRefreshTime)
|
||||
|
||||
setKmodDirRefreshTime(0)
|
||||
}
|
||||
|
||||
if kmod {
|
||||
preadddirOpts := preadddirOpts{
|
||||
numDirs: 1000,
|
||||
filesPerDir: 100,
|
||||
loops: 1000,
|
||||
threads: 10,
|
||||
}
|
||||
runTest(
|
||||
log,
|
||||
shuckleAddress,
|
||||
filter,
|
||||
"parallel readdir",
|
||||
fmt.Sprintf("%v dirs, %v files per dir, %v loops, %v threads", preadddirOpts.numDirs, preadddirOpts.filesPerDir, preadddirOpts.loops, preadddirOpts.threads),
|
||||
func(counters *lib.ClientCounters) {
|
||||
preaddirTest(log, mountPoint, &preadddirOpts)
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
runTest(
|
||||
log,
|
||||
shuckleAddress,
|
||||
filter,
|
||||
"fuse fs",
|
||||
"mounted fs",
|
||||
fmt.Sprintf("%v dirs, %v files, %v depth", fsTestOpts.numDirs, fsTestOpts.numFiles, fsTestOpts.depth),
|
||||
func(counters *lib.ClientCounters) {
|
||||
fsTest(log, shuckleAddress, &fsTestOpts, counters, fuseMountPoint)
|
||||
fsTest(log, shuckleAddress, &fsTestOpts, counters, mountPoint)
|
||||
},
|
||||
)
|
||||
|
||||
if kmod {
|
||||
setKmodDirRefreshTime(1000) // 1 sec
|
||||
runTest(
|
||||
log,
|
||||
shuckleAddress,
|
||||
filter,
|
||||
"mounted fs (1s dir refresh)",
|
||||
fmt.Sprintf("%v dirs, %v files, %v depth", fsTestOpts.numDirs, fsTestOpts.numFiles, fsTestOpts.depth),
|
||||
func(counters *lib.ClientCounters) {
|
||||
fsTest(log, shuckleAddress, &fsTestOpts, counters, mountPoint)
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
// Restore default values for policies, otherwise things will be unduly slow below
|
||||
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, defaultSpanPolicy); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := client.MergeDirectoryInfo(log, msgs.ROOT_DIR_INODE_ID, defaultStripePolicy); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
largeFileOpts := largeFileTestOpts{
|
||||
fileSize: 1 << 30, // 1GiB
|
||||
}
|
||||
@@ -209,7 +355,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
"large file",
|
||||
fmt.Sprintf("%vGB", float64(largeFileOpts.fileSize)/1e9),
|
||||
func(counters *lib.ClientCounters) {
|
||||
largeFileTest(log, &largeFileOpts, fuseMountPoint)
|
||||
largeFileTest(log, &largeFileOpts, mountPoint)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -225,7 +371,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
"rsync large files",
|
||||
fmt.Sprintf("%v files, %v dirs, %vMB file size", rsyncOpts.numFiles, rsyncOpts.numDirs, float64(rsyncOpts.maxFileSize)/1e6),
|
||||
func(counters *lib.ClientCounters) {
|
||||
rsyncTest(log, &rsyncOpts, fuseMountPoint)
|
||||
rsyncTest(log, &rsyncOpts, mountPoint)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -241,7 +387,7 @@ func runTests(terminateChan chan any, log *lib.Logger, shuckleAddress string, fu
|
||||
"rsync small files",
|
||||
fmt.Sprintf("%v files, %v dirs, %vMB file size", rsyncOpts.numFiles, rsyncOpts.numDirs, float64(rsyncOpts.maxFileSize)/1e6),
|
||||
func(counters *lib.ClientCounters) {
|
||||
rsyncTest(log, &rsyncOpts, fuseMountPoint)
|
||||
rsyncTest(log, &rsyncOpts, mountPoint)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -256,6 +402,8 @@ func noRunawayArgs() {
|
||||
}
|
||||
|
||||
func main() {
|
||||
overrides := make(cfgOverrides)
|
||||
|
||||
buildType := flag.String("build-type", "alpine", "C++ build type, one of alpine/release/debug/sanitized/valgrind")
|
||||
verbose := flag.Bool("verbose", false, "")
|
||||
trace := flag.Bool("trace", false, "")
|
||||
@@ -269,6 +417,8 @@ func main() {
|
||||
short := flag.Bool("short", false, "Run a shorter version of the tests (useful with packet drop flags)")
|
||||
repoDir := flag.String("repo-dir", "", "Used to build C++/Go binaries. If not provided, the path will be derived form the filename at build time (so will only work locally).")
|
||||
binariesDir := flag.String("binaries-dir", "", "If provided, nothing will be built, instead it'll be assumed that the binaries will be in the specified directory.")
|
||||
kmod := flag.Bool("kmod", false, "Whether to mount with the kernel module, rather than FUSE. Note that the tests will not attempt to run the kernel module and load it, they'll just mount with 'mount -t eggsfs'.")
|
||||
flag.Var(&overrides, "cfg", "Config overrides")
|
||||
flag.Parse()
|
||||
noRunawayArgs()
|
||||
|
||||
@@ -438,11 +588,32 @@ func main() {
|
||||
Profile: *profile,
|
||||
})
|
||||
|
||||
var mountPoint string
|
||||
if *kmod {
|
||||
mountPoint = path.Join(*dataDir, "kmod", "mnt")
|
||||
err := os.MkdirAll(mountPoint, 0777)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
out, err := exec.Command("sudo", "mount", "-t", "eggsfs", fmt.Sprintf("127.0.0.1:%v", shucklePort), mountPoint).CombinedOutput()
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("could not mount filesystem (%w): %s", err, out))
|
||||
}
|
||||
defer func() {
|
||||
out, err := exec.Command("sudo", "umount", mountPoint).CombinedOutput()
|
||||
if err != nil {
|
||||
fmt.Printf("could not umount fs (%v): %s", err, out)
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
mountPoint = fuseMountPoint
|
||||
}
|
||||
|
||||
fmt.Printf("operational 🤖\n")
|
||||
|
||||
// start tests
|
||||
go func() {
|
||||
runTests(terminateChan, log, shuckleAddress, fuseMountPoint, *short, filterRe)
|
||||
runTests(terminateChan, log, &overrides, shuckleAddress, mountPoint, fuseMountPoint, *kmod, *short, filterRe)
|
||||
}()
|
||||
|
||||
// wait for things to finish
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"strconv"
|
||||
"xtx/eggsfs/lib"
|
||||
)
|
||||
|
||||
type preadddirOpts struct {
|
||||
filesPerDir int
|
||||
loops int
|
||||
threads int
|
||||
numDirs int
|
||||
}
|
||||
|
||||
func preaddirCheck(
|
||||
log *lib.Logger,
|
||||
thread int,
|
||||
mountPoint string,
|
||||
opts *preadddirOpts,
|
||||
) []string {
|
||||
errors := []string{}
|
||||
for i := 0; i < opts.numDirs; i++ {
|
||||
dpath := path.Join(mountPoint, strconv.Itoa(i))
|
||||
entries, err := os.ReadDir(dpath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
found := make([]bool, opts.filesPerDir)
|
||||
for _, entry := range entries {
|
||||
i, err := strconv.Atoi(entry.Name())
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
found[i] = true
|
||||
}
|
||||
for i, b := range found {
|
||||
if !b {
|
||||
err := fmt.Sprintf("[%v] %v/%v", thread, dpath, i)
|
||||
log.Error(err)
|
||||
errors = append(errors, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return errors
|
||||
}
|
||||
|
||||
func preaddirTest(
|
||||
log *lib.Logger,
|
||||
mountPoint string,
|
||||
opts *preadddirOpts,
|
||||
) {
|
||||
for i := 0; i < opts.numDirs; i++ {
|
||||
dpath := path.Join(mountPoint, strconv.Itoa(i))
|
||||
log.Debug("creating directory %v", dpath)
|
||||
if err := os.Mkdir(dpath, 0777); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
for i := 0; i < opts.filesPerDir; i++ {
|
||||
fpath := path.Join(dpath, strconv.Itoa(i))
|
||||
log.Debug("creating file %v", fpath)
|
||||
f, err := os.Create(fpath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
errsChans := make([](chan any), opts.threads)
|
||||
errors := make([][]string, opts.threads)
|
||||
for i := 0; i < opts.threads; i++ {
|
||||
errsChans[i] = make(chan any)
|
||||
go func(thread int) {
|
||||
defer func() { errsChans[thread] <- recover() }()
|
||||
errors[thread] = preaddirCheck(log, thread, mountPoint, opts)
|
||||
}(i)
|
||||
}
|
||||
log.Info("waiting for threads")
|
||||
for i := 0; i < opts.threads; i++ {
|
||||
log.Debug("reading from %v", i)
|
||||
err := <-errsChans[i]
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("checking thread %v failed: %v", i, err))
|
||||
}
|
||||
}
|
||||
log.Info("checking errors")
|
||||
var flatErrors string
|
||||
for i := range errors {
|
||||
for j := range errors[i] {
|
||||
if flatErrors == "" {
|
||||
flatErrors = errors[i][j]
|
||||
} else {
|
||||
flatErrors = flatErrors + ", " + errors[i][j]
|
||||
}
|
||||
}
|
||||
}
|
||||
if flatErrors != "" {
|
||||
panic(fmt.Sprintf("missing %s", flatErrors))
|
||||
}
|
||||
}
|
||||
+25
-16
@@ -1,8 +1,8 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"xtx/eggsfs/crc32c"
|
||||
"xtx/eggsfs/lib"
|
||||
"xtx/eggsfs/msgs"
|
||||
@@ -65,7 +65,7 @@ func createFile(
|
||||
name string,
|
||||
size uint64,
|
||||
dataSeed uint64,
|
||||
buf *[]byte,
|
||||
bufPool *lib.ReadSpanBufPool,
|
||||
) (id msgs.InodeId, creationTime msgs.EggsTime) {
|
||||
// construct
|
||||
constructReq := msgs.ConstructFileReq{
|
||||
@@ -94,13 +94,11 @@ func createFile(
|
||||
if uint32(size-offset) < thisSpanSize {
|
||||
thisSpanSize = uint32(size - offset)
|
||||
}
|
||||
if len(*buf) < int(thisSpanSize) {
|
||||
*buf = append(*buf, make([]byte, int(thisSpanSize)-len(*buf))...)
|
||||
}
|
||||
spanBuf := (*buf)[:thisSpanSize]
|
||||
rand.Read(spanBuf)
|
||||
spanBuf := bufPool.Get(int(thisSpanSize))
|
||||
rand.Read(*spanBuf)
|
||||
var err error
|
||||
*buf, err = client.CreateSpan(log, []msgs.BlockServiceId{}, &spanPolicy, &blockPolicies, &stripePolicy, constructResp.Id, constructResp.Cookie, offset, uint32(thisSpanSize), spanBuf)
|
||||
*spanBuf, err = client.CreateSpan(log, []msgs.BlockServiceId{}, &spanPolicy, &blockPolicies, &stripePolicy, constructResp.Id, constructResp.Cookie, offset, uint32(thisSpanSize), *spanBuf)
|
||||
bufPool.Put(spanBuf)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
@@ -118,11 +116,11 @@ func createFile(
|
||||
return constructResp.Id, linkResp.CreationTime
|
||||
}
|
||||
|
||||
func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client, id msgs.InodeId, buf *[]byte) []byte {
|
||||
func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client, id msgs.InodeId, buf []byte) []byte {
|
||||
statResp := msgs.StatFileResp{}
|
||||
shardReq(log, client, id.Shard(), &msgs.StatFileReq{Id: id}, &statResp)
|
||||
if len(*buf) < int(statResp.Size) {
|
||||
*buf = append(*buf, make([]byte, int(statResp.Size)-len(*buf))...)
|
||||
if len(buf) < int(statResp.Size) {
|
||||
buf = append(buf, make([]byte, int(statResp.Size)-len(buf))...)
|
||||
}
|
||||
spansReq := msgs.FileSpansReq{
|
||||
FileId: id,
|
||||
@@ -138,16 +136,27 @@ func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client,
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
read, err := bytes.NewBuffer((*buf)[readSoFar:readSoFar]).ReadFrom(spanR)
|
||||
spanR.Close()
|
||||
spanRead := 0
|
||||
for {
|
||||
r, err := spanR.Read(buf[readSoFar+spanRead:])
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
spanR.Close()
|
||||
panic(err)
|
||||
}
|
||||
spanRead += r
|
||||
}
|
||||
spanR.Close() // TODO replace with Put?
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
spanCrc := msgs.Crc(crc32c.Sum(0, (*buf)[readSoFar:readSoFar+int(read)]))
|
||||
spanCrc := msgs.Crc(crc32c.Sum(0, buf[readSoFar:readSoFar+spanRead]))
|
||||
if spanCrc != span.Header.Crc {
|
||||
panic(fmt.Errorf("expected crc %v for span, but got %v", span.Header.Crc, spanCrc))
|
||||
}
|
||||
readSoFar += int(read)
|
||||
readSoFar += spanRead
|
||||
}
|
||||
if spansResp.NextOffset == 0 {
|
||||
break
|
||||
@@ -157,7 +166,7 @@ func readFile(log *lib.Logger, bufPool *lib.ReadSpanBufPool, client *lib.Client,
|
||||
if readSoFar != int(statResp.Size) {
|
||||
panic(fmt.Errorf("readSoFar=%v != statResp.Size=%v", readSoFar, statResp.Size))
|
||||
}
|
||||
return (*buf)[:readSoFar]
|
||||
return buf[:readSoFar]
|
||||
}
|
||||
|
||||
func readDir(log *lib.Logger, client *lib.Client, dir msgs.InodeId) []edge {
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/cipher"
|
||||
"encoding/binary"
|
||||
"xtx/eggsfs/msgs"
|
||||
)
|
||||
|
||||
func BlockWriteCertificate(cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) [8]byte {
|
||||
w := bytes.NewBuffer([]byte{})
|
||||
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
|
||||
w.Write([]byte{'w'})
|
||||
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
|
||||
binary.Write(w, binary.LittleEndian, uint32(req.Crc))
|
||||
binary.Write(w, binary.LittleEndian, uint32(req.Size))
|
||||
return CBCMAC(cipher, w.Bytes())
|
||||
}
|
||||
|
||||
func CheckBlockWriteCertificate(cipher cipher.Block, blockServiceId msgs.BlockServiceId, req *msgs.WriteBlockReq) ([8]byte, bool) {
|
||||
expectedMac := BlockWriteCertificate(cipher, blockServiceId, req)
|
||||
return expectedMac, expectedMac == req.Certificate
|
||||
}
|
||||
|
||||
func CheckBlockEraseCertificate(blockServiceId msgs.BlockServiceId, cipher cipher.Block, req *msgs.EraseBlockReq) ([8]byte, bool) {
|
||||
// compute mac
|
||||
w := bytes.NewBuffer([]byte{})
|
||||
binary.Write(w, binary.LittleEndian, uint64(blockServiceId))
|
||||
w.Write([]byte{'e'})
|
||||
binary.Write(w, binary.LittleEndian, uint64(req.BlockId))
|
||||
expectedMac := CBCMAC(cipher, w.Bytes())
|
||||
return expectedMac, expectedMac == req.Certificate
|
||||
}
|
||||
@@ -135,6 +135,7 @@ func ReadBlocksResponse(
|
||||
}
|
||||
|
||||
func WriteBlocksResponse(log *Logger, w io.Writer, resp msgs.BlocksResponse) error {
|
||||
log.Trace("writing response %T %+v", resp, resp)
|
||||
if err := binary.Write(w, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -148,6 +149,7 @@ func WriteBlocksResponse(log *Logger, w io.Writer, resp msgs.BlocksResponse) err
|
||||
}
|
||||
|
||||
func WriteBlocksResponseError(log *Logger, w io.Writer, err msgs.ErrCode) error {
|
||||
log.Debug("writing blocks error %v", err)
|
||||
if err := binary.Write(w, binary.LittleEndian, msgs.BLOCKS_RESP_PROTOCOL_VERSION); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
+18
-32
@@ -530,18 +530,22 @@ func (client *Client) ResolvePath(log *Logger, path string) (msgs.InodeId, error
|
||||
return id, nil
|
||||
}
|
||||
|
||||
type Taintable interface {
|
||||
Taint()
|
||||
// Some connection we can reuse
|
||||
type Puttable interface {
|
||||
// Note: this can only be called if the current request (usually for blocks)
|
||||
// has been _fully consumed without errors_. If it hasn't, then you should
|
||||
// just close the connection.
|
||||
Put()
|
||||
}
|
||||
|
||||
type TaintableReadCloser interface {
|
||||
type PuttableReadCloser interface {
|
||||
io.ReadCloser
|
||||
Taintable
|
||||
Puttable
|
||||
}
|
||||
|
||||
type TaintableCloser interface {
|
||||
type PuttableCloser interface {
|
||||
io.Closer
|
||||
Taintable
|
||||
Puttable
|
||||
}
|
||||
|
||||
type BlocksConn interface {
|
||||
@@ -549,51 +553,34 @@ type BlocksConn interface {
|
||||
io.Reader
|
||||
io.ReaderFrom
|
||||
io.Closer
|
||||
Taintable
|
||||
Puttable
|
||||
}
|
||||
|
||||
type trackedBlocksConn struct {
|
||||
blockService msgs.BlockServiceId
|
||||
conn *net.TCPConn
|
||||
tainted bool
|
||||
factory *blocksConnFactory
|
||||
}
|
||||
|
||||
func (c *trackedBlocksConn) Read(p []byte) (int, error) {
|
||||
read, err := c.conn.Read(p)
|
||||
if err != nil {
|
||||
c.tainted = true
|
||||
}
|
||||
return read, err
|
||||
return c.conn.Read(p)
|
||||
}
|
||||
|
||||
func (c *trackedBlocksConn) Write(p []byte) (int, error) {
|
||||
written, err := c.conn.Write(p)
|
||||
if err != nil {
|
||||
c.tainted = true
|
||||
}
|
||||
return written, err
|
||||
return c.conn.Write(p)
|
||||
}
|
||||
|
||||
func (c *trackedBlocksConn) ReadFrom(r io.Reader) (int64, error) {
|
||||
n, err := c.conn.ReadFrom(r)
|
||||
if err != nil {
|
||||
c.tainted = true
|
||||
}
|
||||
return n, err
|
||||
return c.conn.ReadFrom(r)
|
||||
}
|
||||
|
||||
func (c *trackedBlocksConn) Close() error {
|
||||
if c.tainted {
|
||||
return c.conn.Close()
|
||||
} else {
|
||||
c.factory.put(c.blockService, time.Now(), c.conn)
|
||||
return nil
|
||||
}
|
||||
return c.conn.Close()
|
||||
}
|
||||
|
||||
func (c *trackedBlocksConn) Taint() {
|
||||
c.tainted = true
|
||||
func (c *trackedBlocksConn) Put() {
|
||||
c.factory.put(c.blockService, time.Now(), c.conn)
|
||||
c.conn = nil
|
||||
}
|
||||
|
||||
// The first ip1/port1 cannot be zeroed, the second one can. One of them
|
||||
@@ -614,7 +601,6 @@ func (c *Client) GetBlocksConn(log *Logger, blockServiceId msgs.BlockServiceId,
|
||||
return &trackedBlocksConn{
|
||||
blockService: blockServiceId,
|
||||
conn: conn.(*net.TCPConn),
|
||||
tainted: false,
|
||||
factory: &c.blocksConns,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -17,10 +17,10 @@ func ReadShuckleRequest(
|
||||
) (msgs.ShuckleRequest, error) {
|
||||
var protocol uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &protocol); err != nil {
|
||||
return nil, fmt.Errorf("could not read protocol: %w", err)
|
||||
return nil, err
|
||||
}
|
||||
if protocol != msgs.SHUCKLE_REQ_PROTOCOL_VERSION {
|
||||
return nil, fmt.Errorf("bad shuckle protocol, expected %v, got %v", msgs.SHUCKLE_REQ_PROTOCOL_VERSION, protocol)
|
||||
return nil, fmt.Errorf("bad shuckle protocol, expected %08x, got %08x", msgs.SHUCKLE_REQ_PROTOCOL_VERSION, protocol)
|
||||
}
|
||||
var len uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &len); err != nil {
|
||||
@@ -28,7 +28,7 @@ func ReadShuckleRequest(
|
||||
}
|
||||
data := make([]byte, len)
|
||||
if _, err := io.ReadFull(r, data); err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("could not read response body: %w", err)
|
||||
}
|
||||
kind := msgs.ShuckleMessageKind(data[0])
|
||||
var req msgs.ShuckleRequest
|
||||
|
||||
+37
-33
@@ -74,6 +74,8 @@ func ensureLen(buf []byte, l int) []byte {
|
||||
return buf
|
||||
}
|
||||
|
||||
const EGGSFS_PAGE_SIZE int = 4096
|
||||
|
||||
func prepareSpanInitiateReq(
|
||||
blacklist []msgs.BlockServiceId,
|
||||
spanPolicies *msgs.SpanPolicy,
|
||||
@@ -104,6 +106,8 @@ func prepareSpanInitiateReq(
|
||||
B := spanPolicy.Parity.Blocks()
|
||||
blockSize := (len(data) + D - 1) / D
|
||||
cellSize := (blockSize + S - 1) / S
|
||||
// Round up cell to page size
|
||||
cellSize = EGGSFS_PAGE_SIZE * ((cellSize + EGGSFS_PAGE_SIZE - 1) / EGGSFS_PAGE_SIZE)
|
||||
blockSize = cellSize * S
|
||||
storageClass := blockPolicies.Pick(uint32(blockSize)).StorageClass
|
||||
|
||||
@@ -341,7 +345,7 @@ func (c *Client) CreateFile(
|
||||
type mirroredSpanReader struct {
|
||||
cursor int
|
||||
block int
|
||||
blockConn TaintableReadCloser
|
||||
blockConn PuttableReadCloser
|
||||
cellBuf *[]byte
|
||||
cellCrcs []msgs.Crc // starting from the _next_ stripe crc
|
||||
}
|
||||
@@ -350,15 +354,15 @@ func (r *mirroredSpanReader) Close() error {
|
||||
return r.blockConn.Close()
|
||||
}
|
||||
|
||||
func (r *mirroredSpanReader) Taint() {
|
||||
r.blockConn.Taint()
|
||||
func (r *mirroredSpanReader) Put() {
|
||||
r.blockConn.Put()
|
||||
}
|
||||
|
||||
type rsNormalSpanReader struct {
|
||||
bufPool *ReadSpanBufPool
|
||||
cursor int
|
||||
haveBlocks []uint8 // which blocks are we fetching. most of the times it'll just be the data blocks
|
||||
blockConns []TaintableReadCloser
|
||||
blockConns []PuttableReadCloser
|
||||
blocksRunningCrcs []msgs.Crc
|
||||
stripeBuf *[]byte
|
||||
stripeCrcs []msgs.Crc // starting from the _next_ stripe CRC.
|
||||
@@ -366,9 +370,9 @@ type rsNormalSpanReader struct {
|
||||
}
|
||||
|
||||
func (sr *rsNormalSpanReader) Close() error {
|
||||
sr.bufPool.put(sr.stripeBuf)
|
||||
sr.bufPool.Put(sr.stripeBuf)
|
||||
for _, b := range sr.parityBuffers {
|
||||
sr.bufPool.put(b)
|
||||
sr.bufPool.Put(b)
|
||||
}
|
||||
var lastErr error
|
||||
for _, c := range sr.blockConns {
|
||||
@@ -380,9 +384,9 @@ func (sr *rsNormalSpanReader) Close() error {
|
||||
return lastErr
|
||||
}
|
||||
|
||||
func (sr *rsNormalSpanReader) Taint() {
|
||||
func (sr *rsNormalSpanReader) Put() {
|
||||
for _, c := range sr.blockConns {
|
||||
c.Taint()
|
||||
c.Put()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -398,12 +402,12 @@ type rsCorruptedSpanReader struct {
|
||||
|
||||
func (r *rsCorruptedSpanReader) Close() error {
|
||||
for _, b := range r.dataBlocks {
|
||||
r.bufPool.put(b)
|
||||
r.bufPool.Put(b)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*rsCorruptedSpanReader) Taint() {}
|
||||
func (*rsCorruptedSpanReader) Put() {}
|
||||
|
||||
type spanReader struct {
|
||||
bufPool *ReadSpanBufPool
|
||||
@@ -415,16 +419,16 @@ type spanReader struct {
|
||||
stripes uint8
|
||||
cellSize uint32
|
||||
blocksCrcs []msgs.Crc
|
||||
blockConn func(block int, offset uint32, size uint32) (TaintableReadCloser, error)
|
||||
r TaintableCloser
|
||||
blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error)
|
||||
r PuttableCloser
|
||||
}
|
||||
|
||||
func (sr *spanReader) Close() error {
|
||||
return sr.r.Close()
|
||||
}
|
||||
|
||||
func (sr *spanReader) Taint() {
|
||||
sr.r.Taint()
|
||||
func (sr *spanReader) Put() {
|
||||
sr.r.Put()
|
||||
}
|
||||
|
||||
func (sr *spanReader) repairCorruptedStripe(
|
||||
@@ -435,7 +439,7 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
parityData []*[]byte,
|
||||
haveBlocks []uint8, // the blocks we have been using so far
|
||||
haveBlocksCrc []msgs.Crc, // the CRCs so far for the blocks that we have
|
||||
haveBlocksConns []TaintableReadCloser, // the connections to the blocks we already have
|
||||
haveBlocksConns []PuttableReadCloser, // the connections to the blocks we already have
|
||||
) (*rsCorruptedSpanReader, error) {
|
||||
D := sr.parity.DataBlocks()
|
||||
B := sr.parity.Blocks()
|
||||
@@ -448,13 +452,13 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
for i := D; i < B; i++ {
|
||||
b := blocksData[i]
|
||||
if b != nil {
|
||||
bufPool.put(b)
|
||||
bufPool.Put(b)
|
||||
}
|
||||
}
|
||||
}()
|
||||
// make space for full sized remaining data blocks
|
||||
for b := 0; b < D; b++ {
|
||||
blocksData[b] = bufPool.get(remainingBlockSize)
|
||||
blocksData[b] = bufPool.Get(remainingBlockSize)
|
||||
}
|
||||
// load current stripe data into the blocks buffers
|
||||
{
|
||||
@@ -463,7 +467,7 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
if int(b) < D {
|
||||
copy(*blocksData[int(b)], (*stripeData)[int(b)*int(sr.cellSize):])
|
||||
} else {
|
||||
blocksData[b] = bufPool.get(remainingBlockSize)
|
||||
blocksData[b] = bufPool.Get(remainingBlockSize)
|
||||
copy(*blocksData[int(b)], *parityData[parityIx])
|
||||
parityIx++
|
||||
}
|
||||
@@ -499,7 +503,7 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
}
|
||||
// Find and download blocks to recover
|
||||
var tmpBuf *[]byte
|
||||
defer bufPool.put(tmpBuf)
|
||||
defer bufPool.Put(tmpBuf)
|
||||
for b := 0; b < B && numGoodBlocks < D; b++ {
|
||||
if badBlocks[b] || goodBlocks[b] { // we know this is bad
|
||||
continue
|
||||
@@ -516,7 +520,7 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
continue
|
||||
}
|
||||
if tmpBuf == nil {
|
||||
tmpBuf = bufPool.get(int(blockSize))
|
||||
tmpBuf = bufPool.Get(int(blockSize))
|
||||
}
|
||||
if _, err := io.ReadFull(conn, *tmpBuf); err != nil {
|
||||
conn.Close()
|
||||
@@ -528,7 +532,7 @@ func (sr *spanReader) repairCorruptedStripe(
|
||||
crc := crc32c.Sum(0, *tmpBuf)
|
||||
if crc == uint32(sr.blocksCrcs[b]) {
|
||||
// this is a good one
|
||||
blocksData[b] = bufPool.get(remainingBlockSize)
|
||||
blocksData[b] = bufPool.Get(remainingBlockSize)
|
||||
copy(*blocksData[b], (*tmpBuf)[blockStart:])
|
||||
goodBlocks[b] = true
|
||||
numGoodBlocks++
|
||||
@@ -822,8 +826,8 @@ func readSpanFromBlocks(
|
||||
//
|
||||
// We currently make the assumption that the connections that are available
|
||||
// at the beginning will be available throughout the duration of span reading.
|
||||
blockConn func(block int, offset uint32, size uint32) (TaintableReadCloser, error),
|
||||
) (TaintableReadCloser, error) {
|
||||
blockConn func(block int, offset uint32, size uint32) (PuttableReadCloser, error),
|
||||
) (PuttableReadCloser, error) {
|
||||
D := parity.DataBlocks()
|
||||
B := parity.Blocks()
|
||||
sr := spanReader{
|
||||
@@ -839,7 +843,7 @@ func readSpanFromBlocks(
|
||||
if D == 1 {
|
||||
mr := &mirroredSpanReader{
|
||||
cursor: 0,
|
||||
cellBuf: sr.bufPool.get(int(cellSize)),
|
||||
cellBuf: sr.bufPool.Get(int(cellSize)),
|
||||
cellCrcs: stripesCrc,
|
||||
}
|
||||
for b := 0; b < B; b++ {
|
||||
@@ -861,7 +865,7 @@ func readSpanFromBlocks(
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
conns := make([]TaintableReadCloser, 0)
|
||||
conns := make([]PuttableReadCloser, 0)
|
||||
haveBlocks := make([]uint8, 0)
|
||||
parityBuffers := []*[]byte{}
|
||||
for i := 0; i < B; i++ {
|
||||
@@ -875,7 +879,7 @@ func readSpanFromBlocks(
|
||||
conns = append(conns, conn)
|
||||
haveBlocks = append(haveBlocks, uint8(i))
|
||||
if i >= D {
|
||||
parityBuffers = append(parityBuffers, sr.bufPool.get(int(cellSize)))
|
||||
parityBuffers = append(parityBuffers, sr.bufPool.Get(int(cellSize)))
|
||||
}
|
||||
if len(haveBlocks) == D {
|
||||
break
|
||||
@@ -884,7 +888,7 @@ func readSpanFromBlocks(
|
||||
if len(haveBlocks) != D {
|
||||
return nil, fmt.Errorf("couldn't get enough block connections (need at least %v, got %v)", D, len(conns))
|
||||
}
|
||||
stripeBuf := sr.bufPool.get(int(cellSize) * D)
|
||||
stripeBuf := sr.bufPool.Get(int(cellSize) * D)
|
||||
rsr := &rsNormalSpanReader{
|
||||
bufPool: sr.bufPool,
|
||||
cursor: 0,
|
||||
@@ -931,7 +935,7 @@ func (*inlineSpanReader) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*inlineSpanReader) Taint() {}
|
||||
func (*inlineSpanReader) Put() {}
|
||||
|
||||
type ReadSpanBufPool struct {
|
||||
pool sync.Pool
|
||||
@@ -951,7 +955,7 @@ func NewReadSpanBufPool() *ReadSpanBufPool {
|
||||
|
||||
// This does _not_ zero the memory in the bufs -- i.e. there might
|
||||
// be garbage in it.
|
||||
func (pool *ReadSpanBufPool) get(l int) *[]byte {
|
||||
func (pool *ReadSpanBufPool) Get(l int) *[]byte {
|
||||
buf := pool.pool.Get().(*[]byte)
|
||||
if cap(*buf) >= l {
|
||||
*buf = (*buf)[:l]
|
||||
@@ -962,7 +966,7 @@ func (pool *ReadSpanBufPool) get(l int) *[]byte {
|
||||
return buf
|
||||
}
|
||||
|
||||
func (pool *ReadSpanBufPool) put(buf *[]byte) {
|
||||
func (pool *ReadSpanBufPool) Put(buf *[]byte) {
|
||||
if buf != nil {
|
||||
pool.pool.Put(buf)
|
||||
}
|
||||
@@ -974,8 +978,7 @@ func (c *Client) ReadSpan(
|
||||
blacklist []msgs.BlockServiceId,
|
||||
blockServices []msgs.BlockService,
|
||||
fetchedSpan *msgs.FetchedSpan,
|
||||
) (TaintableReadCloser, error) {
|
||||
log.DebugStack(1, "starting to read span")
|
||||
) (PuttableReadCloser, error) {
|
||||
if fetchedSpan.Header.StorageClass == msgs.INLINE_STORAGE {
|
||||
data := fetchedSpan.Body.(*msgs.FetchedInlineSpan).Body
|
||||
dataCrc := msgs.Crc(crc32c.Sum(0, data))
|
||||
@@ -991,11 +994,12 @@ func (c *Client) ReadSpan(
|
||||
}
|
||||
|
||||
body := fetchedSpan.Body.(*msgs.FetchedBlocksSpan)
|
||||
log.DebugStack(1, "span parity %v", body.Parity)
|
||||
blocksCrcs := make([]msgs.Crc, body.Parity.Blocks())
|
||||
for i := range blocksCrcs {
|
||||
blocksCrcs[i] = body.Blocks[i].Crc
|
||||
}
|
||||
blockConn := func(blockIx int, offset uint32, size uint32) (TaintableReadCloser, error) {
|
||||
blockConn := func(blockIx int, offset uint32, size uint32) (PuttableReadCloser, error) {
|
||||
log.DebugStack(1, "requested connection for block ix %v, offset %v, size %v", blockIx, offset, size)
|
||||
block := body.Blocks[blockIx]
|
||||
blockService := blockServices[block.BlockServiceIx]
|
||||
|
||||
+4
-2
@@ -37,7 +37,9 @@ func (c *mockedBlockConn) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*mockedBlockConn) Taint() {}
|
||||
func (c *mockedBlockConn) Put() {
|
||||
c.Close()
|
||||
}
|
||||
|
||||
func testSpan(
|
||||
t *testing.T,
|
||||
@@ -142,7 +144,7 @@ func testSpan(
|
||||
openBlockConns := int64(0)
|
||||
spanReader, err := readSpanFromBlocks(
|
||||
bufPool, sizeWithZeros, req.Crc, req.Parity, req.Stripes, req.CellSize, blocksCrcs, stripesCrcs,
|
||||
func(i int, offset uint32, size uint32) (TaintableReadCloser, error) {
|
||||
func(i int, offset uint32, size uint32) (PuttableReadCloser, error) {
|
||||
for _, b := range badConnections {
|
||||
if int(b) == i {
|
||||
return nil, nil
|
||||
|
||||
+6
-1
@@ -112,6 +112,10 @@ func (id *InodeId) UnmarshalJSON(b []byte) error {
|
||||
if err := json.Unmarshal(b, &ids); err != nil {
|
||||
return err
|
||||
}
|
||||
if ids == "ROOT" {
|
||||
*id = ROOT_DIR_INODE_ID
|
||||
return nil
|
||||
}
|
||||
idu, err := strconv.ParseUint(ids, 0, 63)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -506,6 +510,8 @@ type SoftUnlinkFileResp struct{}
|
||||
type FileSpansReq struct {
|
||||
FileId InodeId
|
||||
ByteOffset uint64
|
||||
// if 0, no limit.
|
||||
Limit uint32
|
||||
}
|
||||
|
||||
type FetchedBlock struct {
|
||||
@@ -838,7 +844,6 @@ type ExpireTransientFileResp struct{}
|
||||
type MakeDirectoryReq struct {
|
||||
OwnerId InodeId
|
||||
Name string
|
||||
Info DirectoryInfo
|
||||
}
|
||||
|
||||
type MakeDirectoryResp struct {
|
||||
|
||||
+543
-543
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,10 @@
|
||||
*.qcow2
|
||||
compile_commands.json
|
||||
*.o
|
||||
*.d
|
||||
*.cmd
|
||||
Module.symvers
|
||||
modules.order
|
||||
*.ko
|
||||
*.mod
|
||||
*.mod.c
|
||||
@@ -0,0 +1,8 @@
|
||||
To get a `compile_commands.json`:
|
||||
|
||||
* Call the kmod source dir (where this file is located) `$KMOD_DIR`
|
||||
* Download some version of the linux source (I used `linux-5.4.237` to be in line with Ubuntu 20.20), say `$LINUX_DIR`
|
||||
* `cd $LINUX_DIR && make LLVM=1 defconfig && make LLVM=1 -j`
|
||||
* `cd $KMOD_DIR && make KDIR=$LINUX_DIR LLVM=1 kmod`
|
||||
* `$LINUX_DIR/scripts/gen_compile_commands.py -d $KMOD_DIR` to generate `compile_commands.json`
|
||||
* Replace all occurrences of `"directory": $KMOD_DIR` with `"directory": $LINUX_DIR` in the generated `$KMOD_DIR/compile_commands.json`
|
||||
@@ -0,0 +1,43 @@
|
||||
KDIR ?= /data/home/fmazzol/src/eggsfs/kmod/linux-5.4.237/
|
||||
LLVM ?= 0
|
||||
|
||||
obj-m += eggsfs.o
|
||||
|
||||
eggsfs-objs += \
|
||||
dir.o \
|
||||
err.o \
|
||||
file.o \
|
||||
inode.o \
|
||||
kmod.o \
|
||||
metadata.o \
|
||||
namei.o \
|
||||
net.o \
|
||||
skb.o \
|
||||
super.o \
|
||||
sysctl.o \
|
||||
sysfs.o \
|
||||
trace.o \
|
||||
shuckle.o \
|
||||
block.o \
|
||||
file.o \
|
||||
rs.o \
|
||||
crc.o \
|
||||
span.o
|
||||
|
||||
EXTRA_CFLAGS = -I$(src) -g -DDEBUG -fdiagnostics-color=always -Wno-declaration-after-statement
|
||||
|
||||
export CF = -Wbitwise -Wcontext -Wcast_truncate -Wsparse-all -Wno-shadow -Wnosizeof-void -DCONFIG_SPARSE_RCU_POINTER
|
||||
|
||||
# C=1 builds with sparse
|
||||
kmod:
|
||||
make -C $(KDIR) M=$(PWD) C=1 modules
|
||||
|
||||
clean:
|
||||
make -C $(KDIR) M=$(PWD) clean
|
||||
rm -f bincode_tests
|
||||
|
||||
bincode_tests: bincode_tests.c bincodegen.h bincode.h
|
||||
clang -Wall -g -O2 -fsanitize=undefined,address,integer,function -fno-sanitize-recover=all bincode_tests.c -o bincode_tests
|
||||
|
||||
writefile: writefile.c
|
||||
gcc -Wall -g -O2 writefile.c -o writefile
|
||||
+143
@@ -0,0 +1,143 @@
|
||||
#ifndef _EGGSFS_BINCODE_H
|
||||
#define _EGGSFS_BINCODE_H
|
||||
|
||||
#ifndef EGGSFS_BINCODE_TESTS
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/bug.h>
|
||||
#include <asm/unaligned.h>
|
||||
#endif
|
||||
|
||||
struct eggsfs_bincode_put_ctx { char* start; char* cursor; char* end; };
|
||||
|
||||
struct eggsfs_bincode_get_ctx { char* buf; char* end; u16 err; };
|
||||
|
||||
struct eggsfs_bincode_bytes { char* buf; u8 len; };
|
||||
|
||||
#define eggsfs_bincode_get_finish_list_el(end) ((void)(end))
|
||||
|
||||
#include "bincodegen.h"
|
||||
|
||||
inline static void eggsfs_bincode_put_u64(struct eggsfs_bincode_put_ctx* ctx, u64 x) {
|
||||
BUG_ON(ctx->end - ctx->cursor < sizeof(x));
|
||||
put_unaligned_le64(x, ctx->cursor);
|
||||
ctx->cursor += sizeof(x);
|
||||
}
|
||||
inline static void eggsfs_bincode_put_u32(struct eggsfs_bincode_put_ctx* ctx, u32 x) {
|
||||
BUG_ON(ctx->end - ctx->cursor < sizeof(x));
|
||||
put_unaligned_le32(x, ctx->cursor);
|
||||
ctx->cursor += sizeof(x);
|
||||
}
|
||||
inline static void eggsfs_bincode_put_u16(struct eggsfs_bincode_put_ctx* ctx, u16 x) {
|
||||
BUG_ON(ctx->end - ctx->cursor < sizeof(x));
|
||||
put_unaligned_le16(x, ctx->cursor);
|
||||
ctx->cursor += sizeof(x);
|
||||
}
|
||||
inline static void eggsfs_bincode_put_u8(struct eggsfs_bincode_put_ctx* ctx, u8 x) {
|
||||
BUG_ON(ctx->end - ctx->cursor < sizeof(x));
|
||||
*(u8*)(ctx->cursor) = x;
|
||||
ctx->cursor += sizeof(x);
|
||||
}
|
||||
|
||||
inline static u64 eggsfs_bincode_get_u64(struct eggsfs_bincode_get_ctx* ctx) {
|
||||
if (likely(ctx->err == 0)) {
|
||||
if (unlikely(ctx->end - ctx->buf < sizeof(u64))) {
|
||||
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
|
||||
return 0;
|
||||
} else {
|
||||
u64 x = get_unaligned_le64(ctx->buf);
|
||||
ctx->buf += sizeof(x);
|
||||
return x;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
inline static u32 eggsfs_bincode_get_u32(struct eggsfs_bincode_get_ctx* ctx) {
|
||||
if (likely(ctx->err == 0)) {
|
||||
if (unlikely(ctx->end - ctx->buf < sizeof(u32))) {
|
||||
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
|
||||
return 0;
|
||||
} else {
|
||||
u32 x = get_unaligned_le32(ctx->buf);
|
||||
ctx->buf += sizeof(x);
|
||||
return x;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
inline static u16 eggsfs_bincode_get_u16(struct eggsfs_bincode_get_ctx* ctx) {
|
||||
if (likely(ctx->err == 0)) {
|
||||
if (unlikely(ctx->end - ctx->buf < sizeof(u16))) {
|
||||
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
|
||||
return 0;
|
||||
} else {
|
||||
u16 x = get_unaligned_le16(ctx->buf);
|
||||
ctx->buf += sizeof(x);
|
||||
return x;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
inline static u8 eggsfs_bincode_get_u8(struct eggsfs_bincode_get_ctx* ctx) {
|
||||
if (likely(ctx->err == 0)) {
|
||||
if (unlikely(ctx->end - ctx->buf < sizeof(u8))) {
|
||||
ctx->err = EGGSFS_ERR_MALFORMED_RESPONSE;
|
||||
return 0;
|
||||
} else {
|
||||
u8 x = *(u8*)(ctx->buf);
|
||||
ctx->buf += sizeof(x);
|
||||
return x;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// >>> format(struct.unpack('<I', b'SHA\0')[0], 'x')
|
||||
// '414853'
|
||||
static const u32 EGGSFS_SHARD_REQ_PROTOCOL_VERSION = 0x414853;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'SHA\1')[0], 'x')
|
||||
// '1414853'
|
||||
static const u32 EGGSFS_SHARD_RESP_PROTOCOL_VERSION = 0x1414853;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'CDC\0')[0], 'x')
|
||||
// '434443'
|
||||
static const u32 EGGSFS_CDC_REQ_PROTOCOL_VERSION = 0x434443;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'CDC\1')[0], 'x')
|
||||
// '1434443'
|
||||
static const u32 EGGSFS_CDC_RESP_PROTOCOL_VERSION = 0x1434443;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'SHU\0')[0], 'x')
|
||||
// '554853'
|
||||
static const u32 EGGSFS_SHUCKLE_REQ_PROTOCOL_VERSION = 0x554853;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'SHU\1')[0], 'x')
|
||||
// '1554853'
|
||||
static const u32 EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION = 0x1554853;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'BLO\0')[0], 'x')
|
||||
// '4f4c42'
|
||||
static const u32 EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION = 0x4f4c42;
|
||||
|
||||
// >>> format(struct.unpack('<I', b'BLO\1')[0], 'x')
|
||||
// '14f4c42'
|
||||
static const u32 EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION = 0x14f4c42;
|
||||
|
||||
static const u8 SPAN_POLICY_TAG = 2;
|
||||
static const u8 BLOCK_POLICY_TAG = 3;
|
||||
static const u8 STRIPE_POLICY_TAG = 4;
|
||||
|
||||
static const u8 EGGSFS_EMPTY_STORAGE = 0;
|
||||
static const u8 EGGSFS_INLINE_STORAGE = 1;
|
||||
static const u8 EGGSFS_HDD_STORAGE = 2;
|
||||
static const u8 EGGSFS_FLASH_STORAGE = 3;
|
||||
|
||||
#define EGGSFS_GET_EXTRA(x) (((x) & (1ull<<63)) >> 63)
|
||||
#define EGGSFS_GET_EXTRA_ID(x) ((x) & ~(1ull<<63))
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,163 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint16_t u16;
|
||||
typedef uint32_t u32;
|
||||
typedef uint64_t u64;
|
||||
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#define UDP_MTU 1472
|
||||
|
||||
#define BUG_ON(x) \
|
||||
if (unlikely(x)) { \
|
||||
fprintf(stderr, "bug: " #x); \
|
||||
fprintf(stderr, "\n"); \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
static inline u64 get_unaligned_le64(const void* p) {
|
||||
u64 x;
|
||||
memcpy(&x, p, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
static inline u32 get_unaligned_le32(const void* p) {
|
||||
u32 x;
|
||||
memcpy(&x, p, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
static inline u16 get_unaligned_le16(const void* p) {
|
||||
u16 x;
|
||||
memcpy(&x, p, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline u64 get_unaligned_be64(const void* p) {
|
||||
u64 x;
|
||||
memcpy(&x, p, sizeof(x));
|
||||
return __bswap_64(x);
|
||||
}
|
||||
static inline u32 get_unaligned_be32(const void* p) {
|
||||
u32 x;
|
||||
memcpy(&x, p, sizeof(x));
|
||||
return __bswap_32(x);
|
||||
}
|
||||
|
||||
static inline void put_unaligned_le64(u64 x, void* p) {
|
||||
memcpy(p, &x, sizeof(x));
|
||||
}
|
||||
static inline void put_unaligned_le32(u32 x, void* p) {
|
||||
memcpy(p, &x, sizeof(x));
|
||||
}
|
||||
static inline void put_unaligned_le16(u16 x, void* p) {
|
||||
memcpy(p, &x, sizeof(x));
|
||||
}
|
||||
|
||||
static inline void put_unaligned_be64(u64 x, void* p) {
|
||||
x = __bswap_64(x);
|
||||
memcpy(p, &x, sizeof(x));
|
||||
}
|
||||
static inline void put_unaligned_be32(u32 x, void* p) {
|
||||
x = __bswap_32(x);
|
||||
memcpy(p, &x, sizeof(x));
|
||||
}
|
||||
|
||||
#define EGGSFS_BINCODE_TESTS
|
||||
#include "bincode.h"
|
||||
|
||||
int main(void) {
|
||||
{
|
||||
char lookup_req[EGGSFS_LOOKUP_REQ_MAX_SIZE];
|
||||
struct eggsfs_bincode_put_ctx put_ctx = {
|
||||
.start = lookup_req,
|
||||
.cursor = lookup_req,
|
||||
.end = lookup_req + sizeof(lookup_req),
|
||||
};
|
||||
{
|
||||
eggsfs_lookup_req_put_start(&put_ctx, start);
|
||||
eggsfs_lookup_req_put_dir_id(&put_ctx, start, dir_id, 123);
|
||||
eggsfs_lookup_req_put_name(&put_ctx, dir_id, name, "foo", 3);
|
||||
eggsfs_lookup_req_put_end(&ctx, name, end);
|
||||
}
|
||||
struct eggsfs_bincode_get_ctx get_ctx = {
|
||||
.buf = put_ctx.start,
|
||||
.end = put_ctx.cursor,
|
||||
.err = 0,
|
||||
};
|
||||
{
|
||||
eggsfs_lookup_req_get_start(&get_ctx, start);
|
||||
eggsfs_lookup_req_get_dir_id(&get_ctx, start, dir_id);
|
||||
eggsfs_lookup_req_get_name(&get_ctx, dir_id, name);
|
||||
eggsfs_lookup_req_get_end(&get_ctx, name, end);
|
||||
eggsfs_lookup_req_get_finish(&get_ctx, end);
|
||||
BUG_ON(get_ctx.err != 0);
|
||||
BUG_ON(dir_id.x != 123);
|
||||
BUG_ON(name.str.len != 3);
|
||||
BUG_ON(strncmp("foo", name.str.buf, 3) != 0);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
char read_dir_resp[UDP_MTU];
|
||||
struct eggsfs_bincode_put_ctx put_ctx = {
|
||||
.start = read_dir_resp,
|
||||
.cursor = read_dir_resp,
|
||||
.end = read_dir_resp + sizeof(read_dir_resp),
|
||||
};
|
||||
{
|
||||
eggsfs_read_dir_resp_put_start(&put_ctx, start);
|
||||
eggsfs_read_dir_resp_put_next_hash(&put_ctx, start, next_hash, 123);
|
||||
eggsfs_read_dir_resp_put_results(&put_ctx, next_hash, results, 3);
|
||||
for (int i = 0; i < 3; i++) {
|
||||
eggsfs_current_edge_put_start(&put_ctx, edge_start);
|
||||
eggsfs_current_edge_put_target_id(&put_ctx, edge_start, target_id, i);
|
||||
eggsfs_current_edge_put_name_hash(&put_ctx, target_id, name_hash, i + 42);
|
||||
char name_str[1];
|
||||
snprintf(name_str, 1, "%d", i);
|
||||
eggsfs_current_edge_put_name(&put_ctx, name_hash, name, name_str, sizeof(name_str));
|
||||
eggsfs_current_edge_put_creation_time(&put_ctx, name, creation_time, i + 1000);
|
||||
eggsfs_current_edge_put_end(&put_ctx, creation_time, end);
|
||||
}
|
||||
eggsfs_read_dir_resp_put_end(&ctx, results, end);
|
||||
}
|
||||
struct eggsfs_bincode_get_ctx get_ctx = {
|
||||
.buf = put_ctx.start,
|
||||
.end = put_ctx.cursor,
|
||||
.err = 0,
|
||||
};
|
||||
{
|
||||
eggsfs_read_dir_resp_get_start(&get_ctx, start);
|
||||
eggsfs_read_dir_resp_get_next_hash(&get_ctx, start, next_hash);
|
||||
eggsfs_read_dir_resp_get_results(&get_ctx, next_hash, results);
|
||||
for (int i = 0; i < results.len; i++) {
|
||||
eggsfs_current_edge_get_start(&get_ctx, edge_start);
|
||||
eggsfs_current_edge_get_target_id(&get_ctx, edge_start, target_id);
|
||||
eggsfs_current_edge_get_name_hash(&get_ctx, target_id, name_hash);
|
||||
eggsfs_current_edge_get_name(&get_ctx, name_hash, name);
|
||||
eggsfs_current_edge_get_creation_time(&get_ctx, name, creation_time);
|
||||
eggsfs_current_edge_get_end(&put_ctx, creation_time, end);
|
||||
eggsfs_bincode_get_finish_list_el(end);
|
||||
if (get_ctx.err == 0) {
|
||||
BUG_ON(target_id.x != i);
|
||||
BUG_ON(name_hash.x != i + 42);
|
||||
BUG_ON(name.str.len != 1);
|
||||
char name_str[1];
|
||||
snprintf(name_str, 1, "%d", i);
|
||||
BUG_ON(strncmp(name.str.buf, name_str, 1) != 0);
|
||||
BUG_ON(creation_time.x != i + 1000);
|
||||
}
|
||||
}
|
||||
eggsfs_read_dir_resp_get_end(&get_ctx, results, end);
|
||||
eggsfs_read_dir_resp_get_finish(&get_ctx, end);
|
||||
BUG_ON(get_ctx.err != 0);
|
||||
BUG_ON(next_hash.x != 123);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
+5142
File diff suppressed because it is too large
Load Diff
+765
@@ -0,0 +1,765 @@
|
||||
// Plan for socket reclamation:
|
||||
// * Have a hashmap or anyway some sort of mapping from port/ip to socket
|
||||
// * Every n seconds in get_socket function purge inactive ones
|
||||
//
|
||||
// Plan for writing. At any given point, we are:
|
||||
// * Writing the current data blocks and the previous parity blocks.
|
||||
// * Computing the current parity blocks.
|
||||
// We can do this by just firing all the requests, start computing the
|
||||
// parity blocks, and then waiting on all the responses.
|
||||
// This should ideally keep us writing close to 100% of the time, with
|
||||
// a small tail of writing the parity blocks for the last span.
|
||||
#include <linux/inet.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
#include "block.h"
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
#include "skb.h"
|
||||
#include "super.h"
|
||||
#include "crc.h"
|
||||
#include "trace.h"
|
||||
|
||||
#define EGGSFS_BLOCKS_REQ_HEADER_SIZE (4 + 8 + 1) // protocol + block service id + kind
|
||||
|
||||
struct eggsfs_block_socket {
|
||||
struct socket* sock;
|
||||
atomic_t sock_err;
|
||||
struct list_head requests;
|
||||
spinlock_t requests_lock;
|
||||
struct sockaddr_in addr;
|
||||
// saved callbacks
|
||||
void (*saved_state_change)(struct sock *sk);
|
||||
void (*saved_data_ready)(struct sock *sk);
|
||||
void (*saved_write_space)(struct sock *sk);
|
||||
// only used for block writes
|
||||
struct work_struct write_work;
|
||||
};
|
||||
|
||||
static u64 WHICH_BLOCK_IP = 0;
|
||||
|
||||
static struct kmem_cache* eggsfs_fetch_block_request_cachep;
|
||||
static struct kmem_cache* eggsfs_write_block_request_cachep;
|
||||
|
||||
static void eggsfs_fetch_block_state_check(struct sock* sk) {
|
||||
// TODO check if connection has been closed before its time and error out
|
||||
// eggsfs_debug_print("socket state check triggered: %d", sk->sk_state);
|
||||
}
|
||||
|
||||
static void eggsfs_fetch_block_state_change(struct sock* sk) {
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
eggsfs_fetch_block_state_check(sk);
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
|
||||
// Returns a negative result if the socket has to be considered corrupted.
|
||||
// If a "recoverable" error occurs (e.g. the server just replies with an error)
|
||||
// `req->err` will be filled in and receiving can continue. If an unrecoverable
|
||||
// error occurs `req->err` will be filled in also, but it's expected that the
|
||||
// caller stops operating on this socket.
|
||||
static int eggsfs_fetch_block_receive_single_req(
|
||||
struct eggsfs_fetch_block_request* req,
|
||||
struct sk_buff* skb,
|
||||
unsigned int offset, size_t len0
|
||||
) {
|
||||
size_t len = len0;
|
||||
|
||||
#define HEADER_COPY(buf, count) ({ \
|
||||
int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
|
||||
offset += read; \
|
||||
len -= read; \
|
||||
req->header_read += read; \
|
||||
read; \
|
||||
})
|
||||
|
||||
// Header
|
||||
{
|
||||
int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->header_read;
|
||||
int read = HEADER_COPY(req->header.buf + req->header_read, header_left);
|
||||
header_left -= read;
|
||||
if (header_left > 0) { return len0-len; }
|
||||
}
|
||||
|
||||
// Protocol check
|
||||
if (unlikely(le32_to_cpu(req->header.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
|
||||
eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->header.data.protocol);
|
||||
return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
|
||||
}
|
||||
|
||||
// Error check
|
||||
if (unlikely(req->header.data.kind == 0)) {
|
||||
int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->header_read;
|
||||
int read = HEADER_COPY(req->header.buf + req->header_read, error_left);
|
||||
error_left -= read;
|
||||
if (error_left > 0) { return len0-len; }
|
||||
eggsfs_info_print("failed=%u", req->header.data.error);
|
||||
// we got an error "nicely", the socket can carry on living.
|
||||
req->err = eggsfs_error_to_linux(le16_to_cpu(req->header.data.error));
|
||||
return len0-len;
|
||||
}
|
||||
|
||||
#undef HEADER_COPY
|
||||
|
||||
// Actual data copy
|
||||
|
||||
BUG_ON(req->bytes_left == 0);
|
||||
|
||||
struct page* page = list_first_entry(&req->pages, struct page, lru);
|
||||
BUG_ON(!page);
|
||||
char* page_ptr = kmap_atomic(page);
|
||||
|
||||
struct skb_seq_state seq;
|
||||
skb_prepare_seq_read(skb, offset, offset + min(req->bytes_left, (u32)len), &seq);
|
||||
|
||||
u32 block_bytes_read = 0;
|
||||
while (block_bytes_read < req->bytes_left) {
|
||||
const u8* data;
|
||||
u32 avail = skb_seq_read(block_bytes_read, &data, &seq);
|
||||
if (avail == 0) { break; }
|
||||
|
||||
kernel_fpu_begin(); // crc
|
||||
while (avail) {
|
||||
u32 this_len = min3(avail, (u32)PAGE_SIZE - req->page_offset, req->bytes_left - block_bytes_read);
|
||||
req->crc = eggsfs_crc32c(req->crc, data, this_len);
|
||||
memcpy(page_ptr + req->page_offset, data, this_len);
|
||||
data += this_len;
|
||||
req->page_offset += this_len;
|
||||
avail -= this_len;
|
||||
block_bytes_read += this_len;
|
||||
|
||||
if (req->page_offset >= PAGE_SIZE) {
|
||||
BUG_ON(req->page_offset > PAGE_SIZE);
|
||||
|
||||
kunmap_atomic(page_ptr);
|
||||
list_rotate_left(&req->pages);
|
||||
page = list_first_entry(&req->pages, struct page, lru);
|
||||
page_ptr = kmap_atomic(page);
|
||||
req->page_offset = 0;
|
||||
}
|
||||
}
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
// We might have not terminated with avail == 0, but because we're done with this request
|
||||
skb_abort_seq_read(&seq);
|
||||
|
||||
req->bytes_left -= block_bytes_read;
|
||||
|
||||
// The last page is partially filled, zero it and return to first page
|
||||
if (!req->bytes_left && req->page_offset) {
|
||||
memset(page_ptr + req->page_offset, 0, PAGE_SIZE - req->page_offset);
|
||||
list_rotate_left(&req->pages);
|
||||
req->page_offset = 0;
|
||||
}
|
||||
|
||||
kunmap_atomic(page_ptr);
|
||||
|
||||
return (len0-len) + block_bytes_read;
|
||||
}
|
||||
|
||||
void eggsfs_put_fetch_block_socket(struct eggsfs_block_socket *socket) {
|
||||
// TODO terminate existing requests
|
||||
|
||||
read_lock(&socket->sock->sk->sk_callback_lock);
|
||||
socket->sock->sk->sk_data_ready = socket->saved_data_ready;
|
||||
socket->sock->sk->sk_state_change = socket->saved_state_change;
|
||||
socket->sock->sk->sk_write_space = socket->saved_write_space;
|
||||
read_unlock(&socket->sock->sk->sk_callback_lock);
|
||||
|
||||
sock_release(socket->sock);
|
||||
kfree(socket);
|
||||
}
|
||||
|
||||
static int eggsfs_fetch_block_receive(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
|
||||
struct eggsfs_block_socket* socket = rd_desc->arg.data;
|
||||
|
||||
int initial_len = len;
|
||||
int sock_err = atomic_read(&socket->sock_err);
|
||||
for (;;) {
|
||||
spin_lock_bh(&socket->requests_lock);
|
||||
struct eggsfs_fetch_block_request* req = list_empty(&socket->requests) ?
|
||||
NULL :
|
||||
list_first_entry(&socket->requests, struct eggsfs_fetch_block_request, socket_requests);
|
||||
spin_unlock_bh(&socket->requests_lock);
|
||||
if (req == NULL) { break; }
|
||||
int consumed = sock_err;
|
||||
if (consumed == 0) {
|
||||
consumed = eggsfs_fetch_block_receive_single_req(req, skb, offset, len);
|
||||
if (consumed < 0) {
|
||||
sock_err = consumed;
|
||||
} else {
|
||||
offset += consumed;
|
||||
len -= consumed;
|
||||
}
|
||||
} else {
|
||||
req->err = consumed;
|
||||
}
|
||||
if (req->err || req->bytes_left == 0) {
|
||||
spin_lock_bh(&socket->requests_lock);
|
||||
BUG_ON(atomic_read(&req->refcount) < 1);
|
||||
bool should_free = atomic_dec_and_test(&req->refcount);
|
||||
list_del(&req->socket_requests);
|
||||
spin_unlock_bh(&socket->requests_lock);
|
||||
complete_all(&req->comp);
|
||||
// Important to do the actual freeing after the completion: otherwise
|
||||
// there might be races between this and a consumer taking ownership
|
||||
// of the pages.
|
||||
if (should_free) {
|
||||
put_pages_list(&req->pages);
|
||||
kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (sock_err == 0) {
|
||||
BUG_ON(len != 0); // can't have leftovers
|
||||
}
|
||||
return initial_len;
|
||||
}
|
||||
|
||||
static void eggsfs_fetch_block_data_ready(struct sock* sk) {
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
read_descriptor_t rd_desc;
|
||||
// Taken from iscsi -- we set count to 1 because we want the network layer to
|
||||
// hand us all the skbs that are available.
|
||||
rd_desc.arg.data = sk->sk_user_data;
|
||||
rd_desc.count = 1;
|
||||
tcp_read_sock(sk, &rd_desc, eggsfs_fetch_block_receive);
|
||||
eggsfs_fetch_block_state_check(sk);
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
|
||||
struct eggsfs_block_socket* eggsfs_get_fetch_block_socket(struct eggsfs_block_service *block_service) {
|
||||
struct eggsfs_block_socket* socket = kzalloc(sizeof(struct eggsfs_block_socket), GFP_KERNEL);
|
||||
if (socket == NULL) { return ERR_PTR(-ENOMEM); }
|
||||
|
||||
int err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &socket->sock);
|
||||
if (err != 0) { goto out_err; }
|
||||
|
||||
atomic_set(&socket->sock_err, 0);
|
||||
|
||||
socket->saved_data_ready = socket->sock->sk->sk_data_ready;
|
||||
socket->saved_state_change = socket->sock->sk->sk_state_change;
|
||||
socket->saved_write_space = socket->sock->sk->sk_write_space;
|
||||
|
||||
// Try both ips (if we have them) before giving up
|
||||
int block_ip = WHICH_BLOCK_IP++;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < 1 + (block_service->port2 != 0); i++, block_ip++) { // we might not have a second address
|
||||
socket->addr.sin_family = AF_INET;
|
||||
if (block_service->port2 == 0 || block_ip & 1) {
|
||||
socket->addr.sin_addr.s_addr = htonl(block_service->ip1);
|
||||
socket->addr.sin_port = htons(block_service->port1);
|
||||
} else {
|
||||
socket->addr.sin_addr.s_addr = htonl(block_service->ip2);
|
||||
socket->addr.sin_port = htons(block_service->port2);
|
||||
}
|
||||
|
||||
// TODO we should probably use TFO, but it's annoying because our custom callbacks
|
||||
// break the connection, so we'd need to coordinate with that in a more annoying way.
|
||||
// Also this makes it easier to try both IPs immediately.
|
||||
err = kernel_connect(socket->sock, (struct sockaddr*)&socket->addr, sizeof(socket->addr), 0);
|
||||
if (err < 0) {
|
||||
eggsfs_warn_print("could not connect to block service at %pI4:%d: %d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port), err);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (err < 0) { goto out_err_sock; }
|
||||
|
||||
INIT_LIST_HEAD(&socket->requests);
|
||||
spin_lock_init(&socket->requests_lock);
|
||||
|
||||
// Important for the callbacks to happen after the connect, see TODO above.
|
||||
read_lock(&socket->sock->sk->sk_callback_lock);
|
||||
socket->sock->sk->sk_user_data = socket;
|
||||
socket->sock->sk->sk_data_ready = eggsfs_fetch_block_data_ready;
|
||||
socket->sock->sk->sk_state_change = eggsfs_fetch_block_state_change;
|
||||
read_unlock(&socket->sock->sk->sk_callback_lock);
|
||||
|
||||
return socket;
|
||||
|
||||
out_err_sock:
|
||||
sock_release(socket->sock);
|
||||
out_err:
|
||||
kfree(socket);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
struct eggsfs_fetch_block_request* eggsfs_fetch_block(
|
||||
struct eggsfs_block_socket* socket,
|
||||
u64 block_service_id,
|
||||
u64 block_id,
|
||||
u32 offset,
|
||||
u32 count
|
||||
) {
|
||||
int err;
|
||||
|
||||
struct eggsfs_fetch_block_request* req = kmem_cache_alloc(eggsfs_fetch_block_request_cachep, GFP_KERNEL);
|
||||
if (!req) { err = -ENOMEM; goto out_err; }
|
||||
|
||||
INIT_LIST_HEAD(&req->pages);
|
||||
int i;
|
||||
for (i = 0; i < (count + PAGE_SIZE - 1)/PAGE_SIZE; i++) {
|
||||
struct page* page = alloc_page(GFP_KERNEL);
|
||||
if (!page) { err = -ENOMEM; goto out_err_pages; }
|
||||
list_add_tail(&page->lru, &req->pages);
|
||||
}
|
||||
|
||||
req->bytes_left = count;
|
||||
req->page_offset = 0;
|
||||
req->header_read = 0;
|
||||
req->crc = 0;
|
||||
req->err = 0;
|
||||
// one for the socket, one for the caller
|
||||
atomic_set(&req->refcount, 2);
|
||||
reinit_completion(&req->comp);
|
||||
|
||||
spin_lock_bh(&socket->requests_lock);
|
||||
list_add_tail(&req->socket_requests, &socket->requests);
|
||||
spin_unlock_bh(&socket->requests_lock);
|
||||
|
||||
// fill in req
|
||||
char req_msg_buf[EGGSFS_BLOCKS_REQ_HEADER_SIZE + EGGSFS_FETCH_BLOCK_REQ_SIZE];
|
||||
char* req_msg = req_msg_buf;
|
||||
put_unaligned_le32(EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
|
||||
put_unaligned_le64(block_service_id, req_msg); req_msg += 8;
|
||||
*(u8*)req_msg = EGGSFS_BLOCKS_FETCH_BLOCK; req_msg += 1;
|
||||
{
|
||||
struct eggsfs_bincode_put_ctx ctx = {
|
||||
.start = req_msg,
|
||||
.cursor = req_msg,
|
||||
.end = req_msg_buf + sizeof(req_msg_buf),
|
||||
};
|
||||
eggsfs_fetch_block_req_put_start(&ctx, start);
|
||||
eggsfs_fetch_block_req_put_block_id(&ctx, start, req_block_id, block_id);
|
||||
eggsfs_fetch_block_req_put_offset(&ctx, req_block_id, req_offset, offset);
|
||||
eggsfs_fetch_block_req_put_count(&ctx, req_offset, req_count, count);
|
||||
eggsfs_fetch_block_req_put_end(ctx, req_count, end);
|
||||
BUG_ON(ctx.cursor != ctx.end);
|
||||
}
|
||||
|
||||
// send message
|
||||
struct msghdr msg = { NULL, };
|
||||
eggsfs_debug_print("sending fetch block req to %pI4:%d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
|
||||
struct kvec iov = {
|
||||
.iov_base = req_msg_buf,
|
||||
.iov_len = sizeof(req_msg_buf),
|
||||
};
|
||||
err = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
|
||||
if (err < 0) { goto out_err_pages; }
|
||||
|
||||
return req;
|
||||
|
||||
out_err_pages:
|
||||
put_pages_list(&req->pages);
|
||||
kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
|
||||
out_err:
|
||||
eggsfs_info_print("couldn't start fetch block request, err=%d", req->err);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
// Drops one reference to a fetch request. The last reference frees the
// pages still on the request's list and returns the request to the slab
// cache. Callers who took ownership of pages must have removed them from
// `req->pages` first (see the header comment on this function).
void eggsfs_put_fetch_block_request(struct eggsfs_fetch_block_request* req) {
    // A put with refcount < 1 means a double-put somewhere.
    BUG_ON(atomic_read(&req->refcount) < 1);
    if (atomic_dec_and_test(&req->refcount)) {
        put_pages_list(&req->pages);
        kmem_cache_free(eggsfs_fetch_block_request_cachep, req);
    }
}
|
||||
|
||||
// Inspects the TCP state of a write-block socket. Currently a stub: it does
// nothing yet (premature-close detection is a known TODO).
static void eggsfs_write_block_state_check(struct sock* sk) {
    // TODO check if connection has been closed before its time and error out
    // eggsfs_debug_print("socket state check triggered: %d", sk->sk_state);
}
|
||||
|
||||
// ->sk_state_change callback for write-block sockets. Takes the socket
// callback lock (BH-safe, since socket callbacks run in softirq context)
// and delegates to the state check.
static void eggsfs_write_block_state_change(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    eggsfs_write_block_state_check(sk);
    read_unlock_bh(&sk->sk_callback_lock);
}
|
||||
|
||||
// Sets `*v` to `i` and returns `i`, so a status can be stored and propagated
// as a return value in one expression (used by the receive path below).
static inline int atomic_set_return(atomic_t *v, int i) {
    atomic_set(v, i);
    return i;
}
|
||||
|
||||
// Feeds incoming socket data (from `skb` at `offset`, up to `len0` bytes)
// into the state machine for a single in-flight write request.
//
// A write exchanges two responses with the block service: a "write" response
// right after we send the write request, and a "written" response (carrying
// the proof) after the whole block has been streamed out. This function is
// resumable: partial reads are accumulated via req->write_resp_read /
// req->written_resp_read, and it returns how many bytes it consumed
// (len0-len) when it needs more data. On protocol errors it returns a
// negative errno; on a service-reported error or on completion it records
// the outcome in req->status (via atomic_set_return) and returns it.
static int eggsfs_write_block_receive_single_req(
    struct eggsfs_block_socket* socket,
    struct eggsfs_write_block_request* req,
    struct sk_buff* skb,
    unsigned int offset, size_t len0
) {
    size_t len = len0;

    // Write resp (after write request)

// Copies up to `count` bytes from the skb into `buf`, advancing the skb
// cursor and the request's write_resp_read counter. NOTE: statement
// expression with side effects on offset/len; `count` may be <= 0 when this
// stage was already fully read on a previous call.
#define HEADER_COPY(buf, count) ({ \
    int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
    offset += read; \
    len -= read; \
    req->write_resp_read += read; \
    read; \
})

    // Header
    {
        int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->write_resp_read;
        int read = HEADER_COPY(req->write_resp.buf + req->write_resp_read, header_left);
        header_left -= read;
        // Header still incomplete: consume what we got, wait for more data.
        if (header_left > 0) { return len0-len; }
    }

    // Protocol check
    if (unlikely(le32_to_cpu(req->write_resp.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
        eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->write_resp.data.protocol);
        return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
    }

    // Error check (kind == 0 means the service is reporting an error code)
    if (unlikely(req->write_resp.data.kind == 0)) {
        int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->write_resp_read;
        int read = HEADER_COPY(req->write_resp.buf + req->write_resp_read, error_left);
        error_left -= read;
        if (error_left > 0) { return len0-len; }
        eggsfs_info_print("failed=%u", req->write_resp.data.error);
        // We immediately start writing, so any error here means that the socket is kaput
        return atomic_set_return(&req->status, eggsfs_error_to_linux(le16_to_cpu(req->write_resp.data.error)));
    }

#undef HEADER_COPY

    // Written resp (after the block has been fully written)

// Same as above but accumulating into the second (written) response.
#define HEADER_COPY(buf, count) ({ \
    int read = count > 0 ? eggsfs_skb_copy(buf, skb, offset, count) : 0; \
    offset += read; \
    len -= read; \
    req->written_resp_read += read; \
    read; \
})

    // Header
    {
        int header_left = EGGSFS_BLOCKS_RESP_HEADER_SIZE - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, header_left);
        header_left -= read;
        if (header_left > 0) {
            return len0-len;
        }
    }

    // Protocol check
    if (unlikely(le32_to_cpu(req->written_resp.data.protocol) != EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
        eggsfs_info_print("bad blocks resp protocol, expected %*pE, got %*pE", 4, &EGGSFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->written_resp.data.protocol);
        return eggsfs_error_to_linux(EGGSFS_ERR_MALFORMED_RESPONSE);
    }

    // Error check
    if (unlikely(req->written_resp.data.kind == 0)) {
        int error_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, error_left);
        error_left -= read;
        if (error_left > 0) {
            return len0-len;
        }
        eggsfs_info_print("failed=%u", req->written_resp.data.error);
        // We immediately start writing, so any error here means that the socket is kaput
        // TODO here it might actually be fine, but better not risk it
        return atomic_set_return(&req->status, eggsfs_error_to_linux(le16_to_cpu(req->written_resp.data.error)));
    }

    // We can finally get the proof (8 bytes after the header)
    {
        int proof_left = (EGGSFS_BLOCKS_RESP_HEADER_SIZE + 8) - (int)req->written_resp_read;
        int read = HEADER_COPY(req->written_resp.buf + req->written_resp_read, proof_left);
        proof_left -= read;
        if (proof_left > 0) {
            return len0-len;
        }
    }

#undef HEADER_COPY

    // we can't get here unless we've fully written the block, the channel is broken
    // (bytes_left is published by the writer work with smp_store_release,
    // hence the acquire here)
    if (smp_load_acquire(&req->bytes_left)) {
        eggsfs_warn_print("unexpected written resp before writing out block");
        return atomic_set_return(&req->status, -EIO);
    }

    // We're good!
    atomic_set_return(&req->status, 1);

    return len0-len;
}
|
||||
|
||||
// Dispatches incoming data to in-flight write requests in FIFO order, and
// retires every request whose status becomes non-zero (done or errored).
//
// Also doubles as the drain path when the socket has failed: with
// socket->sock_err set (and skb==NULL, len==0 — see the out_err path in
// eggsfs_write_block_work), every queued request is failed with that error.
// Always reports the full `len` as consumed to tcp_read_sock.
static int eggsfs_write_block_receive_inner(struct eggsfs_block_socket* socket, struct sk_buff* skb, unsigned int offset, size_t len) {
    int initial_len = len;
    int sock_err = atomic_read(&socket->sock_err);
    while (true) {
        // Peek the head request; the list is only mutated under requests_lock.
        spin_lock_bh(&socket->requests_lock);
        struct eggsfs_write_block_request* req = list_empty(&socket->requests) ?
            NULL :
            list_first_entry(&socket->requests, struct eggsfs_write_block_request, socket_requests);
        spin_unlock_bh(&socket->requests_lock);
        if (req == NULL) { break; }
        int consumed = sock_err;
        if (consumed == 0) {
            consumed = eggsfs_write_block_receive_single_req(socket, req, skb, offset, len);
            if (consumed < 0) {
                // Parsing failed: treat it as a socket-level error so the
                // remaining queued requests are failed too.
                sock_err = consumed;
            } else {
                offset += consumed;
                len -= consumed;
            }
        } else {
            // Socket already broken: fail the request without parsing.
            atomic_set(&req->status, consumed);
        }
        int status = atomic_read(&req->status);
        if (status != 0) {
            // Request finished (status>0) or failed (status<0): dequeue it,
            // notify the issuer, and kick the writer for the next request.
            spin_lock_bh(&socket->requests_lock);
            list_del(&req->socket_requests);
            spin_unlock_bh(&socket->requests_lock);
            trace_eggsfs_block_write_exit(req->block_service_id, req->block_id, status > 0 ? 0 : status);
            queue_work(eggsfs_wq, req->callback);
            // start writing the next request
            queue_work(eggsfs_wq, &socket->write_work);
        } else {
            // Head request needs more data; stop (FIFO ordering).
            break;
        }
    }
    if (sock_err == 0) {
        BUG_ON(len != 0); // can't have leftovers
    }
    return initial_len;
}
|
||||
|
||||
// tcp_read_sock callback: unwraps the block socket from the read descriptor
// and forwards to the actual receive logic.
static int eggsfs_write_block_receive(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
    struct eggsfs_block_socket* socket = rd_desc->arg.data;
    return eggsfs_write_block_receive_inner(socket, skb, offset, len);
}
|
||||
|
||||
static void eggsfs_write_block_data_ready(struct sock* sk) {
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
read_descriptor_t rd_desc;
|
||||
rd_desc.arg.data = sk->sk_user_data;
|
||||
rd_desc.count = 1;
|
||||
tcp_read_sock(sk, &rd_desc, eggsfs_write_block_receive);
|
||||
eggsfs_fetch_block_state_check(sk);
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
|
||||
// ->sk_write_space callback: when the TCP send buffer has room again, clear
// the no-space flag and reschedule the writer work so it can resume sending
// (the writer bails out with -EAGAIN when the buffer is full).
static void eggsfs_write_block_write_space(struct sock* sk) {
    read_lock_bh(&sk->sk_callback_lock);
    struct eggsfs_block_socket* socket = (struct eggsfs_block_socket*)sk->sk_user_data;
    if (sk_stream_is_writeable(sk)) {
        clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        queue_work(eggsfs_wq, &socket->write_work);
    }
    read_unlock_bh(&sk->sk_callback_lock);
}
|
||||
|
||||
// Workqueue function that drives the send side of a write-block socket:
// (1) sends (possibly resuming) the write request message for the head
// request, then (2) streams the block pages out with kernel_sendpage.
// Non-blocking: on -EAGAIN it returns and relies on sk_write_space to
// requeue it. On any other send error it marks the socket broken and drains
// the request queue.
static void eggsfs_write_block_work(struct work_struct* work) {
    struct eggsfs_block_socket* socket = container_of(work, struct eggsfs_block_socket, write_work);

    // The way this function works means that we will not start writing a request
    // before we've finished reading the proof for the previous one. We could be
    // slightly lower latency by writing ahead, but probably not a big difference.

    // Only ever work on the head of the queue (FIFO with the receive path).
    spin_lock_bh(&socket->requests_lock);
    struct eggsfs_write_block_request* req = list_empty(&socket->requests) ?
        NULL :
        list_first_entry(&socket->requests, struct eggsfs_write_block_request, socket_requests);
    spin_unlock_bh(&socket->requests_lock);
    if (req == NULL) {
        eggsfs_debug_print("no request to work on");
        return;
    }

    int err;

    // Still writing request header+body; the message is re-serialized each
    // time and we resume from req->write_req_written.
    while (req->write_req_written < EGGSFS_BLOCKS_REQ_HEADER_SIZE+EGGSFS_WRITE_BLOCK_REQ_SIZE) {
        char req_msg_buf[EGGSFS_BLOCKS_REQ_HEADER_SIZE+EGGSFS_WRITE_BLOCK_REQ_SIZE];
        char* req_msg = req_msg_buf;
        put_unaligned_le32(EGGSFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
        put_unaligned_le64(req->block_service_id, req_msg); req_msg += 8;
        *(u8*)req_msg = EGGSFS_BLOCKS_WRITE_BLOCK; req_msg += 1;
        {
            struct eggsfs_bincode_put_ctx ctx = {
                .start = req_msg,
                .cursor = req_msg,
                .end = req_msg_buf + sizeof(req_msg_buf),
            };
            eggsfs_write_block_req_put_start(&ctx, start);
            eggsfs_write_block_req_put_block_id(&ctx, start, req_block_id, req->block_id);
            eggsfs_write_block_req_put_crc(&ctx, req_block_id, req_crc, req->crc);
            eggsfs_write_block_req_put_size(&ctx, req_crc, req_size, req->size);
            eggsfs_write_block_req_put_certificate(&ctx, req_size, req_certificate, req->certificate);
            eggsfs_write_block_req_put_end(ctx, req_certificate, end);
            BUG_ON(ctx.cursor != ctx.end);
        }

        // send message (MSG_MORE: block data follows immediately)
        struct msghdr msg = {
            .msg_flags = MSG_MORE | MSG_DONTWAIT,
        };
        eggsfs_debug_print("sending write block req to %pI4:%d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
        struct kvec iov = {
            .iov_base = req_msg_buf + req->write_req_written,
            .iov_len = sizeof(req_msg_buf) - req->write_req_written,
        };
        int sent = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
        if (sent == -EAGAIN) { return; } // done for now, will be rescheduled by `sk_write_space`
        if (sent < 0) { err = sent; goto out_err; }
        req->write_req_written += sent;
    }

    // Write the pages out (singly linked via page->private, cursor in
    // req->page / req->page_offset).
    u32 bytes_left = req->bytes_left;
    while (bytes_left > 0) {
        struct page* page = req->page;
        int sent = kernel_sendpage(
            socket->sock, page,
            req->page_offset,
            min((u32)(PAGE_SIZE - req->page_offset), bytes_left),
            MSG_DONTWAIT
        );
        if (sent == -EAGAIN) {
            // done for now, will be rescheduled by `sk_write_space`
            break;
        }
        if (sent < 0) { err = sent; goto out_err; }
        req->page_offset += sent;
        bytes_left -= sent;
        if (req->page_offset >= PAGE_SIZE) {
            BUG_ON(req->page_offset != PAGE_SIZE);
            req->page_offset = 0;
            req->page = (struct page*)req->page->private;
        }
    }
    // Publish progress; paired with smp_load_acquire in the receive path,
    // which uses bytes_left==0 to validate the "written" response.
    smp_store_release(&req->bytes_left, bytes_left);
    return;

out_err:
    eggsfs_debug_print("block write failed err=%d", err);
    // NOTE(review): plain read_lock here, while the socket callbacks use
    // read_lock_bh on the same lock — confirm BH-safety is not needed on
    // this (process-context) path.
    read_lock(&socket->sock->sk->sk_callback_lock);
    atomic_set(&socket->sock_err, err);
    // this will just drain the request list erroring them all because of the
    // socket error.
    eggsfs_write_block_receive_inner(socket, NULL, 0, 0);
    read_unlock(&socket->sock->sk->sk_callback_lock);
}
|
||||
|
||||
// Obtains a socket for writing blocks to `block_service`. Reuses the fetch
// socket machinery, then overrides the sock callbacks with the write-side
// ones and initializes the writer work item. Returns ERR_PTR on failure.
struct eggsfs_block_socket* eggsfs_get_write_block_socket(struct eggsfs_block_service* block_service) {
    struct eggsfs_block_socket* socket = eggsfs_get_fetch_block_socket(block_service);
    if (IS_ERR(socket)) { return socket; }

    socket->sock->sk->sk_data_ready = eggsfs_write_block_data_ready;
    socket->sock->sk->sk_state_change = eggsfs_write_block_state_change;
    socket->sock->sk->sk_write_space = eggsfs_write_block_write_space;

    INIT_WORK(&socket->write_work, eggsfs_write_block_work);

    return socket;
}
|
||||
|
||||
// Releases a write-block socket. Write sockets are fetch sockets with
// replaced callbacks, so teardown is shared with the fetch path.
void eggsfs_put_write_block_socket(struct eggsfs_block_socket* socket) {
    eggsfs_put_fetch_block_socket(socket);
}
|
||||
|
||||
// Starts an asynchronous block write: initializes a request, enqueues it on
// the socket, and kicks the writer work (which performs the actual sending,
// to preserve FIFO ordering on the socket). `callback` is queued on
// eggsfs_wq when the request completes or fails. Returns the request or an
// ERR_PTR; the caller must wait for completion before freeing the pages.
struct eggsfs_write_block_request* eggsfs_write_block(
    struct eggsfs_block_socket* socket,
    struct work_struct* callback,
    // This must match with what you passed into `eggsfs_get_fetch_block_socket`
    u64 block_service_id,
    u64 block_id,
    u64 certificate,
    u32 size,
    u32 crc,
    // There must be enough pages to write everything. The next page should
    // be in ->private (the reasons for not using ->lru are purely because
    // how some other code works out)
    struct page* page
) {
    int err;

    struct eggsfs_write_block_request* req = kmem_cache_alloc(eggsfs_write_block_request_cachep, GFP_KERNEL);
    if (!req) { err = -ENOMEM; goto out_err; }

    req->page = page;
    atomic_set(&req->status, 0);  // 0 = in flight
    req->bytes_left = size;
    req->page_offset = 0;
    req->write_resp_read = 0;
    req->write_req_written = 0;
    req->written_resp_read = 0;
    req->block_service_id = block_service_id;
    req->block_id = block_id;
    req->crc = crc;
    req->size = size;
    req->certificate = certificate;
    req->callback = callback;
    // we rely on the proof becoming non-zero to find out when the request is finished
    memset(&req->written_resp.buf, 0, sizeof(req->written_resp.buf));

    spin_lock_bh(&socket->requests_lock);
    list_add_tail(&req->socket_requests, &socket->requests);
    spin_unlock_bh(&socket->requests_lock);

    // the request is filled in by the normal writer, since we need to ensure ordering
    queue_work(eggsfs_wq, &socket->write_work);

    trace_eggsfs_block_write_enter(block_service_id, block_id, size);

    return req;

out_err:
    eggsfs_info_print("couldn't start write block request, err=%d", err);
    return ERR_PTR(err);
}
|
||||
|
||||
// Frees a write request. Must only be called once the request has completed
// (see the header: the receive path holds no reference of its own).
void eggsfs_put_write_block_request(struct eggsfs_write_block_request* req) {
    kmem_cache_free(eggsfs_write_block_request_cachep, req);
}
|
||||
|
||||
// Slab constructor: runs once per object when the slab page is created, so
// the completion is initialized once and later reused via
// reinit_completion() in eggsfs_fetch_block.
static void eggsfs_init_fetch_block_request_once(void *p) {
    struct eggsfs_fetch_block_request* req = p;
    init_completion(&req->comp);
}
|
||||
|
||||
// Module-init hook: creates the slab caches for fetch and write block
// requests. Returns 0 on success, -ENOMEM otherwise (rolling back the first
// cache if the second fails).
int __init eggsfs_blocksimple_init(void) {
    int err;
    eggsfs_fetch_block_request_cachep = kmem_cache_create(
        "eggsfs_fetch_block_request_cache",
        sizeof(struct eggsfs_fetch_block_request),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        eggsfs_init_fetch_block_request_once  // one-time ctor (completion init)
    );
    if (!eggsfs_fetch_block_request_cachep) { err = -ENOMEM; goto out_err; }

    eggsfs_write_block_request_cachep = kmem_cache_create(
        "eggsfs_write_block_request_cache",
        sizeof(struct eggsfs_write_block_request),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        NULL  // no ctor: eggsfs_write_block initializes every field
    );
    if (!eggsfs_write_block_request_cachep) { err = -ENOMEM; goto out_fetch_request; }

    return 0;

out_fetch_request:
    kmem_cache_destroy(eggsfs_fetch_block_request_cachep);
out_err:
    return err;
}
|
||||
|
||||
// Module-exit hook: destroys both request caches. Assumes no requests are
// still in flight (a known gap, see the TODO).
void __cold eggsfs_blocksimple_exit(void) {
    // TODO: handle case where there still are requests in flight.
    kmem_cache_destroy(eggsfs_fetch_block_request_cachep);
    kmem_cache_destroy(eggsfs_write_block_request_cachep);
}
|
||||
+171
@@ -0,0 +1,171 @@
|
||||
#ifndef _EGGSFS_BLOCKSIMPLE_H
|
||||
#define _EGGSFS_BLOCKSIMPLE_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/net.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
#include "bincode.h"
|
||||
|
||||
#define EGGSFS_MAX_BLOCK_SIZE (100 << 20) // 100MiB
|
||||
|
||||
#define EGGSFS_BLOCKS_RESP_HEADER_SIZE (4 + 1) // protocol + kind
|
||||
|
||||
// A block service endpoint: its id plus two advertised (ip, port) pairs.
// NOTE(review): which of the two addresses is preferred is decided by the
// socket-creation code, not visible here — confirm there.
struct eggsfs_block_service {
    u64 id;
    u32 ip1;
    u32 ip2;
    u16 port1;
    u16 port2;
};
|
||||
|
||||
struct eggsfs_block_socket;
|
||||
|
||||
struct eggsfs_block_socket* eggsfs_get_fetch_block_socket(struct eggsfs_block_service* block_service);
|
||||
void eggsfs_put_fetch_block_socket(struct eggsfs_block_socket* socket);
|
||||
|
||||
// State of one in-flight fetch-block request.
struct eggsfs_fetch_block_request {
    // Pages where we write the block. When the request is done everything
    // will be available here. Freeing this is the responsibility of the
    // caller, who must wait for the request to be completed before doing so.
    struct list_head pages;

    // Completes when the block is fetched and all pages have been filled
    // in.
    struct completion comp;

    // Stores the error if something went wrong (to be checked by the caller before
    // inspecting pages or crc).
    int err;

    // CRC of the body
    u32 crc;

    // Below fields are private.

    // List node for list of requests enqueued for a given socket.
    struct list_head socket_requests;

    // The caller and blocksimple.c both need to make sure the request is still there
    // when they need it. We need such a thing since we sometimes send fetch requests
    // from contexts we want to be killable (i.e. syscalls). So in those cases we
    // need to drop them on the floor.
    atomic_t refcount;

    // How many bytes we have left to read.
    u32 bytes_left;
    // The cursor inside the current page.
    u16 page_offset;
    // How much we've read of the header.
    u8 header_read;
    // The fetch response has an empty body, so the header below (protocol +
    // kind, optionally followed by a 2-byte error) is all we ever buffer.
    static_assert(EGGSFS_FETCH_BLOCK_RESP_SIZE == 0);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2];
        struct {
            __le32 protocol;
            u8 kind;       // response kind; 0 signals an error response
            __le16 error;  // only meaningful when kind == 0
        } __attribute__((packed)) data;
    } header;
};
|
||||
|
||||
// Might return errors.
|
||||
struct eggsfs_fetch_block_request* eggsfs_fetch_block(
|
||||
struct eggsfs_block_socket* socket,
|
||||
// This must match with what you passed into `eggsfs_get_fetch_block_socket`
|
||||
u64 block_service_id,
|
||||
u64 block_id,
|
||||
u32 offset,
|
||||
u32 count
|
||||
);
|
||||
|
||||
// Must be called when you're done with the request. It's OK to call
|
||||
// this before the request has finished. If the caller has successfully waited for
|
||||
// the request to complete, it's OK to take ownership of the pages by removing them
|
||||
// from the list.
|
||||
void eggsfs_put_fetch_block_request(struct eggsfs_fetch_block_request* req);
|
||||
|
||||
struct eggsfs_block_socket* eggsfs_get_write_block_socket(struct eggsfs_block_service* block_service);
|
||||
void eggsfs_put_write_block_socket(struct eggsfs_block_socket* socket);
|
||||
|
||||
// State of one in-flight write-block request.
struct eggsfs_write_block_request {
    // Called when request is done
    struct work_struct* callback;

    // The current page we're reading from. Singly linked list, next one
    // in ->private. These pages are managed by the issuer of the request.
    struct page* page;

    // List node for list of requests enqueued for a given socket.
    struct list_head socket_requests;

    // Req info
    u64 block_service_id;
    u64 block_id;
    u32 crc;
    u32 size;
    u64 certificate;

    // * 0: not done yet
    // * 1: done
    // * -n: errored
    atomic_t status;

    u32 bytes_left; // how many bytes left to write
    u16 page_offset; // offset into page we're reading from

    // How much we've written of the write request
    u8 write_req_written;
    // How much we're read of the first response (after we ask to write), and of the
    // second (after we've written the block).
    u8 write_resp_read;
    // First response: empty body, so header (+ optional 2-byte error) only.
    static_assert(EGGSFS_WRITE_BLOCK_RESP_SIZE == 0);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE + 2];
        struct {
            __le32 protocol;
            u8 kind;       // 0 signals an error response
            __le16 error;  // only meaningful when kind == 0
        } __attribute__((packed)) data;
    } write_resp;
    u8 written_resp_read;
    // Second response carries the proof, which must be bigger than the
    // 2-byte error so the union below is unambiguous in size.
    static_assert(EGGSFS_BLOCK_WRITTEN_RESP_SIZE > 2);
    union {
        char buf[EGGSFS_BLOCKS_RESP_HEADER_SIZE+EGGSFS_BLOCK_WRITTEN_RESP_SIZE];
        struct {
            __le32 protocol;
            u8 kind;
            union {
                __le16 error;   // when kind == 0
                __be64 proof;   // when kind != 0
            };
        } __attribute__((packed)) data;
    } written_resp;
};
|
||||
|
||||
// Unlike fetching, here you _must_ wait for the request to be complete. This is since
|
||||
// we must know that nothing might use the pages passed in before freeing them.
|
||||
struct eggsfs_write_block_request* eggsfs_write_block(
|
||||
struct eggsfs_block_socket* socket,
|
||||
struct work_struct* callback,
|
||||
// This must match with what you passed into `eggsfs_get_fetch_block_socket`
|
||||
u64 block_service_id,
|
||||
u64 block_id,
|
||||
u64 certificate,
|
||||
u32 size,
|
||||
u32 crc,
|
||||
// There must be enough pages to write everything. The next page should
|
||||
// be in ->private (the reasons for not using ->lru are purely because
|
||||
// the code that actually uses this is easier to write this way).
|
||||
struct page* page
|
||||
);
|
||||
|
||||
// Must be called when you're done with the request. Can't be called
|
||||
// before the request is completed.
|
||||
void eggsfs_put_write_block_request(struct eggsfs_write_block_request* req);
|
||||
|
||||
int __init eggsfs_blocksimple_init(void);
|
||||
void __cold eggsfs_blocksimple_exit(void);
|
||||
|
||||
#endif
|
||||
Executable
+7
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
# Sync the tree to the remote build host and build the kernel module there.
set -eu -o pipefail

# Echo commands as they run.
set -x

./sync.sh
# "uovo" is the remote build machine; builds the kmod and the writefile tool.
ssh uovo 'cd eggs-kmod && make -j kmod writefile'
|
||||
@@ -0,0 +1,15 @@
|
||||
#ifndef _EGGSFS_COMMON_H
#define _EGGSFS_COMMON_H

// Maximum filename length accepted by eggsfs (see eggsfs_dir_add_entry).
#define EGGSFS_MAX_FILENAME 255

// Logging helpers; all prefix the function name.
//#define eggsfs_debug_print(fmt, args...) trace_printk(fmt "\n", ##args)
#define eggsfs_warn_print(fmt, args...) printk(KERN_WARNING "eggsfs: %s: " fmt "\n", __func__, ##args)
#define eggsfs_info_print(fmt, args...) printk(KERN_INFO "eggsfs: %s: " fmt "\n", __func__, ##args)

// Debug logging is compiled out by default; swap the two definitions below
// to enable it.
#define eggsfs_debug_print(fmt, args...) do {} while(0)
// #define eggsfs_debug_print(fmt, args...) printk(KERN_DEBUG "eggsfs: %s: " fmt "\n", __func__, ##args)

// Shared workqueue used for request callbacks and socket write work.
extern struct workqueue_struct* eggsfs_wq;

#endif
|
||||
@@ -0,0 +1,22 @@
|
||||
#ifndef _EGGSFS_COUNTER_H
#define _EGGSFS_COUNTER_H

#include <linux/compiler.h>
#include <linux/percpu.h>

// Per-CPU u64 counters: cheap increments (no atomics, raw_cpu_inc), summed
// over all possible CPUs on read.
#define EGGSFS_DECLARE_COUNTER(_name) DECLARE_PER_CPU(u64, _name)
#define EGGSFS_DEFINE_COUNTER(_name) DEFINE_PER_CPU(u64, _name)

#define eggsfs_counter_inc(_counter) raw_cpu_inc(_counter)

// Sums every CPU's slot. Reads are unsynchronized, so the total is only an
// approximate snapshot while increments are in flight.
#define eggsfs_counter_get(_counter) ({ \
    u64 total = 0; \
    int cpu; \
    for_each_possible_cpu(cpu) { \
        total += *per_cpu_ptr(_counter, cpu); \
    } \
    total; \
})

#endif
|
||||
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
#include "crc.h"
|
||||
|
||||
#include "intrshims.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "crc32c.c"
|
||||
|
||||
// FPU-accelerated CRC32C; caller must wrap in kernel_fpu_begin/end (see crc.h).
u32 eggsfs_crc32c(u32 crc, const char* buf, size_t len) {
    return crc32c(crc, buf, len);
}
|
||||
|
||||
// Non-FPU CRC32C variant; safe without kernel_fpu_begin/end.
u32 eggsfs_crc32c_simple(u32 crc, const char* buf, size_t len) {
    return crc32c_simple(crc, buf, len);
}
|
||||
|
||||
// Combines two CRCs of same-length buffers XORed together (see crc32c.c).
u32 eggsfs_crc32c_xor(u32 crc1, u32 crc2, size_t len) {
    return crc32c_xor(crc1, crc2, len);
}
|
||||
|
||||
// CRC of the concatenation of two buffers, given their CRCs and the second
// buffer's length (see crc32c.c).
u32 eggsfs_crc32c_append(u32 crc1, u32 crc2, size_t len2) {
    return crc32c_append(crc1, crc2, len2);
}
|
||||
|
||||
// CRC after extending (or shrinking, for negative `zeros` -- see crc32c.c)
// the input with zero bytes.
u32 eggsfs_crc32c_zero_extend(u32 crc, ssize_t zeros) {
    return crc32c_zero_extend(crc, zeros);
}
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
#ifndef _EGGSFS_CRC_H
|
||||
#define _EGGSFS_CRC_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
// You _must_ wrap this with kernel_fpu_begin/kernel_fpu_end!
|
||||
// It's not required for the other functions.
|
||||
u32 eggsfs_crc32c(u32 crc, const char* buf, size_t len);
|
||||
|
||||
// The only difference with `eggsfs_crc32c` is that kernel_fpu_begin/kernel_fpu_end
|
||||
// are not required for this one.
|
||||
u32 eggsfs_crc32c_simple(u32 crc, const char* buf, size_t len);
|
||||
|
||||
u32 eggsfs_crc32c_xor(u32 crc1, u32 crc2, size_t len);
|
||||
|
||||
u32 eggsfs_crc32c_append(u32 crc1, u32 crc2, size_t len2);
|
||||
|
||||
u32 eggsfs_crc32c_zero_extend(u32 crc, ssize_t zeros);
|
||||
|
||||
#endif
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../cpp/crc32c/crc32c.c
|
||||
+429
@@ -0,0 +1,429 @@
|
||||
#include "dir.h"
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
#include "inode.h"
|
||||
#include "metadata.h"
|
||||
#include "trace.h"
|
||||
|
||||
// sysctls
|
||||
int eggsfs_dir_refresh_time; // in jiffies
|
||||
//
|
||||
#define eggsfs_dir_get_page_n(_page) ({ *((((u32*)&(_page)->private))+1); })
|
||||
#define eggsfs_dir_set_page_n(_page, _n) ({ *((((u32*)&(_page)->private))+1) = _n; })
|
||||
|
||||
// Size in bytes of one on-page directory entry:
// 8-byte inode number + 1-byte name length + the name itself.
static inline int eggsfs_dir_entry_size(int name_len) {
    const int inode_bytes = 8;
    const int len_bytes = 1;
    return inode_bytes + len_bytes + name_len;
}
|
||||
|
||||
// Decodes one directory entry from `buf`: little-endian u64 inode, u8 name
// length, then the (non-NUL-terminated) name in place. `*name` points into
// `buf` itself; no copy is made.
static inline void eggsfs_dir_entry_parse(const char* buf, u64* ino, u8* name_len, const char** name) {
    *ino = get_unaligned_le64(buf); buf += 8;
    *name_len = *(u8*)buf; buf++;
    *name = buf; buf += *name_len;
}
|
||||
|
||||
// Per-open-directory iteration state, stored in filp->private_data by
// eggsfs_dir_open.
struct eggsfs_readdir_ctx {
    // Page the cursor currently points into.
    struct page* current_page;
    // Head of the cached page chain (holds a cache reference).
    struct page* pages;
    // Byte offset of the cursor within current_page.
    u32 page_offset;
    // Index of the next entry within current_page.
    u32 page_n;
};
|
||||
|
||||
// Increments the refcount of the cache in ->private under RCU.
|
||||
// After this we know the page won't be deallocated until we
|
||||
// decrease the refcount.
|
||||
// Increments the refcount of the cache in ->private under RCU.
// After this we know the page won't be deallocated until we
// decrease the refcount.
//
// Returns NULL if the directory currently has no cached pages. The first
// page's ->private doubles as the refcount for the whole chain.
static struct page* eggsfs_dir_get_cache(struct eggsfs_inode* enode) {
    struct page* page;
    rcu_read_lock();
    page = rcu_dereference(enode->dir.pages);
    if (page) { atomic_inc((atomic_t*)&page->private); }
    rcu_read_unlock();
    return page;
}
|
||||
|
||||
// Decrements the refcount, and frees up all the pages in the
|
||||
// cache if we were the last ones. Must be called by
|
||||
// somebody who called eggsfs_dir_get_cache before.
|
||||
//
|
||||
// Note that ->current_page and ->dir_pages are both holding a
|
||||
// reference, so once you clear those you should call `eggsfs_dir_put_cache`.
|
||||
// Decrements the refcount, and frees up all the pages in the
// cache if we were the last ones. Must be called by
// somebody who called eggsfs_dir_get_cache before.
//
// Note that ->current_page and ->dir_pages are both holding a
// reference, so once you clear those you should call `eggsfs_dir_put_cache`.
static void eggsfs_dir_put_cache(struct page* page) {
    if (atomic_dec_and_test((atomic_t*)&page->private)) {
        // Last reference: walk the chain (next page in ->mapping), reset the
        // repurposed fields, and free all pages in one go.
        LIST_HEAD(pages);
        while (page) {
            struct page* next = (struct page*)page->mapping;
            page->mapping = NULL;
            page->private = 0;
            list_add_tail(&page->lru, &pages);
            page = next;
        }
        put_pages_list(&pages);
    }
}
|
||||
|
||||
// RCU callback: drops the ->dir.pages reference after a grace period, once
// no reader can still be dereferencing the old pointer.
static void __eggsfs_dir_put_cache_rcu(struct rcu_head* rcu) {
    struct page* page = container_of(rcu, struct page, rcu_head);
    eggsfs_dir_put_cache(page);
}
|
||||
|
||||
// Removes the cache from ->dir_pages, and also calls
|
||||
// `eggsfs_dir_put_cache` to clear the reference.
|
||||
// Removes the cache from ->dir_pages, and also calls
// `eggsfs_dir_put_cache` to clear the reference.
//
// The swap is done with an atomic xchg so concurrent droppers race safely;
// the actual put is deferred via RCU so in-flight eggsfs_dir_get_cache
// readers stay valid.
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode) {
    struct page* page;
    // page->rcu_head and page->lru overlap in struct page; make sure reusing
    // rcu_head here doesn't clobber anything of a different size.
    static_assert(sizeof(struct rcu_head) == sizeof(struct list_head));
    // Fast path: nothing cached.
    if (!atomic64_read((atomic64_t*)&enode->dir.pages)) { return; }
    page = (struct page*)atomic64_xchg((atomic64_t*)&enode->dir.pages, (u64)0);
    if (page) { call_rcu(&page->rcu_head, __eggsfs_dir_put_cache_rcu); }
}
|
||||
|
||||
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_seqno);
|
||||
|
||||
// ->open for directories: revalidates the dentry metadata if stale, then
// obtains (or builds, reading the whole directory from the metadata service)
// the cached page chain holding the directory entries, and sets up the
// readdir cursor in filp->private_data.
//
// The build is serialized with a latch so only one task reads the directory;
// losers wait (killably) and retry. A cache whose ->index doesn't match the
// current mtime is considered stale and dropped.
static int eggsfs_dir_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    struct eggsfs_readdir_ctx* ctx;
    struct page* page;
    int err;

    trace_eggsfs_vfs_opendir_enter(inode);

    // Refresh attributes if the cached mtime has expired.
    if (get_jiffies_64() >= enode->mtime_expiry) {
        err = eggsfs_dir_revalidate(enode);
        if (err) { return err; }
    }

    ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
    if (!ctx) { return -ENOMEM; }

again: // progress: whoever wins the lock either get pages or fails
    page = eggsfs_dir_get_cache(enode);
    // page->index stores the mtime the cache was built for; mismatch = stale.
    if (page && page->index != READ_ONCE(enode->mtime)) {
        eggsfs_dir_put_cache(page);
        eggsfs_dir_drop_cache(enode);
        page = NULL;
    }
    if (!page) {
        int seqno;
        if (!eggsfs_latch_try_acquire(&enode->dir.pages_latch, seqno)) {
            // Someone else is building the cache; wait for them and retry.
            err = eggsfs_latch_wait_killable(&enode->dir.pages_latch, seqno);
            if (err) { goto out_ctx; }
            goto again;
        }

        // If somebody else got to creating the page already, drop it,
        // otherwise we'll leak.
        page = eggsfs_dir_get_cache(enode);
        if (page && page->index != READ_ONCE(enode->mtime)) {
            eggsfs_dir_put_cache(page);
            eggsfs_dir_drop_cache(enode);
            page = NULL;
        }

        if (!page) {
            u64 dir_mtime = READ_ONCE(enode->mtime);
            page = eggsfs_dir_read_all(filp->f_path.dentry, dir_mtime);
            if (!IS_ERR(page)) {
                // If the mtime matches (which will be the case unless somebody
                // else revalidated between the READ_ONCE above), then the refcount
                // starts at two: one for the ->dir_pages, and one for the
                // ->current_page below. Otherwise only for ->current_page (basically
                // we have these pages to be private to this read).
                if (likely(READ_ONCE(enode->mtime) == dir_mtime)) {
                    atomic_set((atomic_t*)&page->private, 2);
                    rcu_assign_pointer(enode->dir.pages, page);
                } else {
                    atomic_set((atomic_t*)&page->private, 1);
                }
            }
        }
        eggsfs_latch_release(&enode->dir.pages_latch, seqno);
    }
    if (IS_ERR(page)) { err = PTR_ERR(page); goto out_ctx; }

    // Cursor starts at the first entry of the first page.
    ctx->current_page = page;
    ctx->pages = page;
    ctx->page_offset = 0;
    ctx->page_n = 0;

    filp->private_data = ctx;

    trace_eggsfs_vfs_opendir_exit(inode, 0);
    return 0;

out_ctx:
    kfree(ctx);
    trace_eggsfs_vfs_opendir_exit(inode, err);
    eggsfs_debug_print("err=%d", err);
    return err;
}
|
||||
|
||||
// Installs or refreshes the dcache entry for (parent, name) -> ino while
// filling the directory cache, stamping d_time with `dir_seqno` so later
// revalidation can tell which listing the entry came from.
//
// If a dentry already exists with a different inode, it is invalidated and
// re-created; a new dentry is bound to its inode via d_splice_alias.
// Best-effort: allocation/splice failures are silently ignored.
//
// Fix vs the previous version: removed the stray `;` after the function's
// closing brace (an empty file-scope declaration, warned about under
// -Wpedantic).
static void update_dcache(struct dentry* parent, u64 dir_seqno, const char* name, int name_len, u64 ino) {
    struct qstr filename = QSTR_INIT(name, name_len);
    struct dentry* dentry;
    struct dentry* alias;
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

    eggsfs_debug_print("parent=%pd name=%*pE ino=%llx", parent, name_len, name, ino);

    filename.hash = full_name_hash(parent, filename.name, filename.len);

    dentry = d_lookup(parent, &filename);
    if (!dentry) {
again:
        dentry = d_alloc_parallel(parent, &filename, &wq);
        if (IS_ERR(dentry)) { return; }
    }
    if (!d_in_lookup(dentry)) {
        // pre-existing entry
        struct inode* old_inode = d_inode(dentry);
        if (old_inode && old_inode->i_ino == ino) {
            // Same inode: just refresh the listing stamp.
            WRITE_ONCE(dentry->d_time, dir_seqno);
            goto out;
        }
        // Different (or negative) inode: drop it and allocate afresh.
        d_invalidate(dentry);
        dput(dentry);
        goto again;
    } else {
        // new entry
        struct inode* inode = eggsfs_get_inode(parent->d_sb, ino);
        // d_splice_alias propagates error in inode
        WRITE_ONCE(dentry->d_time, dir_seqno);
        alias = d_splice_alias(inode, dentry);
        d_lookup_done(dentry);
        if (alias) {
            dput(dentry);
            dentry = alias;
        }
        if (IS_ERR(dentry)) { return; }
    }

out:
    dput(dentry);
}
|
||||
|
||||
// State used while building the per-directory readdir page chain (see
// eggsfs_dir_read_all / eggsfs_dir_add_entry): entries are packed into
// pages linked through page->mapping.
struct eggsfs_dir_fill_ctx {
    // Dentry of the directory being read; used to refresh the dcache.
    struct dentry* parent;
    // Directory mtime at the start of the read; stamped into dentries.
    u64 dir_mtime;

    // NULL or kmap'ed at page_addr
    struct page* current_page;
    // kmap'd address of current_page
    char* page_addr;
    // Current offset inside the page we're writing
    u32 page_off;
    // Number of eggsfs dir entries in the current page
    u32 page_n;
};
|
||||
|
||||
// Appends one directory entry (ino, name_len, name — little-endian u64 ino,
// one length byte, then the raw name) to the page chain in `ctx`, starting a
// new page when the current one is full.
// Returns 0, -ENAMETOOLONG, -ENOMEM, or -EIO.
static int eggsfs_dir_add_entry(struct eggsfs_dir_fill_ctx* ctx, const char* name, int name_len, u64 ino) {
    if (name_len > EGGSFS_MAX_FILENAME) { return -ENAMETOOLONG; }
    // Flush current page, and start a new one, linking it in ->mapping
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        struct page* page = alloc_page(GFP_KERNEL);
        if (!page) { return -ENOMEM; }

        // Seal the full page: unmap it, chain the new page after it, and
        // record how many entries it holds.
        kunmap(ctx->current_page);
        ctx->current_page->mapping = (void*)page;
        eggsfs_dir_set_page_n(ctx->current_page, ctx->page_n);

        ctx->current_page = page;
        ctx->page_addr = kmap(page);
        ctx->page_off = 0;
        ctx->page_n = 0;
    }
    // Defensive re-check: with EGGSFS_MAX_FILENAME bounded this should be
    // unreachable (a single entry would have to exceed PAGE_SIZE).
    if (ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE) {
        WARN_ON(ctx->page_off + eggsfs_dir_entry_size(name_len) > PAGE_SIZE);
        return -EIO;
    }

    // Serialize the entry in place.
    char* entry = ctx->page_addr + ctx->page_off;
    put_unaligned_le64(ino, entry); entry += 8;
    *(u8*)entry = (u8)name_len; entry++;
    memcpy(entry, name, name_len);
    ctx->page_off += eggsfs_dir_entry_size(name_len);
    ++ctx->page_n;
    return 0;
}
|
||||
|
||||
// Callback invoked by the shard readdir machinery for every entry returned:
// first refreshes the dcache for the entry, then appends it to the page
// chain being built in the fill context.
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 ino) {
    struct eggsfs_dir_fill_ctx* fill = (struct eggsfs_dir_fill_ctx*)ptr;

    eggsfs_debug_print("hash=%llx ino=%llx, name_len=%d, name=%*pE", hash, ino, name_len, name_len, name);

    // Keep the dcache in sync with what we just received.
    update_dcache(fill->parent, fill->dir_mtime, name, name_len, ino);

    // Then record the entry in our private readdir pages.
    return eggsfs_dir_add_entry(fill, name, name_len, ino);
}
|
||||
|
||||
// mut hold
|
||||
static struct page* eggsfs_dir_read_all(struct dentry* dentry, u64 dir_mtime) {
|
||||
bool eof = false;
|
||||
int err;
|
||||
u64 continuation_key = 0;
|
||||
struct inode* inode = d_inode(dentry);
|
||||
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
||||
struct eggsfs_dir_fill_ctx ctx = {
|
||||
.parent = dentry,
|
||||
.dir_mtime = dir_mtime,
|
||||
};
|
||||
struct page* first_page;
|
||||
|
||||
first_page = alloc_page(GFP_KERNEL);
|
||||
if (!first_page) { return ERR_PTR(-ENOMEM); }
|
||||
ctx.current_page = first_page;
|
||||
ctx.page_addr = kmap(first_page);
|
||||
ctx.page_off = 0;
|
||||
ctx.page_n = 0;
|
||||
first_page->index = dir_mtime;
|
||||
|
||||
while (!eof) {
|
||||
eggsfs_debug_print("cont_key=%llx", continuation_key);
|
||||
err = eggsfs_shard_readdir((struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, continuation_key, &ctx, &continuation_key);
|
||||
if (err) { err = eggsfs_error_to_linux(err); goto out_err; }
|
||||
eof = continuation_key == 0;
|
||||
}
|
||||
|
||||
kunmap(ctx.current_page);
|
||||
eggsfs_dir_set_page_n(ctx.current_page, ctx.page_n);
|
||||
|
||||
return first_page;
|
||||
|
||||
out_err:
|
||||
kunmap(ctx.current_page);
|
||||
while (first_page) {
|
||||
struct page* next = (struct page*)first_page->mapping;
|
||||
first_page->mapping = NULL;
|
||||
put_page(first_page);
|
||||
first_page = next;
|
||||
}
|
||||
eggsfs_info_print("err=%d", err);
|
||||
return ERR_PTR(err);
|
||||
};
|
||||
|
||||
// vfs: exclusive inode.i_rwsem
|
||||
// vfs: mutex file.f_pos_lock
|
||||
static int eggsfs_dir_read(struct file* filp, struct dir_context* dctx) {
|
||||
struct eggsfs_readdir_ctx* ctx = filp->private_data;
|
||||
char* page_addr;
|
||||
|
||||
WARN_ON(!ctx);
|
||||
if (!ctx) { return -EINVAL; }
|
||||
|
||||
eggsfs_debug_print("ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u", ctx,
|
||||
ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n);
|
||||
|
||||
if (!dir_emit_dots(filp, dctx)) { return 0; }
|
||||
if (!ctx->current_page || !eggsfs_dir_get_page_n(ctx->current_page)) { return 0; }
|
||||
|
||||
page_addr = kmap(ctx->current_page);
|
||||
|
||||
while (ctx->current_page) {
|
||||
int dtype;
|
||||
const char* entry = page_addr + ctx->page_offset;
|
||||
eggsfs_debug_print("next page_addr=%p ctx.page_offset=%u entry=%p", page_addr, ctx->page_offset, entry);
|
||||
|
||||
u64 ino;
|
||||
u8 name_len;
|
||||
const char* name;
|
||||
eggsfs_dir_entry_parse(entry, &ino, &name_len, &name);
|
||||
eggsfs_debug_print("name=%*pE ino=0x%016llx", name_len, name, ino);
|
||||
|
||||
switch (eggsfs_inode_type(ino)) {
|
||||
case EGGSFS_INODE_DIRECTORY: dtype = DT_DIR; break;
|
||||
case EGGSFS_INODE_FILE: dtype = DT_REG; break;
|
||||
case EGGSFS_INODE_SYMLINK: dtype = DT_LNK; break;
|
||||
default: dtype = DT_UNKNOWN;
|
||||
}
|
||||
|
||||
if (!dir_emit(dctx, name, name_len, ino, dtype)) {
|
||||
eggsfs_debug_print("break");
|
||||
break;
|
||||
}
|
||||
|
||||
ctx->page_offset += eggsfs_dir_entry_size(name_len);
|
||||
++ctx->page_n;
|
||||
++dctx->pos;
|
||||
|
||||
eggsfs_debug_print(
|
||||
"next ctx=%p ctx.current_page=%p ctx.pages=%p ctx.page_offset=%u ctx.page_n=%u page_n=%u", ctx,
|
||||
ctx->current_page, ctx->pages, ctx->page_offset, ctx->page_n, eggsfs_dir_get_page_n(ctx->current_page)
|
||||
);
|
||||
|
||||
if (ctx->page_n >= eggsfs_dir_get_page_n(ctx->current_page)) {
|
||||
kunmap(ctx->current_page);
|
||||
ctx->current_page = (struct page*)ctx->current_page->mapping;
|
||||
eggsfs_debug_print("next page current_page = %p", ctx->current_page);
|
||||
ctx->page_offset = 0;
|
||||
ctx->page_n = 0;
|
||||
page_addr = kmap(ctx->current_page);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->current_page) {
|
||||
eggsfs_debug_print("unmap current_page = %p", ctx->current_page);
|
||||
kunmap(ctx->current_page);
|
||||
}
|
||||
|
||||
eggsfs_debug_print("done");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// multiple seeks at the same time?
|
||||
static loff_t eggsfs_llseek_dir(struct file* filp, loff_t offset, int whence) {
|
||||
int err;
|
||||
struct eggsfs_readdir_ctx* ctx = filp->private_data;
|
||||
loff_t dir_offset;
|
||||
|
||||
if (whence != SEEK_CUR) { return -EINVAL; }
|
||||
// concurrency with readdir?
|
||||
// TODO: if offset == 0 reacquire pages
|
||||
|
||||
ctx->current_page = ctx->pages;
|
||||
ctx->page_offset = 0;
|
||||
ctx->page_n = 0;
|
||||
if (offset > 2) {
|
||||
dir_offset = offset - 2;
|
||||
while (ctx->current_page && eggsfs_dir_get_page_n(ctx->current_page) <= dir_offset) {
|
||||
dir_offset -= eggsfs_dir_get_page_n(ctx->current_page);
|
||||
ctx->current_page = (struct page*)ctx->current_page->mapping;
|
||||
}
|
||||
if (ctx->current_page) {
|
||||
char* page_addr = (char*)kmap(ctx->current_page);
|
||||
while (ctx->page_n < dir_offset) {
|
||||
BUG_ON(ctx->page_offset >= PAGE_SIZE);
|
||||
u64 ino;
|
||||
u8 name_len;
|
||||
const char* name;
|
||||
eggsfs_dir_entry_parse(page_addr + ctx->page_offset, &ino, &name_len, &name);
|
||||
ctx->page_offset += eggsfs_dir_entry_size(name_len);
|
||||
++ctx->page_n;
|
||||
}
|
||||
kunmap(ctx->current_page);
|
||||
}
|
||||
}
|
||||
|
||||
filp->f_pos = offset;
|
||||
|
||||
return offset;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ->release for directories: drops the reference on the cached readdir
// pages taken at open time and frees the per-open context.
static int eggsfs_dir_close(struct inode* inode, struct file* filp) {
    struct eggsfs_readdir_ctx* ctx = (struct eggsfs_readdir_ctx*)filp->private_data;

    // ->release only runs for successfully opened files, so a missing
    // context indicates a bug elsewhere.
    WARN_ON(!ctx);
    if (!ctx) { return -EINVAL; }

    trace_eggsfs_vfs_closedir_enter(inode);
    eggsfs_dir_put_cache(ctx->pages);
    kfree(ctx);
    trace_eggsfs_vfs_closedir_exit(inode, 0);
    return 0;
}
|
||||
|
||||
// Directory file operations.
// NOTE(review): .iterate and .iterate_shared point at the same function;
// iterate_shared is called with only a shared inode lock, so
// eggsfs_dir_read must not rely on exclusive inode locking — confirm.
struct file_operations eggsfs_dir_operations = {
    .open = eggsfs_dir_open,
    .iterate = eggsfs_dir_read,
    .iterate_shared = eggsfs_dir_read,
    .release = eggsfs_dir_close,
};
|
||||
+41
@@ -0,0 +1,41 @@
|
||||
#ifndef _EGGSFS_DIR_H
#define _EGGSFS_DIR_H

#include <linux/fs.h>

#include "common.h"
#include "inode.h"

// Directory file_operations (open/iterate/release), defined in dir.c.
extern struct file_operations eggsfs_dir_operations;

// Refresh interval for directory attributes (defined elsewhere; units not
// visible here — presumably seconds or jiffies, confirm at the definition).
extern int eggsfs_dir_refresh_time;

// Callback fed to the shard readdir machinery: updates the dcache and
// appends the entry to the readdir page cache being built.
int eggsfs_dir_readdir_entry_cb(void* ptr, const char* name, int name_len, u64 hash, u64 ino);

// Drops the cached readdir pages of a directory inode.
void eggsfs_dir_drop_cache(struct eggsfs_inode* enode);

// Re-fetches the directory's attributes (and therefore its mtime/expiry).
static inline int eggsfs_dir_revalidate(struct eggsfs_inode* enode) {
    return eggsfs_do_getattr(enode);
}

// Decides whether `dentry` is still valid with respect to its parent `dir`:
//   0       => invalid (the dentry's d_time no longer matches dir->mtime)
//   -ECHILD => revalidate parent dir first (its mtime lease expired)
//   1       => valid
// NOTE(review): dir->mtime is read here without READ_ONCE while other code
// uses READ_ONCE/WRITE_ONCE on it — confirm torn/stale reads are acceptable
// in this path.
static inline int eggsfs_dir_needs_reval(struct eggsfs_inode* dir, struct dentry* dentry) {
    u64 dentry_dir_mtime = dentry->d_time;
    int ret;
    u64 t = get_jiffies_64();
    if (dentry_dir_mtime != dir->mtime) { ret = 0; }
    else if (t >= dir->mtime_expiry) { ret = -ECHILD; }
    else { ret = 1; }

    eggsfs_debug_print(
        "t=%llu dir=%p dir_id=0x%016lx dir_mtime=%llu dir_mtime_expiry=%llu dentry=%pd dentry_dir_mtime=%llu -> ret=%d",
        t, dir, dir->inode.i_ino, dir->mtime, dir->mtime_expiry, dentry, dentry_dir_mtime, ret
    );

    return ret;
}

#endif
|
||||
|
||||
+41
@@ -0,0 +1,41 @@
|
||||
#include "err.h"
|
||||
|
||||
// Translates an eggsfs protocol error code into a negative Linux errno.
// Safe to use with non-error `err` (0 maps to 0). Any protocol error with
// no specific mapping — including unrecognized codes — collapses to -EIO.
int eggsfs_error_to_linux(int err) {
    if (err == 0) { return 0; }
    switch (err) {
    case EGGSFS_ERR_INTERNAL_ERROR: return -EIO;
    case EGGSFS_ERR_FATAL_ERROR: return -EIO;
    case EGGSFS_ERR_TIMEOUT: return -ETIMEDOUT;
    case EGGSFS_ERR_NOT_AUTHORISED: return -EPERM;
    case EGGSFS_ERR_UNRECOGNIZED_REQUEST: return -EIO;
    case EGGSFS_ERR_FILE_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_NAME_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_TYPE_IS_DIRECTORY: return -EISDIR;
    case EGGSFS_ERR_TYPE_IS_NOT_DIRECTORY: return -ENOTDIR;
    case EGGSFS_ERR_BAD_COOKIE: return -EBADCOOKIE;
    case EGGSFS_ERR_INCONSISTENT_STORAGE_CLASS_PARITY: return -EIO;
    case EGGSFS_ERR_LAST_SPAN_STATE_NOT_CLEAN: return -EIO;
    case EGGSFS_ERR_COULD_NOT_PICK_BLOCK_SERVICES: return -EIO;
    case EGGSFS_ERR_BAD_SPAN_BODY: return -EIO;
    case EGGSFS_ERR_SPAN_NOT_FOUND: return -EIO;
    case EGGSFS_ERR_BLOCK_SERVICE_NOT_FOUND: return -EIO;
    case EGGSFS_ERR_CANNOT_CERTIFY_BLOCKLESS_SPAN: return -EIO;
    case EGGSFS_ERR_BAD_NUMBER_OF_BLOCKS_PROOFS: return -EIO;
    case EGGSFS_ERR_BAD_BLOCK_PROOF: return -EIO;
    case EGGSFS_ERR_CANNOT_OVERRIDE_NAME: return -EEXIST;
    case EGGSFS_ERR_NAME_IS_LOCKED: return -EIO;
    case EGGSFS_ERR_MISMATCHING_TARGET: return -EIO;
    case EGGSFS_ERR_MISMATCHING_OWNER: return -EIO;
    case EGGSFS_ERR_DIRECTORY_NOT_EMPTY: return -ENOTEMPTY;
    case EGGSFS_ERR_FILE_IS_TRANSIENT: return -EIO;
    case EGGSFS_ERR_OLD_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_NEW_DIRECTORY_NOT_FOUND: return -ENOENT;
    case EGGSFS_ERR_LOOP_IN_DIRECTORY_RENAME: return -ELOOP;
    case EGGSFS_ERR_MALFORMED_REQUEST: return -EIO;
    case EGGSFS_ERR_MALFORMED_RESPONSE: return -EIO;
    }
    return -EIO;
}
|
||||
|
||||
+11
@@ -0,0 +1,11 @@
|
||||
#ifndef _EGGSFS_ERR_H
#define _EGGSFS_ERR_H

#include <linux/errno.h>

#include "bincode.h"

// Maps an eggsfs protocol error code to a negative Linux errno; 0 maps to
// 0, unknown codes map to -EIO. Defined in err.c.
int eggsfs_error_to_linux(int err);

#endif
|
||||
|
||||
+855
@@ -0,0 +1,855 @@
|
||||
#include "file.h"
|
||||
|
||||
#include <linux/uio.h>
|
||||
|
||||
#include "bincode.h"
|
||||
#include "inode.h"
|
||||
#include "common.h"
|
||||
#include "metadata.h"
|
||||
#include "err.h"
|
||||
#include "rs.h"
|
||||
#include "block.h"
|
||||
#include "skb.h"
|
||||
#include "crc.h"
|
||||
#include "trace.h"
|
||||
#include "span.h"
|
||||
|
||||
// open_mutex held here
// really want atomic open for this
//
// ->open for regular files. Transient files (currently being written)
// cannot be opened. Opening for write requires the inode's WRITEABLE flag,
// which is cleared here so no second write-open can succeed, and the inode
// is marked transient for the duration of the write.
static int eggsfs_file_open(struct inode* inode, struct file* filp) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);

    eggsfs_debug_print("enode=%p flags=%x owner=%p", enode, enode->flags, current->group_leader);

    if (enode->flags & EGGSFS_INODE_TRANSIENT) {
        eggsfs_debug_print("trying to open transient file");
        return -ENOENT;
    }

    if (filp->f_mode & FMODE_WRITE) {
        eggsfs_debug_print("opening file for writing");
        if (!(enode->flags & EGGSFS_INODE_WRITEABLE)) {
            eggsfs_debug_print("trying to open for write non-writeable file");
            return -EPERM;
        }
        // Single-writer: consume the writeable flag and flag the inode
        // transient (protected by open_mutex per the note above).
        enode->flags &= ~EGGSFS_INODE_WRITEABLE;
        enode->flags |= EGGSFS_INODE_TRANSIENT;
    }

    return 0;
}
|
||||
|
||||
struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans) {
|
||||
// The zeroing is important: eg we rely on it in `eggsfs_flush_spans` to detect whether we
|
||||
// need to compute the parity or not.
|
||||
struct eggsfs_transient_span* span = kzalloc(sizeof(struct eggsfs_transient_span), GFP_KERNEL);
|
||||
if (!span) {
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
span->status = EGGSFS_SPAN_STATUS_WRITING;
|
||||
INIT_LIST_HEAD(&span->pages);
|
||||
list_add_tail(&span->list, spans);
|
||||
return span;
|
||||
}
|
||||
|
||||
// Context threaded through eggsfs_shard_add_span_initiate to its per-block
// callback (eggsfs_shard_add_span_initiate_block_cb).
struct initiate_ctx {
    // File whose span is being flushed (provides the flusher work struct).
    struct eggsfs_inode_file* file;
    // Span being flushed; callback fills its block_sockets/block_reqs.
    struct eggsfs_transient_span* span;
    // Per-cell CRCs, indexed [stripe*B + block] (see the callback).
    u32* cell_crcs;
};
|
||||
|
||||
// Called once per block when the shard answers an add-span-initiate request:
// opens a write socket to the assigned block service, derives the block CRC
// from the already-computed cell CRCs, and kicks off the block write.
// The socket and request are parked in the span (released later by
// eggsfs_block_writing_put_reqs). Returns 0 or a negative errno.
int eggsfs_shard_add_span_initiate_block_cb(
    void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate
) {
    struct initiate_ctx* ctx = data;
    struct eggsfs_block_service bs = {
        .id = block_service_id,
        .ip1 = ip1,
        .port1 = port1,
        .ip2 = ip2,
        .port2 = port2,
    };
    struct eggsfs_block_socket* sock = eggsfs_get_write_block_socket(&bs);
    if (IS_ERR(sock)) { return PTR_ERR(sock); }
    ctx->span->block_sockets[block] = sock;
    int B = eggsfs_blocks(ctx->span->parity);
    int cell_size = ctx->span->block_size / ctx->span->stripes;
    // The block CRC is the concatenation of this block's cells across all
    // stripes (cell_crcs is stripe-major: [s*B + block]).
    u32 block_crc = 0;
    int s;
    for (s = 0; s < ctx->span->stripes; s++) {
        block_crc = eggsfs_crc32c_append(block_crc, ctx->cell_crcs[s*B + block], cell_size);
    }
    struct eggsfs_write_block_request* req = eggsfs_write_block(
        sock,
        &ctx->file->flusher,
        block_service_id,
        block_id,
        certificate,
        ctx->span->block_size,
        block_crc,
        ctx->span->blocks[block]
    );
    if (IS_ERR(req)) {
        // The socket stays in block_sockets[block]; drop only the reference
        // we took above. (The span-wide cleanup will release the rest.)
        eggsfs_put_write_block_socket(sock);
        return PTR_ERR(req);
    }
    ctx->span->block_reqs[block] = req;
    return 0;
}
|
||||
|
||||
// Checks if the block-writing requests have all finished.
//
// * 0: not finished
// * 1: finished
// * -n: some request has failed with this error.
//
// NULL slots count as finished.
// NOTE(review): if a request errors after an earlier slot was seen pending,
// the pending status is overwritten by the error — i.e. an error return
// does not imply all requests have completed. The cleanup path re-polls
// with == 0 so this appears intentional; confirm.
static int eggsfs_block_writing_finished(struct eggsfs_transient_span* span) {
    int status = 1;
    int B = eggsfs_blocks(span->parity);
    int i;
    for (i = 0; i < B; i++) {
        struct eggsfs_write_block_request* req = span->block_reqs[i];
        if (req == NULL) {
            eggsfs_debug_print("%d is NULL", i);
            continue;
        }
        int req_status = atomic_read(&req->status);
        if (req_status < 0) { // error
            eggsfs_warn_print("request for block %d failed: %d", i, req_status);
            status = req_status;
        }
        if (req_status == 0) { // not completed yet
            eggsfs_debug_print("%d not completed yet", i);
            status = status == 1 ? 0 : status; // preserve errors
        }
    }
    return status;
}
|
||||
|
||||
static void eggsfs_block_writing_put_reqs(struct eggsfs_transient_span* span) {
|
||||
int B = eggsfs_blocks(span->parity);
|
||||
int i;
|
||||
for (i = 0; i < B; i++) {
|
||||
struct eggsfs_write_block_request* req = span->block_reqs[i];
|
||||
if (req != NULL) {
|
||||
eggsfs_put_write_block_request(req);
|
||||
span->block_reqs[i] = NULL;
|
||||
}
|
||||
struct eggsfs_block_socket* sock = span->block_sockets[i];
|
||||
if (sock != NULL) {
|
||||
eggsfs_put_write_block_socket(sock);
|
||||
span->block_sockets[i] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct page* eggsfs_alloc_write_page(struct eggsfs_inode_file* file) {
|
||||
// the zeroing is to just write out to blocks assuming we have trailing
|
||||
// zeros, we could do it on demand but I can't be bothered.
|
||||
struct page* p = alloc_page(GFP_KERNEL | __GFP_ZERO);
|
||||
if (p == NULL) { return p; }
|
||||
// this contributes to OOM score
|
||||
inc_mm_counter(file->owner->mm, MM_FILEPAGES);
|
||||
return p;
|
||||
}
|
||||
|
||||
static void eggsfs_free_transient_span(struct eggsfs_inode_file* file, struct eggsfs_transient_span* span) {
|
||||
// free pages
|
||||
int num_pages = 0;
|
||||
while (!list_empty(&span->pages)) {
|
||||
struct page *victim = lru_to_page(&span->pages);
|
||||
list_del(&victim->lru);
|
||||
put_page(victim);
|
||||
num_pages++;
|
||||
}
|
||||
add_mm_counter(file->owner->mm, MM_FILEPAGES, -num_pages);
|
||||
// Free reqs
|
||||
eggsfs_block_writing_put_reqs(span);
|
||||
// Free the span itself
|
||||
kfree(span);
|
||||
}
|
||||
|
||||
// Frees up everything after an error. Takes the inode lock.
//
// Walks the transient span list, freeing each span whose block-writing
// requests have completed. If a span still has requests in flight we stop:
// the request completion will re-run the flusher, which calls back into
// this function and resumes the cleanup. Only when the list is empty do we
// signal done_flushing.
static void eggsfs_transient_error_cleanup(struct eggsfs_inode* enode) {
    struct eggsfs_inode_file* file = &enode->file;

    if (atomic_read(&enode->file.transient_err) == 0) {
        eggsfs_warn_print("Error cleanup function is running, but the transient file is not errored. This should not be happening.");
        return;
    }

    // We need to take the exclusive lock to ensure nobody else is writing to the last span.
    // We could even release it immediately afterwards, since afterwards everything else
    // will exit early since `enode->file.transient_err` is non-zero.
    inode_lock(&enode->inode);

    bool finished = false;
    struct eggsfs_transient_span* span;
    for (;;) {
        // Pick out first span (which is also the oldest, but doesn't matter anyway).
        // Note: both `break`s below exit with the spinlock held; it is
        // released right after the loop.
        spin_lock(&file->transient_spans_lock);
        span = list_first_entry_or_null(&file->transient_spans, struct eggsfs_transient_span, list);
        if (span == NULL) {
            // we've cleared all the spans
            finished = true;
            break;
        }
        if (eggsfs_block_writing_finished(span) == 0) {
            // The requests need to complete first, they rely on the pages still being there
            // (to read from them). We currently do not interrupt the requests mid-writing.
            // Note that we return here even if we aren't done cleaning up, but `eggsfs_flush_transient_span`
            // will be called back into, which will in turn call this function, continuing the cleanup.
            eggsfs_debug_print("requests not done yet, cleanup not finished");
            break;
        }
        eggsfs_block_writing_put_reqs(span);
        // We can destroy this span.
        list_del(&span->list);
        spin_unlock(&file->transient_spans_lock);
        eggsfs_free_transient_span(file, span);
    }
    spin_unlock(&file->transient_spans_lock);

    inode_unlock(&enode->inode);

    if (finished) {
        eggsfs_debug_print("finished cleaning up, we're done flushing");
        up(&file->done_flushing);
    } else {
        eggsfs_debug_print("not finished cleaning up");
    }
}
|
||||
|
||||
// Work-queue entry point that flushes transient spans for one file, oldest
// first. Each pass claims the head span (IDLE/LAST -> FLUSHING*), then:
//   * empty spans complete immediately;
//   * inline spans are sent to the shard synchronously;
//   * block spans go through two visits: the first pads/arranges pages into
//     per-block chains, computes parity and CRCs, and initiates the block
//     writes (the work is then re-queued by request completions); a later
//     visit, once all writes finished, certifies the span with the proofs.
// On any error the file is flipped into the errored state and cleanup runs.
void eggsfs_flush_transient_spans(struct work_struct* work) {
    struct eggsfs_inode_file* file = container_of(work, struct eggsfs_inode_file, flusher);
    struct eggsfs_inode* enode = container_of(file, struct eggsfs_inode, file);

    eggsfs_debug_print("ino=%lu", enode->inode.i_ino);

    // Errored file: this invocation only continues the teardown.
    if (atomic_read(&enode->file.transient_err) != 0) {
        eggsfs_transient_error_cleanup(enode);
        return;
    }

again:
    // Claim the oldest span, if it is ready to be flushed.
    spin_lock(&file->transient_spans_lock);
    struct eggsfs_transient_span* span = list_first_entry_or_null(&file->transient_spans, struct eggsfs_transient_span, list);
    bool good_to_go = false;
    if (span) {
        if (span->status == EGGSFS_SPAN_STATUS_IDLE) {
            good_to_go = true;
            span->status = EGGSFS_SPAN_STATUS_FLUSHING;
        }
        if (span->status == EGGSFS_SPAN_STATUS_LAST) {
            good_to_go = true;
            span->status = EGGSFS_SPAN_STATUS_FLUSHING_LAST;
        }
    }
    spin_unlock(&file->transient_spans_lock);

    if (!good_to_go) {
        eggsfs_debug_print("oldest span is not idle, terminating");
        return;
    }

    int err;
    u32* cell_crcs = NULL;
    bool done = false;
    if (span->storage_class == EGGSFS_EMPTY_STORAGE) {
        done = true;
    } else if (span->storage_class == EGGSFS_INLINE_STORAGE) {
        // this is an easy one, just add the inline span
        struct page* page = list_first_entry_or_null(&span->pages, struct page, lru);
        if (page == NULL) {
            err = -ENOMEM;
            goto error;
        }
        char* data = kmap(page);
        eggsfs_debug_print("adding inline span of length %d", span->size);
        err = eggsfs_error_to_linux(eggsfs_shard_add_inline_span(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, enode->file.cookie,
            atomic64_read(&enode->file.flushed_so_far), span->size, data, span->size
        ));
        kunmap(page);
        if (err) { goto error; }
        done = true;
    } else {
        int D = eggsfs_data_blocks(span->parity);
        int B = eggsfs_blocks(span->parity);
        int S = span->stripes;
        eggsfs_debug_print("size=%d, D=%d, B=%d, S=%d, status=%d", span->size, D, B, S, span->status);
        int i;
        if (span->block_reqs[0] != NULL) {
            // If we have already the sockets/requests, it being callback'd here means
            // that one of the request has finished. Let's check whether they all have.
            int reqs_status = eggsfs_block_writing_finished(span);
            if (reqs_status == 1) { // finished
                // Collect ids and proofs, then certify the span.
                u64 block_ids[EGGSFS_MAX_BLOCKS];
                u64 block_proofs[EGGSFS_MAX_BLOCKS];
                for (i = 0; i < B; i++) {
                    struct eggsfs_write_block_request* req = span->block_reqs[i];
                    block_ids[i] = req->block_id;
                    block_proofs[i] = be64_to_cpu(req->written_resp.data.proof);
                }
                eggsfs_block_writing_put_reqs(span); // we got our proof
                err = eggsfs_error_to_linux(eggsfs_shard_add_span_certify(
                    (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                    enode->inode.i_ino, file->cookie, atomic64_read(&file->flushed_so_far), span->parity,
                    block_ids, block_proofs
                ));
                if (err) { goto error; }
                done = true;
            } else if (reqs_status < 0) { // errored
                eggsfs_block_writing_put_reqs(span);
                err = reqs_status;
                goto error;
            }
            // reqs_status == 0: still in flight; fall through with done ==
            // false, restoring the span status below.
        } else {
            // This means that it's the first time we see this span to flush.
            trace_eggsfs_span_flush_enter(
                enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, span->parity, span->stripes, span->storage_class
            );
            // Below, remember that cell size is always a multiple of PAGE_SIZE.
            {
                // Pad with zero pages so the page list covers D whole blocks.
                int data_size;
                for (data_size = span->block_size * D; data_size > span->size; data_size -= PAGE_SIZE) {
                    // This can happen if we have trailing zeros because of how we arrange the
                    // stripes.
                    struct page* zpage = eggsfs_alloc_write_page(file);
                    if (zpage == NULL) {
                        err = -ENOMEM;
                        goto error;
                    }
                    list_add_tail(&zpage->lru, &span->pages);
                }
            }
            struct page* curr_pages[EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY]; // buffer we use at various points
            memset(curr_pages, 0, sizeof(curr_pages));
            int cell_size = span->block_size / S;
            // First, we arrange the data blocks in singly linked lists in `span->blocks`
            // (chained through page->private), distributing pages round-robin per cell.
            {
                int cell_offset = 0;
                int block = 0;
                struct page* page;
                list_for_each_entry(page, &span->pages, lru) {
                    page->private = 0;
                    if (curr_pages[block] == NULL) { // first
                        span->blocks[block] = page;
                    } else {
                        curr_pages[block]->private = (unsigned long)page;
                    }
                    curr_pages[block] = page;
                    cell_offset += PAGE_SIZE;
                    if (cell_offset >= cell_size) {
                        block++;
                        block = block % D;
                        cell_offset = 0;
                    }
                }
            }
            // Then, we compute the parity blocks.
            // We also compute the cell CRCs at this stage.
            cell_crcs = kzalloc(sizeof(uint32_t)*B*span->stripes, GFP_KERNEL);
            if (cell_crcs == NULL) { err = -ENOMEM; goto error; }
            u32 span_crc = 0;
            {
                char* pages_bufs[EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY];
                memset(curr_pages, 0, sizeof(curr_pages));
                // NOTE(review): kernel_fpu_begin disables preemption, yet
                // the loop below calls eggsfs_alloc_write_page (GFP_KERNEL,
                // may sleep) and kmap — this looks unsafe; confirm.
                kernel_fpu_begin();
                // traverse each block, page by page, keeping track of which cell we're in.
                u32 block_offset;
                for (block_offset = 0; block_offset < span->block_size; block_offset += PAGE_SIZE) {
                    for (i = 0; i < D; i++) { // grab next data page for each block
                        if (block_offset == 0) { // first page
                            curr_pages[i] = span->blocks[i];
                        } else {
                            curr_pages[i] = (struct page*)curr_pages[i]->private;
                        }
                    }
                    for (i = D; i < B; i++) { // allocate parity pages
                        struct page* ppage = eggsfs_alloc_write_page(file);
                        if (ppage == NULL) { err = -ENOMEM; goto kernel_fpu_error; }
                        list_add_tail(&ppage->lru, &span->pages);
                        if (curr_pages[i] == NULL) {
                            span->blocks[i] = ppage;
                        } else {
                            curr_pages[i]->private = (unsigned long)ppage;
                        }
                        curr_pages[i] = ppage;
                    }
                    // get buffer pointers
                    int s = block_offset / cell_size;
                    for (i = 0; i < B; i++) {
                        pages_bufs[i] = kmap(curr_pages[i]);
                    }
                    // compute parity
                    err = eggsfs_compute_parity(span->parity, PAGE_SIZE, (const char**)&pages_bufs[0], &pages_bufs[D]);
                    if (err) { goto kernel_fpu_error; }
                    // compute CRCs
                    for (i = 0; i < B; i++) {
                        cell_crcs[s*B + i] = eggsfs_crc32c(cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
                    }
                    for (i = 0; i < B; i++) {
                        kunmap(curr_pages[i]);
                    }
                }
                // Compute overall span crc: data cells in stripe order, then
                // strip the trailing zero padding we added above.
                int s;
                for (s = 0; s < S; s++) {
                    for (i = 0; i < D; i++) {
                        span_crc = eggsfs_crc32c_append(span_crc, cell_crcs[s*B + i], cell_size);
                    }
                }
                span_crc = eggsfs_crc32c_zero_extend(span_crc, (int)span->size - (int)(span->block_size*D));
                kernel_fpu_end();
            }
            // Now we need to init the span (the callback will actually send the requests)
            // TODO would be better to have this async, and free up the queue for other stuff.
            {
                struct initiate_ctx ctx = {
                    .file = file,
                    .span = span,
                    .cell_crcs = cell_crcs,
                };
                err = eggsfs_error_to_linux(eggsfs_shard_add_span_initiate(
                    (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                    &ctx, enode->inode.i_ino, file->cookie, atomic64_read(&file->flushed_so_far), span->size,
                    span_crc, span->storage_class, span->parity, span->stripes, cell_size, cell_crcs
                ));
                if (err) { goto error; }
            }
            kfree(cell_crcs);
            // Now we wait.
            eggsfs_debug_print("sent all block writing requests");
        }
    }

    if (done) {
        // Free everything
        spin_lock(&file->transient_spans_lock);
        list_del(&span->list);
        spin_unlock(&file->transient_spans_lock);
        // safe to do this without locks since there's only ever one
        // work queue working on any given span
        trace_eggsfs_span_flush_exit(enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, 0);
        eggsfs_debug_print("finished span at offset %llu", atomic64_read(&file->flushed_so_far));
        atomic64_add(span->size, &enode->file.flushed_so_far);
        bool all_done = span->status == EGGSFS_SPAN_STATUS_FLUSHING_LAST;
        eggsfs_free_transient_span(file, span);
        up(&file->in_flight_spans);
        if (all_done) {
            eggsfs_debug_print("finished");
            up(&file->done_flushing);
        } else {
            eggsfs_debug_print("going to next span");
            goto again;
        }
    } else {
        eggsfs_debug_print("span not done");
        // restore status so that we can keep working on this
        spin_lock(&file->transient_spans_lock);
        if (span->status == EGGSFS_SPAN_STATUS_FLUSHING) {
            span->status = EGGSFS_SPAN_STATUS_IDLE;
        } else if (span->status == EGGSFS_SPAN_STATUS_FLUSHING_LAST) {
            span->status = EGGSFS_SPAN_STATUS_LAST;
        } else {
            BUG();
        }
        spin_unlock(&file->transient_spans_lock);
    }

    return;

kernel_fpu_error:
    kernel_fpu_end();
error:
    eggsfs_info_print("transient span flushing failed with err=%d", err);
    trace_eggsfs_span_flush_exit(enode->inode.i_ino, atomic64_read(&file->flushed_so_far), span->size, err);
    atomic_set(&enode->file.transient_err, err);
    kfree(cell_crcs);
    eggsfs_transient_error_cleanup(enode);
}
|
||||
|
||||
// Runs in the transient_spans_lock spinlock.
//
// Fills in span->size/storage_class/block_size/stripes/parity for a span of
// `span_size` bytes, according to the configured span/block policies:
// spans under 256 bytes are stored inline (or empty), everything else is
// striped across Reed-Solomon blocks with page-aligned cells.
static void eggsfs_compute_spans_parameters(
    struct eggsfs_block_policies* block_policies,
    struct eggsfs_span_policies* span_policies,
    u32 target_stripe_size,
    struct eggsfs_transient_span* span,
    u32 span_size
) {
    eggsfs_debug_print("span_size=%u", span_size);

    span->size = span_size;

    // Easy case: inline span (or empty, can happen for empty files)
    if (span_size < 256) {
        span->storage_class = span_size > 0 ? EGGSFS_INLINE_STORAGE : EGGSFS_EMPTY_STORAGE;
        span->block_size = 0;
        span->stripes = 0;
        span->parity = 0;
        return;
    }

    // Pick parity: walk the policies from the second-largest down and take
    // the first policy whose predecessor's max_size is exceeded.
    int num_span_policies = eggsfs_span_policies_len(span_policies);
    BUG_ON(num_span_policies == 0);
    int i;
    u8 parity = 0;
    for (i = num_span_policies - 2; i >= 0; i--) {
        u32 max_size;
        u8 this_parity;
        eggsfs_span_policies_get(span_policies, i, &max_size, &this_parity);
        if (span_size > max_size) {
            i++;
            eggsfs_span_policies_get(span_policies, i, &max_size, &parity);
            break;
        }
    }
    // Nothing exceeded: the smallest policy applies.
    if (parity == 0) {
        i = 0;
        u32 max_size;
        eggsfs_span_policies_get(span_policies, i, &max_size, &parity);
        BUG_ON(span_size > max_size);
    }

    // For simplicity, we have all cells to be a multiple of PAGE_SIZE, which means that all
    // blocks/stripes/spans are multiples of PAGE_SIZE. We might also have some extra pages
    // at the end to have things to line up correctly. This should only happens for big spans
    // anyway (small spans will just use mirroring).
    int S;
    if (eggsfs_data_blocks(parity) == 1) {
        // If we only have one data block, things are also pretty simple (just mirroring).
        // It doesn't make sense to have stripes in this case.
        S = 1;
    } else {
        // Otherwise compute striping etc.
        // We use target_stripe_size as an upper bound, for now (i.e. the stripe will always be smaller
        // than TargetStripeSize)
        S = min(max((u32)1, span_size / target_stripe_size), (u32)15);
    }
    int D = eggsfs_data_blocks(parity);
    int block_size = (span_size + D - 1) / D;
    int cell_size = (block_size + S - 1) / S;
    // Round up cell to page size (PAGE_ALIGN idiom).
    cell_size = (cell_size + PAGE_SIZE - 1) & PAGE_MASK;
    block_size = cell_size * S;

    // Pick storage class: smallest-indexed policy whose min_size the block
    // size exceeds, scanning from the largest down.
    BUG_ON(span_size > EGGSFS_MAX_BLOCK_SIZE);
    int num_block_policies = eggsfs_block_policies_len(block_policies);
    BUG_ON(num_block_policies == 0);
    u8 storage_class;
    for (i = num_block_policies-1; i >= 0; i--) {
        u32 min_size;
        eggsfs_block_policies_get(block_policies, i, &storage_class, &min_size);
        if (block_size > min_size) {
            break;
        }
    }

    span->storage_class = storage_class;
    span->block_size = block_size;
    span->stripes = S;
    span->parity = parity;
}
|
||||
|
||||
static ssize_t eggsfs_file_write_iter(struct kiocb* iocb, struct iov_iter* from) {
|
||||
int err;
|
||||
if (iocb->ki_flags & IOCB_DIRECT) { return -ENOSYS; }
|
||||
|
||||
struct file* file = iocb->ki_filp;
|
||||
struct inode* inode = file->f_inode;
|
||||
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
||||
loff_t* ppos = &iocb->ki_pos;
|
||||
loff_t ppos_before = *ppos;
|
||||
|
||||
eggsfs_debug_print("enode=%p, ino=%lu, count=%lu, size=%lld, *ppos=%lld", enode, inode->i_ino, iov_iter_count(from), enode->inode.i_size, *ppos);
|
||||
|
||||
if (!inode_trylock(inode)) {
|
||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
return -EAGAIN;
|
||||
}
|
||||
inode_lock(inode);
|
||||
}
|
||||
|
||||
// still a transient file
|
||||
if (!(enode->flags & EGGSFS_INODE_TRANSIENT)) { err = -EPERM; goto out_err; }
|
||||
|
||||
// permanent error
|
||||
err = atomic_read(&enode->file.transient_err);
|
||||
if (err) { goto out_err; }
|
||||
|
||||
// we're writing at the right offset
|
||||
if (*ppos != enode->inode.i_size) {
|
||||
err = -EINVAL;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
// if it's a zero write, then we exit early
|
||||
if (unlikely(!iov_iter_count(from))) { goto out; }
|
||||
|
||||
// Precompute the bound for spans
|
||||
u32 max_span_size;
|
||||
{
|
||||
u8 p;
|
||||
eggsfs_span_policies_last(&enode->span_policies, &max_span_size, &p);
|
||||
}
|
||||
BUG_ON(max_span_size%PAGE_SIZE != 0); // needed for "new span" logic paired with page copying below, we could avoid it
|
||||
|
||||
// get the span we're currently writing at
|
||||
spin_lock(&enode->file.transient_spans_lock);
|
||||
BUG_ON(list_empty(&enode->file.transient_spans));
|
||||
struct eggsfs_transient_span* span = list_last_entry(&enode->file.transient_spans, struct eggsfs_transient_span, list);
|
||||
int status = span->status;
|
||||
spin_unlock(&enode->file.transient_spans_lock);
|
||||
|
||||
if (status != EGGSFS_SPAN_STATUS_WRITING) { err = -EPERM; goto out_err; } // we've already declared this file done, we're just flushing it
|
||||
|
||||
// We now start writing into the span, as much as we can anyway
|
||||
BUG_ON(!iov_iter_count(from)); // we rely on making progress for the switch to next spage below (page->index == 0)
|
||||
while (true) {
|
||||
struct page* page = list_empty(&span->pages) ? NULL : list_last_entry(&span->pages, struct page, lru);
|
||||
if (page == NULL || page->index == 0) { // we're the first ones to get here, or we need to switch to the next one
|
||||
BUG_ON(page != NULL && page->index > PAGE_SIZE);
|
||||
page = eggsfs_alloc_write_page(&enode->file);
|
||||
if (!page) {
|
||||
err = -ENOMEM;
|
||||
goto out_err; // we haven't corrupted anything, so no need to make it permanent
|
||||
}
|
||||
page->index = 0;
|
||||
list_add_tail(&page->lru, &span->pages);
|
||||
}
|
||||
|
||||
int ret = copy_page_from_iter(page, page->index, PAGE_SIZE - page->index, from);
|
||||
if (ret < 0) { err = ret; goto out_err_permanent; }
|
||||
eggsfs_debug_print("written %d to page %p", ret, page);
|
||||
enode->inode.i_size += ret;
|
||||
page->index = (page->index + ret) % PAGE_SIZE;
|
||||
*ppos += ret;
|
||||
|
||||
u32 current_span_size = enode->inode.i_size - enode->file.size_without_current_span;
|
||||
if (current_span_size >= max_span_size) { // we need to switch to the next span
|
||||
BUG_ON(current_span_size > max_span_size);
|
||||
if (down_trylock(&enode->file.in_flight_spans) == 1) {
|
||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
err = -EAGAIN;
|
||||
goto out_err;
|
||||
}
|
||||
int err = down_killable(&enode->file.in_flight_spans);
|
||||
if (err < 0) { goto out_err; }
|
||||
}
|
||||
spin_lock(&enode->file.transient_spans_lock);
|
||||
span->status = EGGSFS_SPAN_STATUS_IDLE;
|
||||
eggsfs_compute_spans_parameters(
|
||||
&enode->block_policies, &enode->span_policies, enode->target_stripe_size,
|
||||
span, current_span_size
|
||||
);
|
||||
struct eggsfs_transient_span* new_span = eggsfs_add_new_span(&enode->file.transient_spans);
|
||||
spin_unlock(&enode->file.transient_spans_lock);
|
||||
enode->file.size_without_current_span += current_span_size;
|
||||
if (IS_ERR(new_span)) {
|
||||
err = PTR_ERR(new_span);
|
||||
// This is permanent otherwise we'd break the invariant of always having a WRITING span at
|
||||
// the end.
|
||||
goto out_err_permanent;
|
||||
}
|
||||
trace_eggsfs_span_add(enode->inode.i_ino, *ppos);
|
||||
queue_work(eggsfs_wq, &enode->file.flusher);
|
||||
span = new_span;
|
||||
}
|
||||
|
||||
if (!iov_iter_count(from)) { break; }
|
||||
}
|
||||
|
||||
int written;
|
||||
out:
|
||||
written = *ppos - ppos_before;
|
||||
inode_unlock(inode);
|
||||
|
||||
eggsfs_debug_print("written=%d", written);
|
||||
|
||||
return written;
|
||||
|
||||
out_err_permanent:
|
||||
atomic_set(&enode->file.transient_err, err);
|
||||
// we need to free the yet-to-be-written spans
|
||||
queue_work(eggsfs_wq, &enode->file.flusher);
|
||||
|
||||
out_err:
|
||||
inode_unlock(inode);
|
||||
eggsfs_info_print("err=%d", err);
|
||||
if (err == -EAGAIN && *ppos > ppos_before) {
|
||||
goto out; // we can just return what we've already written
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
// Prepare a transient file for its final flush. Return values:
// * 0: skip (not the right owner)
// * 1: proceed
// * -n: error
__must_check static int eggsfs_file_flush_init(struct eggsfs_inode* enode) {
    int res;

    inode_lock(&enode->inode);

    // Not a transient file, there's nothing to do, files are immutable
    if (!(enode->flags & EGGSFS_INODE_TRANSIENT)) { res = 0; goto out; }
    // We are in another process, skip
    if (enode->file.owner != current->group_leader) { res = 0; goto out; }

    // Mark the last span as the last one, and queue work
    spin_lock(&enode->file.transient_spans_lock);
    BUG_ON(list_empty(&enode->file.transient_spans));
    struct eggsfs_transient_span* span = list_last_entry(&enode->file.transient_spans, struct eggsfs_transient_span, list);
    // we might not be the first ones here -- every other operation is fine to do twice anyway.
    bool needs_queueing = false;
    if (span->status == EGGSFS_SPAN_STATUS_WRITING) {
        needs_queueing = true;
        u32 current_span_size = enode->inode.i_size - enode->file.size_without_current_span;
        // LAST (not IDLE): tells the flusher no more spans will follow.
        span->status = EGGSFS_SPAN_STATUS_LAST;
        eggsfs_debug_print("finishing with %u", current_span_size);
        eggsfs_compute_spans_parameters(
            &enode->block_policies, &enode->span_policies, enode->target_stripe_size,
            span, current_span_size
        );
        enode->file.size_without_current_span += current_span_size;
    }
    spin_unlock(&enode->file.transient_spans_lock);

    inode_unlock(&enode->inode);

    // Queue outside the locks; work is only queued by the thread that won the
    // WRITING->LAST transition above.
    if (needs_queueing) {
        queue_work(eggsfs_wq, &enode->file.flusher);
    }
    return 1;

out:
    inode_unlock(&enode->inode);
    return res;
}
|
||||
|
||||
// .flush handler: when the owning process closes a transient file, wait for
// all spans to hit the block services, then link the file into its directory
// (making it permanent).
static int eggsfs_file_flush(struct file* filp, fl_owner_t id) { // can we get write while this is in progress?
    struct eggsfs_inode* enode = EGGSFS_I(filp->f_inode);
    struct dentry* dentry = filp->f_path.dentry;

    int res = eggsfs_file_flush_init(enode);
    if (res == 0) { return 0; }
    if (res < 0) { return res; }

    // Now we need to wait for things to be done
    eggsfs_debug_print("waiting for flush");
    res = down_killable(&enode->file.done_flushing);
    if (res < 0) { return res; }
    up(&enode->file.done_flushing); // allow others to wait on it again
    eggsfs_debug_print("flush done");

    // If flushing recorded a permanent error, surface it instead of linking.
    res = atomic_read(&enode->file.transient_err);
    if (res < 0) { return res; }

    inode_lock(&enode->inode);
    // somebody might have gotten here before us
    if (enode->flags & EGGSFS_INODE_TRANSIENT) {
        res = eggsfs_error_to_linux(eggsfs_shard_link_file(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino,
            enode->file.cookie, dentry->d_parent->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len,
            &enode->edge_creation_time
        ));
        if (res == 0) {
            // Linked: the file is no longer transient.
            enode->flags &= ~EGGSFS_INODE_TRANSIENT;
        }
    }
    inode_unlock(&enode->inode);

    return res;
}
|
||||
|
||||
// Read path: walk the file span by span, copying either the inline body or
// the cached span pages into the iterator. Zero-fills the tail of a span past
// its stored data (RS padding).
static ssize_t eggsfs_file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
    struct file* file = iocb->ki_filp;
    struct inode* inode = file->f_inode;
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    loff_t *ppos = &iocb->ki_pos;
    ssize_t written = 0;

    if (unlikely(iocb->ki_flags & IOCB_DIRECT)) { return -ENOSYS; }

    eggsfs_debug_print("start of read loop, *ppos=%llu", *ppos);
    while (*ppos < inode->i_size && iov_iter_count(to)) {
        struct eggsfs_span* span = eggsfs_get_span(enode, *ppos);
        if (IS_ERR(span)) { written = PTR_ERR(span); goto out; }
        if (span == NULL) { goto out; }
        if (span->start%PAGE_SIZE != 0) {
            eggsfs_warn_print("span start is not a multiple of page size %llu", span->start);
            written = -EIO;
            goto out;
        }

        eggsfs_debug_print("span start=%llu end=%llu", span->start, span->end);

        u64 span_offset = *ppos - span->start;
        u64 span_size = span->end - span->start;

        if (span->storage_class == EGGSFS_INLINE_STORAGE) {
            struct eggsfs_inline_span* inline_span = EGGSFS_INLINE_SPAN(span);
            size_t to_copy = span_size - span_offset;
            eggsfs_debug_print("(inline) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
            // Body bytes first, then zero-fill up to the logical span size.
            // NOTE(review): if span_offset can exceed inline_span->len, the
            // first subtraction underflows (size_t) — confirm the invariant
            // span_offset <= len holds for inline spans.
            size_t copied =
                copy_to_iter(inline_span->body + span_offset, inline_span->len - span_offset, to) +
                iov_iter_zero(span_size - inline_span->len, to); // trailing zeros
            if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
            written += copied;
            *ppos += copied;
        } else {
            struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
            if (block_span->cell_size%PAGE_SIZE != 0) {
                // NOTE(review): this returns directly instead of the
                // `written = -EIO; goto out;` pattern used above — any bytes
                // already copied in this call are dropped.
                eggsfs_warn_print("cell size not multiple of page size %u", block_span->cell_size);
                return -EIO;
            }

            u32 page_ix = span_offset/PAGE_SIZE;
            // how much we should read (e.g. exclude trailing zeros)
            u64 span_data_end = min(
                span->end,
                span->start + (u64)block_span->cell_size*eggsfs_data_blocks(block_span->parity)*block_span->stripes
            );
            eggsfs_debug_print("span_data_end=%llu", span_data_end);
            while (*ppos < span_data_end && iov_iter_count(to)) {
                struct page* page = eggsfs_get_span_page(block_span, page_ix);
                if (IS_ERR(page)) { written = PTR_ERR(page); goto out; }
                // Copy at most up to the next page boundary or the end of data.
                size_t to_copy = min((u64)PAGE_SIZE - (*ppos % PAGE_SIZE), span_data_end - *ppos);
                eggsfs_debug_print("(block) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
                size_t copied = copy_page_to_iter(page, *ppos % PAGE_SIZE, to_copy, to);
                eggsfs_debug_print("copied=%lu, remaining %lu", copied, iov_iter_count(to));
                if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
                written += copied;
                *ppos += copied;
                page_ix++;
            }
            // trailing zeros
            if (iov_iter_count(to)) {
                size_t to_copy = span->end - span_data_end;
                size_t copied = iov_iter_zero(to_copy, to);
                eggsfs_debug_print("(block zeroes) copying %lu, have %lu remaining", to_copy, iov_iter_count(to));
                if (copied < to_copy && iov_iter_count(to)) { written = -EFAULT; goto out; }
                written += copied;
                *ppos += copied;
            }
        }
        eggsfs_debug_print("before loop end, remaining %lu", iov_iter_count(to));
    }
out:
    eggsfs_debug_print("out of the loop, written=%ld", written);
    if (written < 0) {
        eggsfs_debug_print("reading failed, err=%ld", written);
    }
    return written;
}
|
||||
|
||||
// VFS file operations for regular eggsfs files: buffered reads via the span
// cache, append-only writes into transient spans, and flush-on-close which
// links the transient file into its directory.
const struct file_operations eggsfs_filesimple_operations = {
    .open = eggsfs_file_open,
    .read_iter = eggsfs_file_read_iter,
    .write_iter = eggsfs_file_write_iter,
    .flush = eggsfs_file_flush,
    .llseek = generic_file_llseek,
};
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
#ifndef _EGGSFS_FILESIMPLE_H
#define _EGGSFS_FILESIMPLE_H

#include <linux/list.h>
#include <linux/workqueue.h>

// Appends a fresh WRITING transient span to `spans`.
// Must be called with `spans_lock` taken.
struct eggsfs_transient_span* eggsfs_add_new_span(struct list_head* spans);

// Work-queue entry point (wired up via `file.flusher`) that writes out
// sealed transient spans.
void eggsfs_flush_transient_spans(struct work_struct* work);

// Per-block callback for the shard's "add span" initiation: receives the two
// candidate endpoints plus block service id, block id and write certificate.
int eggsfs_shard_add_span_initiate_block_cb(
    void* data, int block, u32 ip1, u16 port1, u32 ip2, u16 port2, u64 block_service_id, u64 block_id, u64 certificate
);

// file_operations for regular files, defined in the corresponding .c file.
extern const struct file_operations eggsfs_filesimple_operations;

#endif
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../cpp/rs/gf_tables.c
|
||||
@@ -0,0 +1,9 @@
|
||||
#cloud-config
|
||||
hostname: uovo
|
||||
users:
|
||||
- name: fmazzol
|
||||
ssh-authorized-keys:
|
||||
- ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE93TJCd7qWh/kHzaBoCSU9VgOKSWlUls9OkiBNCjG3B francesco.mazzoli@xtxmarkets.com
|
||||
sudo: ['ALL=(ALL) NOPASSWD:ALL']
|
||||
groups: sudo
|
||||
shell: /bin/bash
|
||||
+301
@@ -0,0 +1,301 @@
|
||||
#include "inode.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "dir.h"
|
||||
#include "metadata.h"
|
||||
#include "namei.h"
|
||||
#include "trace.h"
|
||||
#include "err.h"
|
||||
#include "file.h"
|
||||
|
||||
static struct kmem_cache* eggsfs_inode_cachep;
|
||||
|
||||
// super_operations.alloc_inode: allocate an eggsfs inode from the slab cache
// and initialize the getattr-caching fields. Returns NULL on allocation
// failure (the VFS treats that as -ENOMEM).
struct inode* eggsfs_inode_alloc(struct super_block* sb) {
    struct eggsfs_inode* enode;

    eggsfs_debug_print("sb=%p", sb);

    enode = (struct eggsfs_inode*)kmem_cache_alloc(eggsfs_inode_cachep, GFP_NOFS);
    if (!enode) {
        eggsfs_debug_print("err=ENOMEM");
        return NULL;
    }

    // mtime/mtime_expiry of 0 means "stale": the first getattr will fetch.
    enode->mtime = 0;
    enode->mtime_expiry = 0;
    enode->edge_creation_time = 0;
    eggsfs_latch_init(&enode->getattr_update_latch);

    eggsfs_debug_print("done enode=%p", enode);
    return &enode->inode;
}
|
||||
|
||||
// super_operations.evict_inode: tear down per-inode state. Directories drop
// their dirent page cache; regular files fail any in-flight transient writes
// and wait for the flusher to finish before the inode memory goes away.
void eggsfs_inode_evict(struct inode* inode) {
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    eggsfs_debug_print("enode=%p", enode);
    if (S_ISDIR(inode->i_mode)) {
        eggsfs_dir_drop_cache(enode);
    } else if (S_ISREG(inode->i_mode)) {
        // error out in flight stuff (only if no error is already recorded)
        atomic_cmpxchg(&enode->file.transient_err, 0, -EINTR);
        // wait for all writing operations to be done (we might be cleaning up)
        down(&enode->file.done_flushing);
        up(&enode->file.done_flushing);
    }
    truncate_inode_pages(&inode->i_data, 0);
    clear_inode(inode);
}
|
||||
|
||||
void eggsfs_inode_free(struct inode* inode) {
|
||||
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
||||
eggsfs_debug_print("enode=%p", enode);
|
||||
kmem_cache_free(eggsfs_inode_cachep, enode);
|
||||
}
|
||||
|
||||
static void eggsfs_inode_init_once(void* ptr) {
|
||||
struct eggsfs_inode* enode = (struct eggsfs_inode*)ptr;
|
||||
|
||||
inode_init_once(&enode->inode);
|
||||
}
|
||||
|
||||
// Module init: create the inode slab cache. Returns -ENOMEM on failure.
int __init eggsfs_inode_init(void) {
    eggsfs_inode_cachep = kmem_cache_create(
        "eggsfs_inode_cache", sizeof(struct eggsfs_inode),
        0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, eggsfs_inode_init_once
    );
    if (!eggsfs_inode_cachep) { return -ENOMEM; }

    return 0;
}
|
||||
|
||||
// Module exit: destroy the inode cache. rcu_barrier() must run first so any
// RCU-deferred inode frees complete before the cache is destroyed.
void __cold eggsfs_inode_exit(void) {
    rcu_barrier();
    kmem_cache_destroy(eggsfs_inode_cachep);
}
|
||||
|
||||
|
||||
// Refresh cached attributes (mtime, size, directory policies) from the shard,
// with a latch ensuring only one task fetches at a time: the winner performs
// the RPC, losers wait on the latch and retry only if the winner failed.
int eggsfs_do_getattr(struct eggsfs_inode* enode) {
    int err;
    // NOTE(review): seqno is passed unseated to the latch macros/helpers —
    // presumably eggsfs_latch_try_acquire is a macro writing into it; confirm.
    s64 seqno;

again: // progress: whoever wins the lock won't try again
    eggsfs_debug_print("enode=%p id=0x%016lx mtime=%lld mtime_expiry=%lld", enode, enode->inode.i_ino, enode->mtime, enode->mtime_expiry);

    if (eggsfs_latch_try_acquire(&enode->getattr_update_latch, seqno)) {
        u64 ts = get_jiffies_64();
        // Someone else may have refreshed while we raced for the latch.
        if (smp_load_acquire(&enode->mtime_expiry) > ts) { err = 0; goto out; }

        u64 mtime;
        u64 expiry;
        if (S_ISDIR(enode->inode.i_mode)) {
            struct eggsfs_block_policies block_policies;
            struct eggsfs_span_policies span_policies;
            u32 target_stripe_size;
            err = eggsfs_shard_getattr_dir(
                (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                enode->inode.i_ino,
                &mtime,
                &block_policies,
                &span_policies,
                &target_stripe_size
            );
            if (err == 0) {
                // Zero-valued results mean "not set": keep the old values.
                if (block_policies.len) {
                    memcpy(&enode->block_policies, &block_policies, sizeof(block_policies));
                }
                if (span_policies.len) {
                    memcpy(&enode->span_policies, &span_policies, sizeof(span_policies));
                }
                if (target_stripe_size) {
                    enode->target_stripe_size = target_stripe_size;
                }
                expiry = ts + eggsfs_dir_refresh_time;
            }
        } else {
            u64 size;
            err = eggsfs_shard_getattr_file(
                (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info,
                enode->inode.i_ino,
                &mtime,
                &size
            );
            if (err == 0) {
                enode->inode.i_size = size;
                // Files are immutable once linked: cache forever.
                expiry = ~(uint64_t)0;
            }
        }
        if (err) { err = eggsfs_error_to_linux(err); goto out; }
        else {
            WRITE_ONCE(enode->mtime, mtime);
            // eggsfs time is ns since 2020-01-01 (1577836800s after the epoch).
            enode->inode.i_mtime.tv_sec = mtime / 1000000000 + 1577836800;
            enode->inode.i_mtime.tv_nsec = mtime % 1000000000;
        }

        // Release-publish the expiry last so readers that see it also see the
        // fields written above.
        smp_store_release(&enode->mtime_expiry, expiry);

    out:
        WRITE_ONCE(enode->getattr_err, err);
        eggsfs_latch_release(&enode->getattr_update_latch, seqno);
        eggsfs_debug_print("out mtime=%llu mtime_expiry=%llu", enode->mtime, enode->mtime_expiry);
        return err;
    } else {
        // Lost the race: wait for the winner, retry only if they failed.
        err = eggsfs_latch_wait_killable(&enode->getattr_update_latch, seqno);
        if (err) { return err; }
        if (READ_ONCE(enode->getattr_err)) { goto again; }
        return 0;
    }
}
|
||||
|
||||
// inode_operations.create: create a transient file on the shard, materialize
// its inode, and initialize all the transient-write machinery.
//
// Fix vs. the original: `err` is already a Linux errno after the
// eggsfs_error_to_linux() wrapping of eggsfs_shard_create_file, but the error
// path converted it AGAIN (`return eggsfs_error_to_linux(err)`), mangling the
// returned code. It now returns the once-converted value.
static int eggsfs_create(struct inode* parent, struct dentry* dentry, umode_t mode, bool excl) {
    int err;

    BUG_ON(dentry->d_inode);

    struct eggsfs_inode* parent_enode = EGGSFS_I(parent);

    if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
        err = -ENAMETOOLONG;
        goto out_err;
    }

    eggsfs_debug_print("creating %*s", dentry->d_name.len, dentry->d_name.name);

    u64 ino, cookie;
    err = eggsfs_error_to_linux(eggsfs_shard_create_file(
        (struct eggsfs_fs_info*)parent->i_sb->s_fs_info, eggsfs_inode_shard(parent->i_ino),
        EGGSFS_INODE_FILE, dentry->d_name.name, dentry->d_name.len, &ino, &cookie
    ));
    if (err) { goto out_err; } // already a Linux errno — do not convert twice
    // make this dentry stick around?

    struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino); // new_inode
    if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_err; }
    struct eggsfs_inode* enode = EGGSFS_I(inode);
    INIT_LIST_HEAD(&enode->file.transient_spans);
    struct eggsfs_transient_span* span = eggsfs_add_new_span(&enode->file.transient_spans);
    // NOTE(review): on this failure the inode reference from eggsfs_get_inode
    // looks leaked (no iput) — confirm and fix separately if so.
    if (IS_ERR(span)) { err = PTR_ERR(span); goto out_err; }
    enode->flags = EGGSFS_INODE_WRITEABLE;
    enode->file.cookie = cookie;
    enode->file.size_without_current_span = 0;
    spin_lock_init(&enode->file.transient_spans_lock);
    // Inherit the parent directory's storage policies.
    memcpy(&enode->block_policies, &parent_enode->block_policies, sizeof(parent_enode->block_policies));
    memcpy(&enode->span_policies, &parent_enode->span_policies, sizeof(parent_enode->span_policies));
    enode->target_stripe_size = parent_enode->target_stripe_size;
    atomic_set(&enode->file.transient_err, 0);
    // The creating process is the one that will link the file on close.
    enode->file.owner = current->group_leader;
    INIT_WORK(&enode->file.flusher, eggsfs_flush_transient_spans);
    sema_init(&enode->file.done_flushing, 0);
    atomic64_set(&enode->file.flushed_so_far, 0);
    sema_init(&enode->file.in_flight_spans, 2); // fairly arbitrary, ~300MiB of memory max

    d_instantiate(dentry, inode);
    d_invalidate(dentry);

    eggsfs_debug_print("size=%lld", inode->i_size);

    return 0;

out_err:
    return err;
}
|
||||
|
||||
// vfs: refcount of path->dentry
// inode_operations.getattr: serve from cached attributes when fresh; refresh
// from the shard only when the cache expired AND the caller actually asked
// for a field we fetch (mtime, or size for files).
static int eggsfs_getattr(const struct path* path, struct kstat* stat, u32 request_mask, unsigned int query_flags) {
    struct inode* inode = d_inode(path->dentry);
    struct eggsfs_inode* enode = EGGSFS_I(inode);

    trace_eggsfs_vfs_getattr_enter(inode);

    // >= so that eggsfs_dir_refresh_time=0 causes revalidation at every call to this function
    if (get_jiffies_64() >= smp_load_acquire(&enode->mtime_expiry)) {
        int err;

        // FIXME: symlinks
        if (!(request_mask & STATX_MTIME) && (S_ISDIR(inode->i_mode) || !(request_mask & STATX_SIZE))) {
            goto done;
        }

        trace_eggsfs_vfs_getattr_lock(inode);

        // dentry refcount also protects the inode (e.g. d_delete will not turn used dentry into a negative one),
        // so no need to grab anything before we start waiting for stuff
        err = eggsfs_do_getattr(enode);
        if (err) {
            trace_eggsfs_vfs_getattr_exit(inode, err);
            return err;
        }
    }

done:
    generic_fillattr(inode, stat);
    trace_eggsfs_vfs_getattr_exit(inode, 0);
    return 0;
}
|
||||
|
||||
// inode_operations for directories.
static const struct inode_operations eggsfs_dir_inode_ops = {
    .create = eggsfs_create,
    .lookup = eggsfs_lookup,
    .unlink = eggsfs_unlink,
    .mkdir = eggsfs_mkdir,
    .rmdir = eggsfs_rmdir,
    .rename = eggsfs_rename,
    .getattr = eggsfs_getattr,
};

// inode_operations for regular files (immutable once linked: getattr only).
static const struct inode_operations eggsfs_file_inode_ops = {
    .getattr = eggsfs_getattr,
};

// Defined in the directory code.
extern struct file_operations eggsfs_dir_operations;
|
||||
|
||||
// Look up (or create) the in-core inode for `ino`. The inode type (dir, file,
// symlink) is encoded in the high bits of the inode number, so a fresh inode
// can be fully set up without contacting the shard.
struct inode* eggsfs_get_inode(struct super_block* sb, u64 ino) {
    trace_eggsfs_get_inode_enter(ino);

    struct inode* inode = iget_locked(sb, ino); // new_inode when possible?
    if (!inode) {
        trace_eggsfs_get_inode_exit(ino, NULL, false, 0);
        return ERR_PTR(-ENOMEM);
    }
    struct eggsfs_inode* enode = EGGSFS_I(inode);

    bool new = inode->i_state & I_NEW;
    if (new) {
        // Mode comes from the type bits baked into the inode number.
        switch (eggsfs_inode_type(ino)) {
        case EGGSFS_INODE_DIRECTORY: inode->i_mode = S_IFDIR; break;
        case EGGSFS_INODE_FILE: inode->i_mode = S_IFREG; break;
        case EGGSFS_INODE_SYMLINK: inode->i_mode = S_IFLNK; break;
        default: BUG();
        }
        inode->i_mode |= S_ISDIR(inode->i_mode) ? 0777 : 0666;
        inode->i_blocks = 0;
        inode->i_size = 0;

        inode->i_op = NULL;
        inode->i_fop = NULL;
        if (S_ISDIR(inode->i_mode)) {
            inode->i_op = &eggsfs_dir_inode_ops;
            inode->i_fop = &eggsfs_dir_operations;

            rcu_assign_pointer(enode->dir.pages, NULL);
            eggsfs_latch_init(&enode->dir.pages_latch);
        } else if (S_ISREG(inode->i_mode)) {
            inode->i_op = &eggsfs_file_inode_ops;
            inode->i_fop = &eggsfs_filesimple_operations;

            // init normal file stuff
            enode->file.spans = RB_ROOT;
            seqcount_init(&enode->file.spans_seqcount);
            mutex_init(&enode->file.spans_wlock);
        }

        // FIXME
        inode->i_uid = make_kuid(&init_user_ns, 1000);//65534);
        inode->i_gid = make_kgid(&init_user_ns, 1000);//65534);

        unlock_new_inode(inode);
    }

    trace_eggsfs_get_inode_exit(ino, inode, new, 0);
    return inode;
}
|
||||
|
||||
+203
@@ -0,0 +1,203 @@
|
||||
#ifndef _EGGSFS_INODE_H
|
||||
#define _EGGSFS_INODE_H
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/xarray.h>
|
||||
|
||||
#include "block.h"
|
||||
#include "latch.h"
|
||||
#include "bincode.h"
|
||||
|
||||
#define EGGSFS_ROOT_INODE 0x2000000000000000ull
|
||||
|
||||
struct eggsfs_file_span;
|
||||
|
||||
// When we create a file, it is writeable. When we start writing to it, it's transient.
|
||||
// In any case, with any of these two flags, we have the `transient` case of the union.
|
||||
#define EGGSFS_INODE_WRITEABLE 1
|
||||
#define EGGSFS_INODE_TRANSIENT 2
|
||||
|
||||
// These two influence the size of eggsfs_inode, currently
|
||||
#define EGGSFS_MAX_BLOCK_POLICIES 2 // one for flash, one for hdd
|
||||
#define EGGSFS_MAX_SPAN_POLICIES 3 // we happen to have three for now
|
||||
|
||||
// Packed little-endian block-policy table as received from the shard.
// `len` is the total byte length of `body` in use, i.e. a multiple of
// EGGSFS_BLOCK_POLICY_ENTRY_SIZE; each entry is (u8 storage_class, u32 le min_size).
struct eggsfs_block_policies {
    u8 len;
    char body[EGGSFS_MAX_BLOCK_POLICIES*EGGSFS_BLOCK_POLICY_ENTRY_SIZE];
};

// Number of entries in the table (len is in bytes).
static inline int eggsfs_block_policies_len(struct eggsfs_block_policies* policies) {
    BUG_ON(policies->len == 0);
    return (int)policies->len / EGGSFS_BLOCK_POLICY_ENTRY_SIZE;
}

// Decode entry `ix` into (storage_class, min_size).
static inline void eggsfs_block_policies_get(struct eggsfs_block_policies* policies, int ix, u8* storage_class, u32* min_size) {
    BUG_ON(ix < 0 || ix >= EGGSFS_MAX_BLOCK_POLICIES);
    char* b = policies->body + ix*EGGSFS_BLOCK_POLICY_ENTRY_SIZE;
    *storage_class = *(u8*)b; b += 1;
    *min_size = get_unaligned_le32(b); b += 4;
}
|
||||
|
||||
// Packed little-endian span-policy table; each entry is
// (u32 le max_size, u8 parity). `len` is the used byte length of `body`.
struct eggsfs_span_policies {
    u8 len;
    char body[EGGSFS_MAX_SPAN_POLICIES*EGGSFS_SPAN_POLICY_ENTRY_SIZE];
};

// Number of entries in the table (len is in bytes).
static inline int eggsfs_span_policies_len(struct eggsfs_span_policies* policies) {
    BUG_ON(policies->len == 0);
    return (int)policies->len / EGGSFS_SPAN_POLICY_ENTRY_SIZE;
}
|
||||
|
||||
// Decode entry `ix` of a SPAN policy table into (max_size, parity).
// Fix: the stride was computed with EGGSFS_BLOCK_POLICY_ENTRY_SIZE — the
// wrong table's entry size. Both happen to encode u32+u8 today, so it worked
// by coincidence, but it breaks silently if either layout changes.
static inline void eggsfs_span_policies_get(struct eggsfs_span_policies* policies, int ix, u32* max_size, u8* parity) {
    BUG_ON(ix < 0 || ix >= EGGSFS_MAX_SPAN_POLICIES);
    char* b = policies->body + ix*EGGSFS_SPAN_POLICY_ENTRY_SIZE;
    *max_size = get_unaligned_le32(b); b += 4;
    *parity = *(u8*)b; b += 1;
}
|
||||
|
||||
// Decode the final (largest max_size) span policy entry.
static inline void eggsfs_span_policies_last(struct eggsfs_span_policies* policies, u32* max_size, u8* parity) {
    int last_ix = eggsfs_span_policies_len(policies) - 1;
    eggsfs_span_policies_get(policies, last_ix, max_size, parity);
}
|
||||
|
||||
#define EGGSFS_SPAN_STATUS_WRITING 0 // we're writing to this span
|
||||
#define EGGSFS_SPAN_STATUS_IDLE 1 // we're waiting to flush this out
|
||||
#define EGGSFS_SPAN_STATUS_LAST 2 // like idle, but signals that there won't be other spans after this one
|
||||
#define EGGSFS_SPAN_STATUS_FLUSHING 3 // we're flushing this to the block services
|
||||
#define EGGSFS_SPAN_STATUS_FLUSHING_LAST 4 // we're flushing this to the block services, and it's the last span
|
||||
|
||||
#define EGGSFS_MAX_DATA 10
|
||||
#define EGGSFS_MAX_PARITY 4
|
||||
#define EGGSFS_MAX_BLOCKS (EGGSFS_MAX_DATA+EGGSFS_MAX_PARITY)
|
||||
|
||||
// One span of a file that is still being written (transient). Lives on
// eggsfs_inode_file.transient_spans; `status` drives the WRITING -> IDLE/LAST
// -> FLUSHING[_LAST] lifecycle (see EGGSFS_SPAN_STATUS above).
struct eggsfs_transient_span {
    struct list_head list;
    // Linear list of pages with the body of the span, used when we're still gathering
    // content for it. We use `index` in the page to track where we're writing to.
    // We also add the parity block pages here, just to have all the pages available in
    // one place.
    struct list_head pages;
    // Used by the workers which write out to the block services. They first arrange
    // the pages here and compute the parity blocks. Then they can just write them out.
    //
    // We use `->private` to link these together, so that `pages` list can still contain
    // all of them, which is handy in a couple of places.
    //
    // In this stage we use `index` to track where we're reading from the head page,
    // and keep popping the head page as we write them out.
    struct page* blocks[EGGSFS_MAX_BLOCKS];
    struct eggsfs_block_socket* block_sockets[EGGSFS_MAX_BLOCKS];
    struct eggsfs_write_block_request* block_reqs[EGGSFS_MAX_BLOCKS];
    // These are decided by whoever moves the span from WRITING to IDLE/LAST
    // (via eggsfs_compute_spans_parameters).
    u32 size; // the logical size of the span, might be smaller than actual if we added leading zeros because of RS
    u32 block_size;
    u8 stripes;
    u8 storage_class;
    u8 parity;
    // See EGGSFS_SPAN_STATUS above.
    u8 status;
};
|
||||
|
||||
// Per-inode state for regular files: the read-side span cache plus the
// transient (write-side) machinery, which is only valid while the file is
// being created.
struct eggsfs_inode_file {
    // Normal file stuff
    struct rb_root spans;
    seqcount_t spans_seqcount; // to read in a lockless way
    struct mutex spans_wlock; // to ensure there's only one thing fetching the spans when we can't find the span

    // Transient stuff. Only initialized on file creation, otherwise it's garbage.
    // Could be factored out to separate data structure.
    //
    // List of spans, FIFO. There's always a "WRITING/LAST" span at the end,
    // and there's never a WRITING span which could be IDLE.
    struct list_head transient_spans;
    // Used to synchronize between things adding and removing to the list of spans.
    spinlock_t transient_spans_lock;
    // Worker which flushes the spans
    struct work_struct flusher;
    // To know when we're done flushing
    struct semaphore done_flushing;
    // If we've encountered an error such that we want to stop.
    atomic_t transient_err;
    // How many bytes we've flushed so far
    atomic64_t flushed_so_far;
    // Used to bound the number of in-flight spans (otherwise we
    // would eat quite a bit of memory for big files)
    struct semaphore in_flight_spans;
    // Shard-issued capability for operating on the transient file.
    u64 cookie;
    // We use this to track where we should close the file from.
    struct task_struct* owner;
    // Size _apart from current span_.
    u64 size_without_current_span;
};
|
||||
|
||||
|
||||
// Per-inode state for directories: an RCU-protected linked list of pages
// caching the directory entries.
struct eggsfs_inode_dir {
    // In `struct page`, we use:
    //
    // ->mapping to store the next page
    // ->private to store the number of entries stored in the page (high 32 bits),
    //      and a reference count with how many people need the cache (low 32 bits).
    //      The reference count is only stored in the "head" page.
    // ->rcu_head to synchronize between modifications to the reference count.
    struct page __rcu * pages;
    struct eggsfs_latch pages_latch;
};
|
||||
|
||||
struct eggsfs_inode {
|
||||
struct inode inode;
|
||||
|
||||
// We cache things based on the eggsfs mtime, but we need to decide when the
|
||||
// mtime itself is stale, which we do using `mtime_expiry`.
|
||||
u64 mtime; // in eggsfs time
|
||||
u64 mtime_expiry; // in jiffies
|
||||
|
||||
u64 edge_creation_time; // in eggsfs time, used for operations (re)moving the edge
|
||||
|
||||
// These are relevant for directoriese (obviously), but we also use them for transient
|
||||
// files when we create them, since we need it in many places.
|
||||
// TODO consider refreshing it for directories, right now we never do.
|
||||
struct eggsfs_block_policies block_policies;
|
||||
struct eggsfs_span_policies span_policies;
|
||||
u32 target_stripe_size;
|
||||
|
||||
union {
|
||||
struct eggsfs_inode_file file;
|
||||
struct eggsfs_inode_dir dir;
|
||||
};
|
||||
|
||||
struct eggsfs_latch getattr_update_latch;
|
||||
int getattr_err;
|
||||
|
||||
int flags;
|
||||
};
|
||||
|
||||
#define EGGSFS_I(ptr) container_of(ptr, struct eggsfs_inode, inode)
|
||||
#define EGGSFS_INODE_NEED_GETATTR 1
|
||||
|
||||
#define EGGSFS_INODE_DIRECTORY 1
|
||||
#define EGGSFS_INODE_FILE 2
|
||||
#define EGGSFS_INODE_SYMLINK 3
|
||||
|
||||
static inline u64 eggsfs_inode_type(u64 ino) {
|
||||
return (ino >> 61) & 0x03;
|
||||
}
|
||||
|
||||
static inline u32 eggsfs_inode_shard(u64 ino) {
|
||||
return ino & 0xff;
|
||||
}
|
||||
|
||||
struct inode* eggsfs_get_inode(struct super_block* sb, u64 ino);
|
||||
|
||||
|
||||
// super ops
|
||||
struct inode* eggsfs_inode_alloc(struct super_block* sb);
|
||||
void eggsfs_inode_evict(struct inode* inode);
|
||||
void eggsfs_inode_free(struct inode* inode);
|
||||
|
||||
int __init eggsfs_inode_init(void);
|
||||
void __cold eggsfs_inode_exit(void);
|
||||
|
||||
// inode ops
|
||||
int eggsfs_do_getattr(struct eggsfs_inode* enode);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,44 @@
|
||||
#ifndef _EGGS_INTRSHIMS_H
|
||||
#define _EGGS_INTRSHIMS_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#define _mm_crc32_u64(C, D) __builtin_ia32_crc32di(C, D)
|
||||
#define _mm_crc32_u32(C, D) __builtin_ia32_crc32si(C, D)
|
||||
#define _mm_crc32_u8(C, D) __builtin_ia32_crc32qi(C, D)
|
||||
|
||||
typedef long long __m128i __attribute__((vector_size(16)));
|
||||
typedef unsigned long long __m128i_u __attribute__((vector_size(16)));
|
||||
|
||||
typedef char __v16qi __attribute__((vector_size(16)));
|
||||
typedef short __v8hi __attribute__((vector_size(16)));
|
||||
typedef int __v4si __attribute__((vector_size(16)));
|
||||
typedef long long __v2di __attribute__((vector_size(16)));
|
||||
|
||||
#define _mm_loadu_si128(p) (*(p))
|
||||
#define _mm_setr_epi32(i0, i1, i2, i3) ((__m128i)(__v4si){i0,i1,i2,i3})
|
||||
#define _mm_clmulepi64_si128(X, Y, I) ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (char)(I)))
|
||||
#define _mm_xor_si128(a, b) ((a) ^ (b))
|
||||
#define _mm_cvtsi32_si128(a) ((__m128i)(__v4si){a,0,0,0})
|
||||
#define _mm_cvtsi128_si64(a) ((a)[0])
|
||||
#define _mm_extract_epi64(X, N) ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
|
||||
|
||||
typedef long long __m256i __attribute__((vector_size(32)));
|
||||
typedef unsigned long long __m256i_u __attribute__((vector_size(32)));
|
||||
|
||||
typedef char __v32qi __attribute__((vector_size(32)));
|
||||
typedef short __v16hi __attribute__((vector_size(32)));
|
||||
typedef int __v8si __attribute__((vector_size(32)));
|
||||
typedef long long __v4di __attribute__((vector_size(32)));
|
||||
|
||||
#define _mm256_and_si256(a, b) ((a) & (b))
|
||||
#define _mm256_xor_si256(a, b) ((a) ^ (b))
|
||||
#define _mm256_permute2x128_si256(a, b, i) ((__m256i)__builtin_ia32_permti256((a), (b), (i)))
|
||||
#define _mm256_srli_epi16(a, b) ((__m256i)__builtin_ia32_psrlwi256((__v16hi)(a), (b)))
|
||||
#define _mm256_shuffle_epi8(a, b) ((__m256i)__builtin_ia32_pshufb256((__v32qi)(a), (__v32qi)(b)))
|
||||
#define _mm256_setzero_si256() ((__m256i)(__v4di){0,0,0,0})
|
||||
#define _mm256_loadu_si256(p) (*(p))
|
||||
#define _mm256_storeu_si256(p, a) (*(p) = a)
|
||||
#define _mm256_gf2p8mul_epi8(a, b) ((__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)(a), (__v32qi)(b)))
|
||||
|
||||
#endif
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../cpp/crc32c/iscsi.h
|
||||
+76
@@ -0,0 +1,76 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/version.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "inode.h"
|
||||
#include "metadata.h"
|
||||
#include "block.h"
|
||||
#include "super.h"
|
||||
#include "sysctl.h"
|
||||
#include "sysfs.h"
|
||||
#include "block.h"
|
||||
#include "rs.h"
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
struct workqueue_struct* eggsfs_wq;
|
||||
|
||||
static int __init eggsfs_init(void) {
|
||||
int err;
|
||||
|
||||
eggsfs_debug_print("initializing module");
|
||||
|
||||
eggsfs_shard_init();
|
||||
|
||||
eggsfs_wq = alloc_workqueue("eggsfs-wq", 0, 0);
|
||||
if (!eggsfs_wq) { return -ENOMEM; }
|
||||
|
||||
err = eggsfs_sysfs_init();
|
||||
if (err) { goto out_sysfs; }
|
||||
|
||||
err = eggsfs_sysctl_init();
|
||||
if (err) { goto out_sysctl; }
|
||||
|
||||
err = eggsfs_blocksimple_init();
|
||||
if (err) { goto out_blocksimple; }
|
||||
|
||||
err = eggsfs_inode_init();
|
||||
if (err) { goto out_inode; }
|
||||
|
||||
err = eggsfs_fs_init();
|
||||
if (err) { goto out_fs; }
|
||||
|
||||
err = eggsfs_rs_init();
|
||||
if (err) { goto out_rs; }
|
||||
|
||||
return 0;
|
||||
|
||||
out_rs:
|
||||
eggsfs_fs_exit();
|
||||
out_fs:
|
||||
eggsfs_inode_exit();
|
||||
out_inode:
|
||||
eggsfs_blocksimple_exit();
|
||||
out_blocksimple:
|
||||
eggsfs_sysctl_exit();
|
||||
out_sysctl:
|
||||
eggsfs_sysfs_exit();
|
||||
out_sysfs:
|
||||
destroy_workqueue(eggsfs_wq);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void __exit eggsfs_exit(void) {
|
||||
eggsfs_fs_exit();
|
||||
eggsfs_inode_exit();
|
||||
eggsfs_blocksimple_exit();
|
||||
eggsfs_sysctl_exit();
|
||||
eggsfs_sysfs_exit();
|
||||
destroy_workqueue(eggsfs_wq);
|
||||
}
|
||||
|
||||
module_init(eggsfs_init);
|
||||
module_exit(eggsfs_exit);
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
#ifndef _EGGSFS_LATCH_H
|
||||
#define _EGGSFS_LATCH_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
struct eggsfs_latch {
|
||||
atomic64_t counter;
|
||||
wait_queue_head_t wq;
|
||||
};
|
||||
|
||||
#define eggsfs_latch_init(_latch) ({ \
|
||||
atomic64_set(&(_latch)->counter, 0); \
|
||||
init_waitqueue_head(&(_latch)->wq); \
|
||||
})
|
||||
|
||||
#define eggsfs_latch_try_acquire(_latch, _seqno) ({ \
|
||||
(_seqno) = atomic64_fetch_or(1, &(_latch)->counter); \
|
||||
smp_mb__after_atomic(); \
|
||||
!(_seqno & 1); \
|
||||
})
|
||||
|
||||
#define eggsfs_latch_release(_latch, _seqno) ({ \
|
||||
smp_mb__before_atomic(); \
|
||||
atomic64_set(&(_latch)->counter, (_seqno) + 2); \
|
||||
wake_up_all(&(_latch)->wq); \
|
||||
})
|
||||
|
||||
#define eggsfs_latch_wait_killable(_latch, _seqno) ({ \
|
||||
u64 goal = ((_seqno) | 1); \
|
||||
wait_event_killable((_latch)->wq, atomic64_read(&(_latch)->counter) > goal); \
|
||||
})
|
||||
|
||||
#endif
|
||||
|
||||
+1040
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,76 @@
|
||||
#ifndef _EGGSFS_SHARD_H
|
||||
#define _EGGSFS_SHARD_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "net.h"
|
||||
#include "super.h"
|
||||
#include "inode.h"
|
||||
|
||||
int eggsfs_shard_lookup(struct eggsfs_fs_info* info, u64 dir, const char* name, int name_len, u64* ino, u64* creation_time);
|
||||
int eggsfs_shard_readdir(struct eggsfs_fs_info* info, u64 dir, u64 start_pos, void* data, u64* next_hash);
|
||||
int eggsfs_shard_unlink_file(struct eggsfs_fs_info* info, u64 dir, u64 file, const char* name, int name_len, u64 creation_time);
|
||||
int eggsfs_shard_rename(struct eggsfs_fs_info* info, u64 dir, u64 target_id, const char* old_name, int old_name_len, u64 old_creation_time, const char* new_name, int new_name_len, u64* new_creation_time);
|
||||
int eggsfs_shard_link_file(struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 dir, const char* name, int name_len, u64* creation_time);
|
||||
int eggsfs_shard_getattr_file(struct eggsfs_fs_info* info, u64 file, u64* mtime, u64* size);
|
||||
int eggsfs_shard_getattr_dir(
|
||||
struct eggsfs_fs_info* info,
|
||||
u64 file,
|
||||
u64* mtime,
|
||||
struct eggsfs_block_policies* block_policies,
|
||||
struct eggsfs_span_policies* span_policies,
|
||||
u32* target_stripe_size
|
||||
);
|
||||
int eggsfs_shard_getattr(struct eggsfs_fs_info* info, u64 id);
|
||||
int eggsfs_shard_create_file(struct eggsfs_fs_info* info, u8 shid, int itype, const char* name, int name_len, u64* ino, u64* cookie);
|
||||
int eggsfs_shard_file_spans(struct eggsfs_fs_info* info, u64 file, u64 offset, void* data);
|
||||
int eggsfs_shard_add_inline_span(struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 offset, u32 size, const char* data, u8 len);
|
||||
int eggsfs_shard_add_span_initiate(
|
||||
struct eggsfs_fs_info* info, void* data, u64 file, u64 cookie, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity,
|
||||
u8 stripes, u32 cell_size, u32* cell_crcs
|
||||
);
|
||||
int eggsfs_shard_add_span_certify(
|
||||
struct eggsfs_fs_info* info, u64 file, u64 cookie, u64 offset, u8 parity, const u64* block_ids, const u64* block_proofs
|
||||
);
|
||||
|
||||
struct eggsfs_shard_add_span_initiate_new_block {
|
||||
u64 size;
|
||||
u32 crc32;
|
||||
};
|
||||
struct eggsfs_shard_add_span_initiate_async_request {
|
||||
struct eggsfs_shard_async_request request;
|
||||
char data[EGGSFS_UDP_MTU];
|
||||
};
|
||||
void eggsfs_shard_add_span_initiate_async(struct eggsfs_fs_info* info, struct eggsfs_shard_add_span_initiate_async_request* request, u64 file, u64 cookie, u64 byte_offset, u8 storage_class, u8 parity, u32 crc32, u64 size, struct eggsfs_shard_add_span_initiate_new_block* blocks, u16 n_blocks);
|
||||
|
||||
struct eggsfs_shard_add_span_certify_block_proof {
|
||||
u64 block_id;
|
||||
u64 proof;
|
||||
};
|
||||
struct eggsfs_shard_add_span_certify_async_request {
|
||||
struct eggsfs_shard_async_request request;
|
||||
char data[EGGSFS_UDP_MTU];
|
||||
};
|
||||
void eggsfs_shard_add_span_certify_async(struct eggsfs_fs_info* info, struct eggsfs_shard_add_span_certify_async_request* request, u64 file, u64 cookie, u64 offset, struct eggsfs_shard_add_span_certify_block_proof* proofs, u16 n_blocks);
|
||||
|
||||
int eggsfs_cdc_mkdir(struct eggsfs_fs_info* info, u64 dir, const char* name, int name_len, u64* ino, u64* creation_time);
|
||||
int eggsfs_cdc_rmdir(struct eggsfs_fs_info* info, u64 owner_dir, u64 target, u64 creation_time, const char* name, int name_len);
|
||||
|
||||
int eggsfs_cdc_rename_directory(
|
||||
struct eggsfs_fs_info* info,
|
||||
u64 target, u64 old_parent, u64 new_parent,
|
||||
const char* old_name, int old_name_len, u64 old_creation_time,
|
||||
const char* new_name, int new_name_len,
|
||||
u64* new_creation_time
|
||||
);
|
||||
int eggsfs_cdc_rename_file(
|
||||
struct eggsfs_fs_info* info,
|
||||
u64 target, u64 old_parent, u64 new_parent,
|
||||
const char* old_name, int old_name_len, u64 old_creation_time,
|
||||
const char* new_name, int new_name_len,
|
||||
u64* new_creation_time
|
||||
);
|
||||
|
||||
void __init eggsfs_shard_init(void);
|
||||
|
||||
#endif
|
||||
+320
@@ -0,0 +1,320 @@
|
||||
#include "namei.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "dir.h"
|
||||
#include "err.h"
|
||||
#include "inode.h"
|
||||
#include "metadata.h"
|
||||
#include "trace.h"
|
||||
|
||||
EGGSFS_DEFINE_COUNTER(eggsfs_stat_dir_revalidations);
|
||||
|
||||
static int eggsfs_d_revalidate(struct dentry* dentry, unsigned int flags) {
|
||||
struct dentry* parent;
|
||||
struct inode* dir;
|
||||
int ret;
|
||||
|
||||
eggsfs_debug_print("dentry=%pd flags=%x[rcu=%d]", dentry, flags, !!(flags & LOOKUP_RCU));
|
||||
|
||||
if (IS_ROOT(dentry)) { return 1; }
|
||||
|
||||
if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
|
||||
return -ENAMETOOLONG;
|
||||
}
|
||||
|
||||
if (flags & LOOKUP_RCU) {
|
||||
parent = READ_ONCE(dentry->d_parent);
|
||||
dir = d_inode_rcu(parent);
|
||||
if (!dir) { return -ECHILD; }
|
||||
ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
|
||||
if (parent != READ_ONCE(dentry->d_parent)) { return -ECHILD; }
|
||||
return ret;
|
||||
} else {
|
||||
parent = dget_parent(dentry);
|
||||
dir = d_inode(parent);
|
||||
ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
|
||||
if (ret == -ECHILD) {
|
||||
eggsfs_counter_inc(eggsfs_stat_dir_revalidations);
|
||||
ret = eggsfs_dir_revalidate(EGGSFS_I(dir));
|
||||
if (ret) { goto out; }
|
||||
ret = eggsfs_dir_needs_reval(EGGSFS_I(dir), dentry);
|
||||
if (ret == -ECHILD) { ret = 1; }
|
||||
}
|
||||
out:
|
||||
dput(parent);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct dentry_operations eggsfs_dentry_ops = {
|
||||
.d_revalidate = eggsfs_d_revalidate,
|
||||
};
|
||||
|
||||
|
||||
static void eggsfs_dcache_handle_error(struct inode* dir, struct dentry* dentry, int err) {
|
||||
eggsfs_debug_print("err=%d", err);
|
||||
switch (err) {
|
||||
case EGGSFS_ERR_CANNOT_OVERRIDE_NAME:
|
||||
WARN_ON(dentry->d_inode);
|
||||
trace_eggsfs_dcache_invalidate_neg_entry(dir, dentry);
|
||||
WRITE_ONCE(EGGSFS_I(dir)->mtime_expiry, 0);
|
||||
d_invalidate(dentry);
|
||||
break;
|
||||
// TODO is this correct? verify, I've changed the code from an earlier error
|
||||
// which does not exist anymore.
|
||||
case EGGSFS_ERR_DIRECTORY_NOT_FOUND:
|
||||
WARN_ON(dentry->d_inode);
|
||||
trace_eggsfs_dcache_delete_inode(dir);
|
||||
WRITE_ONCE(EGGSFS_I(dir)->mtime_expiry, 0);
|
||||
clear_nlink(dir);
|
||||
d_invalidate(dentry->d_parent);
|
||||
break;
|
||||
case EGGSFS_ERR_DIRECTORY_NOT_EMPTY:
|
||||
WARN_ON(!dentry->d_inode);
|
||||
if (dentry->d_inode) {
|
||||
trace_eggsfs_dcache_invalidate_dir(dentry->d_inode);
|
||||
WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
|
||||
}
|
||||
break;
|
||||
case EGGSFS_ERR_NAME_NOT_FOUND:
|
||||
WARN_ON(!dentry->d_inode);
|
||||
if (dentry->d_inode) {
|
||||
trace_eggsfs_dcache_delete_entry(dir, dentry, dentry->d_inode);
|
||||
WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
|
||||
clear_nlink(dentry->d_inode);
|
||||
d_invalidate(dentry);
|
||||
}
|
||||
break;
|
||||
case EGGSFS_ERR_MISMATCHING_TARGET:
|
||||
WARN_ON(!dentry->d_inode);
|
||||
if (dentry->d_inode) {
|
||||
trace_eggsfs_dcache_invalidate_entry(dir, dentry, dentry->d_inode);
|
||||
WRITE_ONCE(EGGSFS_I(dentry->d_inode)->mtime_expiry, 0);
|
||||
d_invalidate(dentry);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// vfs: shared dir.i_rwsem
|
||||
struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags) {
|
||||
eggsfs_debug_print("dir=0x%016lx, name=%*pE", dir->i_ino, dentry->d_name.len, dentry->d_name.name);
|
||||
int err;
|
||||
|
||||
trace_eggsfs_vfs_lookup_enter(dir, dentry);
|
||||
|
||||
if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
|
||||
err = -ENAMETOOLONG;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
u64 ino, creation_time;
|
||||
// need to keep the eggs error around for the EGGSFS_ERR_NAME_NOT_FOUND
|
||||
err = eggsfs_shard_lookup(
|
||||
dir->i_sb->s_fs_info, dir->i_ino, dentry->d_name.name, dentry->d_name.len, &ino, &creation_time
|
||||
);
|
||||
if (unlikely(err < 0)) {
|
||||
goto out_err;
|
||||
}
|
||||
if (err) { // not unlikely since vfs will do lookups for entries likely to not exist
|
||||
if (err == EGGSFS_ERR_NAME_NOT_FOUND) {
|
||||
dentry->d_time = EGGSFS_I(dir)->mtime;
|
||||
trace_eggsfs_vfs_lookup_exit(dir, dentry, NULL, 0);
|
||||
return d_splice_alias(NULL, dentry);
|
||||
}
|
||||
err = eggsfs_error_to_linux(err);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino);
|
||||
if (IS_ERR(inode)) {
|
||||
err = PTR_ERR(inode);
|
||||
goto out_err;
|
||||
}
|
||||
inode->i_ino = ino;
|
||||
EGGSFS_I(inode)->edge_creation_time = creation_time;
|
||||
|
||||
dentry->d_time = EGGSFS_I(dir)->mtime;
|
||||
|
||||
trace_eggsfs_vfs_lookup_exit(dir, dentry, inode, 0);
|
||||
return d_splice_alias(inode, dentry);
|
||||
|
||||
out_err:
|
||||
trace_eggsfs_vfs_lookup_exit(dir, dentry, NULL, err);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
// vfs: exclusive dir.i_rwsem, lockref dentry
|
||||
int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode) {
|
||||
int err;
|
||||
|
||||
struct eggsfs_inode* dir_enode = EGGSFS_I(dir);
|
||||
|
||||
trace_eggsfs_vfs_mkdir_enter(dir, dentry);
|
||||
BUG_ON(dentry->d_inode);
|
||||
|
||||
if (dentry->d_name.len > EGGSFS_MAX_FILENAME) {
|
||||
err = -ENAMETOOLONG;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
u64 ino, creation_time;
|
||||
err = eggsfs_cdc_mkdir(
|
||||
dir->i_sb->s_fs_info,
|
||||
dir->i_ino,
|
||||
dentry->d_name.name,
|
||||
dentry->d_name.len,
|
||||
&ino,
|
||||
&creation_time
|
||||
);
|
||||
if (unlikely(err)) {
|
||||
if (err < 0) { goto out_err; }
|
||||
eggsfs_dcache_handle_error(dir, dentry, err);
|
||||
err = eggsfs_error_to_linux(err);
|
||||
goto out_err;
|
||||
}
|
||||
eggsfs_dir_drop_cache(EGGSFS_I(dir));
|
||||
|
||||
struct inode* inode = eggsfs_get_inode(dentry->d_sb, ino);
|
||||
if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_err; }
|
||||
struct eggsfs_inode* enode = EGGSFS_I(inode);
|
||||
enode->edge_creation_time = creation_time;
|
||||
memcpy(&enode->block_policies, &dir_enode->block_policies, sizeof(enode->block_policies));
|
||||
memcpy(&enode->span_policies, &dir_enode->span_policies, sizeof(enode->span_policies));
|
||||
enode->target_stripe_size = dir_enode->target_stripe_size;
|
||||
dentry->d_time = dir_enode->mtime;
|
||||
d_instantiate(dentry, inode);
|
||||
|
||||
trace_eggsfs_vfs_mkdir_exit(dir, dentry, inode, 0);
|
||||
return 0;
|
||||
|
||||
out_err:
|
||||
trace_eggsfs_vfs_mkdir_exit(dir, dentry, NULL, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
// vfs: exclusive dir,d_entry->d_inode.i_rwsem
|
||||
int eggsfs_rmdir(struct inode* dir, struct dentry* dentry) {
|
||||
int err;
|
||||
|
||||
trace_eggsfs_vfs_rmdir_enter(dir, dentry, dentry->d_inode);
|
||||
BUG_ON(!dentry->d_inode);
|
||||
|
||||
if (unlikely(dentry->d_name.len > EGGSFS_MAX_FILENAME)) {
|
||||
pr_err("eggsfs: eggsfs_rmdir dir=%lu dentry=%pd exceed max filename length %d\n",
|
||||
dir->i_ino, dentry, EGGSFS_MAX_FILENAME);
|
||||
err = -ENAMETOOLONG;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
err = eggsfs_cdc_rmdir((struct eggsfs_fs_info*)dir->i_sb->s_fs_info, dir->i_ino, dentry->d_inode->i_ino, EGGSFS_I(dentry->d_inode)->edge_creation_time, dentry->d_name.name, dentry->d_name.len);
|
||||
if (unlikely(err)) {
|
||||
if (err < 0) { goto out_err; }
|
||||
eggsfs_dcache_handle_error(dir, dentry, err);
|
||||
err = eggsfs_error_to_linux(err);
|
||||
goto out_err;
|
||||
} else {
|
||||
eggsfs_dir_drop_cache(EGGSFS_I(dir));
|
||||
clear_nlink(dentry->d_inode);
|
||||
}
|
||||
|
||||
out_err:
|
||||
trace_eggsfs_vfs_rmdir_exit(dir, dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
// vfs: exclusive dir,d_entry->d_inode.i_rwsem
|
||||
int eggsfs_unlink(struct inode* dir, struct dentry* dentry) {
|
||||
int err;
|
||||
|
||||
trace_eggsfs_vfs_unlink_enter(dir, dentry, dentry->d_inode);
|
||||
BUG_ON(!dentry->d_inode);
|
||||
|
||||
if (unlikely(dentry->d_name.len > EGGSFS_MAX_FILENAME)) {
|
||||
pr_err("eggsfs: eggsfs_unlink dir=%lu dentry=%pd exceed max filename length %d\n",
|
||||
dir->i_ino, dentry, EGGSFS_MAX_FILENAME);
|
||||
err = -ENAMETOOLONG;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
struct eggsfs_inode* enode = EGGSFS_I(dentry->d_inode);
|
||||
err = eggsfs_shard_unlink_file((struct eggsfs_fs_info*)dir->i_sb->s_fs_info, dir->i_ino, dentry->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len, enode->edge_creation_time);
|
||||
if (unlikely(err)) {
|
||||
if (err < 0) { goto out_err; }
|
||||
eggsfs_dcache_handle_error(dir, dentry, err);
|
||||
err = eggsfs_error_to_linux(err);
|
||||
goto out_err;
|
||||
} else {
|
||||
eggsfs_dir_drop_cache(EGGSFS_I(dir));
|
||||
// no hard links, inode is gone
|
||||
clear_nlink(dentry->d_inode);
|
||||
}
|
||||
|
||||
out_err:
|
||||
trace_eggsfs_vfs_unlink_exit(dir, dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
// vfs: exclusive i_rwsem for all
|
||||
int eggsfs_rename(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry, unsigned int flags) {
|
||||
int err;
|
||||
|
||||
trace_eggsfs_vfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
|
||||
|
||||
BUG_ON(!old_dentry->d_inode);
|
||||
|
||||
if (flags) { err = -EINVAL; goto out; }
|
||||
|
||||
u64 old_creation_time = EGGSFS_I(old_dentry->d_inode)->edge_creation_time;
|
||||
u64 new_creation_time;
|
||||
struct eggsfs_fs_info* info = old_dir->i_sb->s_fs_info;
|
||||
if (old_dir == new_dir) {
|
||||
err = eggsfs_shard_rename(
|
||||
info,
|
||||
old_dir->i_ino,
|
||||
old_dentry->d_inode->i_ino, old_dentry->d_name.name, old_dentry->d_name.len,
|
||||
old_creation_time,
|
||||
new_dentry->d_name.name, new_dentry->d_name.len,
|
||||
&new_creation_time
|
||||
);
|
||||
} else if (S_ISDIR(old_dentry->d_inode->i_mode)) {
|
||||
// TODO update block policies/span policies here, it's kind of a no brainer in this case
|
||||
err = eggsfs_cdc_rename_directory(
|
||||
info,
|
||||
old_dentry->d_inode->i_ino,
|
||||
old_dir->i_ino, new_dir->i_ino,
|
||||
old_dentry->d_name.name, old_dentry->d_name.len, old_creation_time,
|
||||
new_dentry->d_name.name, new_dentry->d_name.len,
|
||||
&new_creation_time
|
||||
);
|
||||
} else {
|
||||
err = eggsfs_cdc_rename_file(
|
||||
info,
|
||||
old_dentry->d_inode->i_ino,
|
||||
old_dir->i_ino, new_dir->i_ino,
|
||||
old_dentry->d_name.name, old_dentry->d_name.len, old_creation_time,
|
||||
new_dentry->d_name.name, new_dentry->d_name.len,
|
||||
&new_creation_time
|
||||
);
|
||||
}
|
||||
err = eggsfs_error_to_linux(err);
|
||||
if (err) {
|
||||
if (err == -ENOENT) {
|
||||
// eggsfs_dentry_handle_enoent(old_dentry);
|
||||
} else {
|
||||
d_drop(old_dentry);
|
||||
d_drop(new_dentry); //?
|
||||
}
|
||||
// new entry? drop as well
|
||||
} else {
|
||||
old_dentry->d_time = new_creation_time;
|
||||
if (new_dentry->d_inode) { inode_dec_link_count(new_dentry->d_inode); }
|
||||
d_move(old_dentry, new_dentry);
|
||||
// negative old dentry?
|
||||
}
|
||||
|
||||
out:
|
||||
trace_eggsfs_vfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, err);
|
||||
eggsfs_debug_print("err=%d", err);
|
||||
return err;
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
#ifndef _EGGSFS_NAMEI_H
|
||||
#define _EGGSFS_NAMEI_H
|
||||
|
||||
#include <linux/fs.h>
|
||||
|
||||
#include "counter.h"
|
||||
|
||||
EGGSFS_DECLARE_COUNTER(eggsfs_stat_dir_revalidations);
|
||||
|
||||
struct dentry* eggsfs_lookup(struct inode* dir, struct dentry* dentry, unsigned int flags);
|
||||
int eggsfs_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode);
|
||||
int eggsfs_rmdir(struct inode* dir, struct dentry* dentry);
|
||||
int eggsfs_unlink(struct inode* dir, struct dentry* dentry);
|
||||
int eggsfs_rename(
|
||||
struct inode* old_dir, struct dentry* old_dentry,
|
||||
struct inode* new_dir, struct dentry* new_dentry,
|
||||
unsigned int flags
|
||||
);
|
||||
|
||||
extern struct dentry_operations eggsfs_dentry_ops;
|
||||
|
||||
#endif
|
||||
+337
@@ -0,0 +1,337 @@
|
||||
#include "net.h"
|
||||
|
||||
#include <linux/version.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "trace.h"
|
||||
|
||||
static struct eggsfs_shard_request* get_shard_request(struct eggsfs_shard_socket* s, u64 request_id) __must_hold(s->lock) {
|
||||
struct rb_node* node = s->requests.rb_node;
|
||||
while (node) {
|
||||
struct eggsfs_shard_request* req = container_of(node, struct eggsfs_shard_request, node);
|
||||
if (request_id < req->request_id) { node = node->rb_left; }
|
||||
else if (request_id > req->request_id) { node = node->rb_right; }
|
||||
else { return req; }
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void sock_readable(struct sock* sk) {
|
||||
struct eggsfs_shard_socket* s;
|
||||
struct sk_buff* skb;
|
||||
int err;
|
||||
u64 request_id;
|
||||
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
s = (struct eggsfs_shard_socket*)sk->sk_user_data;
|
||||
BUG_ON(!s);
|
||||
while (1) {
|
||||
skb = skb_recv_udp(sk, 0, 1, &err);
|
||||
if (!skb) {
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(skb_is_nonlinear(skb))) {
|
||||
// TODO: need top handle this
|
||||
eggsfs_warn_print("dropping nonlinear request");
|
||||
goto drop_skb;
|
||||
} else if (unlikely(skb_checksum_complete(skb))) {
|
||||
eggsfs_warn_print("dropping request with bad checksum");
|
||||
goto drop_skb;
|
||||
}
|
||||
|
||||
// u32 protocol + u64 req_id
|
||||
if (unlikely(skb->len < 12)) {
|
||||
eggsfs_warn_print("dropping runt eggsfs request");
|
||||
goto drop_skb;
|
||||
}
|
||||
request_id = get_unaligned_le64(skb->data + 4);
|
||||
|
||||
struct eggsfs_shard_request* req;
|
||||
spin_lock_bh(&s->lock); {
|
||||
req = get_shard_request(s, request_id);
|
||||
if (req) {
|
||||
BUG_ON(req->skb);
|
||||
rb_erase(&req->node, &s->requests);
|
||||
req->skb = skb;
|
||||
skb = NULL;
|
||||
complete(&req->comp);
|
||||
|
||||
#if 0
|
||||
if (!(req->flags & EGGSFS_SHARD_ASYNC)) {
|
||||
} else {
|
||||
struct eggsfs_shard_async_request* areq = container_of(req, struct eggsfs_shard_async_request, request);
|
||||
cancel_delayed_work(&areq->work);
|
||||
queue_work(eggsfs_wq, &areq->callback);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} spin_unlock_bh(&s->lock);
|
||||
if (!req) {
|
||||
eggsfs_warn_print("could not find request id %llu", request_id);
|
||||
}
|
||||
|
||||
if (skb) {
|
||||
drop_skb:
|
||||
kfree_skb(skb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// timer async request expired
|
||||
// if skb then retrun
|
||||
// requeue
|
||||
// whoever wants to remove request needs to handle timer and workqueue
|
||||
|
||||
static void insert_shard_request(struct eggsfs_shard_socket* s, struct eggsfs_shard_request* req) __must_hold(s->lock) {
|
||||
u64 request_id = req->request_id;
|
||||
struct rb_node** new = &s->requests.rb_node;
|
||||
struct rb_node* parent = NULL;
|
||||
while (*new) {
|
||||
struct eggsfs_shard_request* this = container_of(*new, struct eggsfs_shard_request, node);
|
||||
parent = *new;
|
||||
if (request_id < this->request_id) { new = &((*new)->rb_left); }
|
||||
else if (request_id > this->request_id) { new = &((*new)->rb_right); }
|
||||
else { printk(KERN_ERR "dup id %lld vs. %lld %p vs. %p\n", request_id, this->request_id, req, this); BUG_ON(1); return; }
|
||||
}
|
||||
|
||||
rb_link_node(&req->node, parent, new);
|
||||
rb_insert_color(&req->node, &s->requests);
|
||||
}
|
||||
|
||||
|
||||
static_assert(HZ / 100 > 0);
|
||||
|
||||
#define SHARD_ATTEMPTS 10
|
||||
const static u64 shard_timeouts_10ms[SHARD_ATTEMPTS] = {
|
||||
1, 2, 4, 8, 16, 32, 64, 100, 200, 200
|
||||
};
|
||||
|
||||
// 5 times the shard numbers, given that we roughly do 5 shard requests per CDC request
|
||||
|
||||
#define CDC_ATTEMPTS 10
|
||||
const static u64 cdc_timeouts_10ms[CDC_ATTEMPTS] = {
|
||||
5, 10, 20, 40, 80, 160, 320, 640, 1000, 1000
|
||||
};
|
||||
|
||||
// Returns:
|
||||
//
|
||||
// * n == 0 for success
|
||||
// * n < 0 for error (notable ones being -ETIMEOUT and -ERESTARTSYS)
|
||||
//
|
||||
// Once this is done, the request is guaranteed to not be in the tree anymore.
|
||||
static int __must_check wait_for_request(struct eggsfs_shard_socket* s, struct eggsfs_shard_request* req, u64 timeout_10ms) {
|
||||
int ret = wait_for_completion_killable_timeout(&req->comp, (timeout_10ms * 10 * HZ) / 1000);
|
||||
// We got a response, this is the happy path.
|
||||
if (likely(ret > 0)) {
|
||||
BUG_ON(!req->skb);
|
||||
return 0;
|
||||
}
|
||||
// We didn't get a response, it's our job to clean up the request.
|
||||
spin_lock_bh(&s->lock);
|
||||
// We might have filled the request in the time it took us to get here, in which
|
||||
// case `sock_readable` will have already erased it.
|
||||
if (likely(!req->skb)) {
|
||||
rb_erase(&req->node, &s->requests);
|
||||
}
|
||||
spin_unlock_bh(&s->lock);
|
||||
// If the request was filled in the meantime, we also need to free the skb.
|
||||
if (unlikely(req->skb)) {
|
||||
kfree_skb(req->skb);
|
||||
req->skb = NULL;
|
||||
}
|
||||
return ret ? ret : -ETIMEDOUT;
|
||||
}
|
||||
|
||||
void eggsfs_net_shard_free_socket(struct eggsfs_shard_socket* s) {
|
||||
// TODO anything else?
|
||||
sock_release(s->sock);
|
||||
}
|
||||
|
||||
int eggsfs_init_shard_socket(struct eggsfs_shard_socket* s) {
|
||||
struct sockaddr_in addr;
|
||||
int err;
|
||||
|
||||
err = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &s->sock);
|
||||
if (err) { goto out_err; }
|
||||
|
||||
write_lock_bh(&s->sock->sk->sk_callback_lock);
|
||||
BUG_ON(s->sock->sk->sk_user_data);
|
||||
s->sock->sk->sk_user_data = s;
|
||||
s->sock->sk->sk_data_ready = sock_readable; // FIXME: keep around the default one
|
||||
write_unlock_bh(&s->sock->sk->sk_callback_lock);
|
||||
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_addr.s_addr = 0;
|
||||
addr.sin_port = 0;
|
||||
err = s->sock->ops->bind(s->sock, (struct sockaddr*)&addr, sizeof(addr));
|
||||
if (err) { goto out_socket; }
|
||||
|
||||
s->requests = RB_ROOT;
|
||||
spin_lock_init(&s->lock);
|
||||
|
||||
return 0;
|
||||
|
||||
out_socket:
|
||||
sock_release(s->sock);
|
||||
out_err:
|
||||
return err;
|
||||
}
|
||||
|
||||
// TODO: congestion control
|
||||
|
||||
// shard_id is just used for debugging/event tracing, and should be -1 if we're going to the CDC
|
||||
struct sk_buff* eggsfs_metadata_request(
|
||||
struct eggsfs_shard_socket* sock,
|
||||
s16 shard_id,
|
||||
struct msghdr* msg,
|
||||
u64 req_id,
|
||||
void* p,
|
||||
u32 len,
|
||||
u32* attempts
|
||||
) {
|
||||
struct sockaddr_in* addr = (struct sockaddr_in*)msg->msg_name;
|
||||
struct eggsfs_shard_request req;
|
||||
struct kvec vec;
|
||||
int err = -EIO;
|
||||
|
||||
u8 kind = *((u8*)p + 12);
|
||||
trace_eggsfs_metadata_request_enter(msg, req_id, len, shard_id, kind); // which socket?
|
||||
|
||||
req.skb = NULL;
|
||||
req.request_id = req_id;
|
||||
*attempts = 0;
|
||||
|
||||
vec.iov_base = p;
|
||||
vec.iov_len = len;
|
||||
|
||||
int max_attempts = shard_id < 0 ? CDC_ATTEMPTS : SHARD_ATTEMPTS;
|
||||
const u64* timeouts_10ms = shard_id < 0 ? cdc_timeouts_10ms : shard_timeouts_10ms;
|
||||
|
||||
do {
|
||||
init_completion(&req.comp);
|
||||
|
||||
BUG_ON(req.skb);
|
||||
spin_lock_bh(&sock->lock);
|
||||
insert_shard_request(sock, &req);
|
||||
spin_unlock_bh(&sock->lock);
|
||||
|
||||
eggsfs_debug_print("sending request of kind 0x%02x, len %d to %pI4:%d, after %d attempts", (int)kind, len, &addr->sin_addr, ntohs(addr->sin_port), *attempts);
|
||||
eggsfs_debug_print("sending to sock=%p iov=%p len=%u", sock->sock, p, len);
|
||||
err = kernel_sendmsg(sock->sock, msg, &vec, 1, len);
|
||||
if (err < 0) {
|
||||
eggsfs_info_print("could not send req %llu of kind 0x%02x to %pI4:%d: %d", req_id, (int)kind, &addr->sin_addr, ntohs(addr->sin_port), err);
|
||||
goto out_unregister;
|
||||
}
|
||||
|
||||
err = wait_for_request(sock, &req, timeouts_10ms[*attempts]);
|
||||
(*attempts)++;
|
||||
if (!err) {
|
||||
eggsfs_debug_print("got response");
|
||||
BUG_ON(!req.skb);
|
||||
trace_eggsfs_metadata_request_exit(req_id, *attempts, req.skb->len, 0);
|
||||
return req.skb;
|
||||
}
|
||||
|
||||
eggsfs_info_print("err=%d", err);
|
||||
if (err != -ETIMEDOUT || *attempts >= max_attempts) {
|
||||
eggsfs_info_print("giving up after %d attempts due to err %d", *attempts, err);
|
||||
goto out_err;
|
||||
}
|
||||
} while (1);
|
||||
|
||||
out_unregister:
|
||||
spin_lock_bh(&sock->lock);
|
||||
// TODO what are the circumstances where this can be true? That is, kernel_sendmsg returns
|
||||
// an error, but we get a fill from the shard?
|
||||
BUG_ON(req.skb);
|
||||
rb_erase(&req.node, &sock->requests);
|
||||
spin_unlock_bh(&sock->lock);
|
||||
|
||||
out_err:
|
||||
trace_eggsfs_metadata_request_exit(req_id, *attempts, 0, err);
|
||||
eggsfs_info_print("err=%d", err);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
// Disabled draft of an asynchronous (non-blocking) shard request path.
// The synchronous eggsfs_metadata_request above is what is actually used.
#if 0

// Work item body: (re)sends the request datagram and re-queues itself once
// per ~1s until either a response skb arrives or the retry budget is spent.
static void eggsfs_shard_async_request_worker(struct work_struct* work) {
    struct eggsfs_shard_async_request* req = container_of(to_delayed_work(work), struct eggsfs_shard_async_request, work);
    struct kvec vec;
    int err;

    eggsfs_debug_print();

    spin_lock_bh(&req->socket->lock);
    // A response already arrived (the receive path filled request.skb):
    // nothing left to do.
    if (req->request.skb) { goto out_done; }
    if (req->request.attempt == 0) {
        // First attempt: register so the receive path can match responses.
        insert_shard_request(req->socket, &req->request);
    } else if (req->request.attempt >= eggsfs_max_request_retries) {
        // Retry budget exhausted: unregister and fail with a timeout.
        rb_erase(&req->request.node, &req->socket->requests);
        req->request.skb = ERR_PTR(-ETIMEDOUT);
        goto out_failed;
    }
    ++req->request.attempt;
    spin_unlock_bh(&req->socket->lock);

    // The request payload is stored immediately after the struct (req + 1).
    vec.iov_base = req + 1;
    vec.iov_len = req->length;

    err = kernel_sendmsg(req->socket->sock, &req->msg, &vec, 1, req->length);
    if (err < 0) {
        spin_lock_bh(&req->socket->lock);
        // Only fail if the receive path didn't beat us to it.
        if (!req->request.skb) {
            rb_erase(&req->request.node, &req->socket->requests);
            req->request.skb = ERR_PTR(err);
            goto out_failed;
        }
        spin_unlock_bh(&req->socket->lock);
    } else {
        // Re-queue ourselves to retry (or notice the response) later.
        queue_delayed_work(eggsfs_wq, &req->work, HZ);
    }

    return;

out_failed:
    spin_unlock_bh(&req->socket->lock);
    // Notify the owner via its callback work item.
    queue_work(system_wq, &req->callback);
    return;

out_done:
    spin_unlock_bh(&req->socket->lock);
    return;
}

// this layer needs to be capable of picking sockets / though we still need a list of them
// Fires off an async request: fills in the request state and schedules the
// first send immediately on the delayed-work queue.
void eggsfs_shard_async_request(struct eggsfs_shard_async_request* req, struct eggsfs_shard_socket* sock, struct msghdr* hdr, u64 req_id, u32 len) {
    eggsfs_debug_print();

    req->socket = sock;
    req->request.request_id = req_id;
    req->request.flags = EGGSFS_SHARD_ASYNC;
    req->request.skb = NULL;
    req->request.attempt = 0;
    req->length = len;
    req->msg = *hdr;
    INIT_DELAYED_WORK(&req->work, eggsfs_shard_async_request_worker);
    queue_delayed_work(eggsfs_wq, &req->work, 0);
}

// Makes sure the resend worker is no longer running before the request
// memory is reused/freed.
void eggsfs_shard_async_cleanup(struct eggsfs_shard_async_request* req) {
    cancel_delayed_work_sync(&req->work);
}

#endif

/*
void eggsfs_shard_async_cancel(struct eggsfs_shard_async_request* req) {
    spin_lock_bh(&req->socket->lock);
    if (!req->request.skb) {
        rb_erase(&req.node, &sock->requests);
    }
    spin_unlock_bh(&req->socket->lock);
    cancel_delayed_work_sync(&req->work);
}
*/
|
||||
+49
@@ -0,0 +1,49 @@
|
||||
#ifndef _EGGSFS_NET_H
#define _EGGSFS_NET_H

#include <linux/net.h>
#include <net/udp.h>
#include <net/sock.h>
#include <net/inet_common.h>

// Largest UDP payload we put in one datagram.
#define EGGSFS_UDP_MTU 1472

// One UDP socket towards the metadata service, plus the set of requests
// currently in flight on it (so the receive path can match responses).
struct eggsfs_shard_socket {
    struct socket* sock;
    struct rb_root requests;  // in-flight eggsfs_shard_request nodes, keyed by request_id
    spinlock_t lock;          // protects `requests`
};

#define EGGSFS_SHARD_ASYNC 1

// A single in-flight metadata request. The receive path looks it up by
// `request_id`, stores the response in `skb` and completes `comp`.
struct eggsfs_shard_request {
    struct rb_node node;    // linked into eggsfs_shard_socket.requests
    u64 request_id;
    struct sk_buff* skb;    // response, or NULL while still pending
    struct completion comp; // signalled once `skb` is filled in
};

int eggsfs_init_shard_socket(struct eggsfs_shard_socket* sock);
void eggsfs_net_shard_free_socket(struct eggsfs_shard_socket* sock);

// Sends a metadata request and blocks until a response arrives or the
// retry budget is exhausted. Returns the response skb or an ERR_PTR.
// `*attempts` is filled with the number of send attempts performed.
struct sk_buff* eggsfs_metadata_request(
    struct eggsfs_shard_socket* sock, s16 shard_id, struct msghdr* msg, u64 req_id, void* p, u32 len, u32* attempts
);

// State for the (currently disabled, see #if 0 in net.c) async request path.
struct eggsfs_shard_async_request {
    struct eggsfs_shard_request request;
    struct eggsfs_shard_socket* socket;
    struct msghdr msg;
    struct delayed_work work;    // periodic resend worker
    struct work_struct callback; // queued on completion/failure
    u32 length;                  // payload length (payload stored right after the struct)
};

#if 0

void eggsfs_shard_async_request(struct eggsfs_shard_async_request* req, struct eggsfs_shard_socket* sock, struct msghdr* msg, u64 req_id, u32 len);
void eggsfs_shard_async_cleanup(struct eggsfs_shard_async_request* req);

#endif

#endif
|
||||
Executable
+78
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env bash

# Kills the current VM, starts it again, deploys to it and builds kmod, loads kmod,
# opens a tmux session with dmesg in one pane and a console in ~/eggs in the other.
#
# Usage: vmup.sh [-run] [-deploy] [-build-type TYPE]
#   -run          also start eggsrun inside the VM (tmux window 2)
#   -deploy       upload binaries with ../deploy/deploy.py
#   -build-type   build type passed to deploy.py (default: alpine-debug)

set -eu -o pipefail

run=false
deploy=false
build_type='alpine-debug'

while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -run)
            run=true
            shift
            ;;
        -deploy)
            deploy=true
            shift
            ;;
        -build-type)
            shift
            build_type="$1"
            shift
            ;;
        *)
            echo "Bad usage"
            exit 2
            ;;
    esac
done

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# BUG FIX: quote the path so a directory containing spaces doesn't word-split.
cd "$SCRIPT_DIR"

make -j kmod

pkill qemu || true
tmux kill-session -t uovo || true

sleep 1 # sometimes qemu lingers?

# Windows:
#
# 0. dmesg
# 1. free shell
# 2. shell with eggsfs running
# 3. qemu running

tmux new-session -d -s uovo
tmux new-window -t uovo:3 './startvm.sh'

# Wait for VM to go up, and build
while ! scp eggsfs.ko uovo: ; do sleep 1; done

# Start dmesg as soon as it's booted (before we insert module)
tmux send-keys -t uovo:0 "ssh -t uovo dmesg -wHT" Enter

# Insert module
ssh uovo 'sudo insmod eggsfs.ko'

if [[ "$deploy" = true ]]; then
    # Deploy binaries
    (cd ../deploy && ./deploy.py --build-type "$build_type" --upload --host fmazzol@uovo)
fi

# Create shells
tmux new-window -t uovo:1 'ssh -t uovo'
if [[ "$run" = true ]]; then
    tmux new-window -t uovo:2 'ssh -t uovo "cd eggs && ./eggsrun -trace -binaries-dir ~/eggs -data-dir ~/eggs-data/"'
fi

# Attach
tmux attach-session -t uovo:1

# 4349852
# ./eggstests -kmod -filter mounted -cfg fsTest.checkThreads=1 -cfg fsTest.numDirs=10 -cfg fsTest.numFiles=100 -binaries-dir $(pwd)
|
||||
@@ -0,0 +1,130 @@
|
||||
#include "rs.h"

#include <linux/string.h>
#include <linux/slab.h>
#include <asm/fpu/api.h>

#include "common.h"
#include "intrshims.h"

// rs_core.c is shared with userspace (it's a symlink into cpp/rs/); map its
// allocation/failure hooks onto kernel primitives before including it below.
#define rs_malloc(sz) kmalloc(sz, GFP_KERNEL)
#define rs_free(p) kfree(p)
#define die(...) BUG()

// Instruction-set tiers for the parity kernels; the best supported one is
// detected once in eggsfs_rs_init().
enum rs_cpu_level {
    RS_CPU_SCALAR = 1,
    RS_CPU_AVX2 = 2,
    RS_CPU_GFNI = 3,
};

// The shared core calls this; in-kernel there is never a valgrind to detect.
static inline bool rs_detect_valgrind(void) {
    return false;
}

// Splat a single byte across all 32 lanes of a __m256i.
#define broadcast_u8(x) \
    ((__m256i)__builtin_ia32_pbroadcastb256((__v16qi){x,0,0,0,0,0,0,0,0,0,0,0,0,0,0}))

// Chosen once at init, read by the dispatchers generated below.
static enum rs_cpu_level eggsfs_rs_cpu_level;

#include "rs_core.c"

// The three *_func macros instantiate one concrete parity routine per
// (D, P) geometry and instruction-set tier, wrapping the generic
// implementations from rs_core.c.
#define rs_compute_parity_scalar_func(D, P) \
    static void rs_compute_parity_scalar_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_scalar(D, P, r, size, data, parity); \
    }

#define rs_compute_parity_avx2_func(D, P) \
    __attribute__((target("avx,avx2"))) \
    static void rs_compute_parity_avx2_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_avx2(D, P, r, size, data, parity); \
    }

#define rs_compute_parity_gfni_func(D, P) \
    __attribute__((target("avx,avx2,gfni"))) \
    static void rs_compute_parity_gfni_##D##_##P(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) { \
        rs_compute_parity_gfni(D, P, r, size, data, parity); \
    }

// rs_gen(D, P) emits: the RS context plus its init function, the three
// tiered routines, and a dispatcher that picks one based on
// eggsfs_rs_cpu_level.
#define rs_gen(D, P) \
    static struct rs* rs_##D##_##P = NULL; \
    static int rs_init_##D##_##P(void) { \
        rs_##D##_##P = rs_new_core(eggsfs_mk_parity(D, P)); \
        if (rs_##D##_##P == NULL) { \
            return -ENOMEM; \
        } \
        return 0; \
    } \
    rs_compute_parity_scalar_func(D, P) \
    rs_compute_parity_avx2_func(D, P) \
    rs_compute_parity_gfni_func(D, P) \
    static void rs_compute_parity_##D##_##P(uint64_t size, const uint8_t** data, uint8_t** parity) { \
        switch (eggsfs_rs_cpu_level) { \
        case RS_CPU_SCALAR: \
            rs_compute_parity_scalar_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        case RS_CPU_AVX2: \
            rs_compute_parity_avx2_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        case RS_CPU_GFNI: \
            rs_compute_parity_gfni_##D##_##P(rs_##D##_##P, size, data, parity); \
            break; \
        default: \
            BUG(); \
        } \
    }

// Right now we don't need anything else, let's not needlessly generate tons of code
rs_gen( 4, 4)
rs_gen(10, 4)
|
||||
|
||||
// Computes the parity blocks for `data` into `out` using the RS geometry
// packed in `parity` (low nibble = data blocks, high nibble = parity blocks).
// Only RS(4,4), RS(10,4), mirroring (D == 1) and the no-parity case are
// supported; anything else returns -EINVAL.
// Caller must hold kernel_fpu_begin()/kernel_fpu_end() (see rs.h).
int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out) {
    int n_parity = eggsfs_parity_blocks(parity);

    if (n_parity == 0) { // nothing to do
        return 0;
    }

    if (eggsfs_data_blocks(parity) == 1) { // mirroring
        int j;
        for (j = 0; j < n_parity; j++) {
            memcpy(out[j], data[0], size);
        }
        return 0;
    }

    if (parity == eggsfs_mk_parity(4, 4)) {
        rs_compute_parity_4_4(size, (const u8**)data, (u8**)out);
        return 0;
    }
    if (parity == eggsfs_mk_parity(10, 4)) {
        rs_compute_parity_10_4(size, (const u8**)data, (u8**)out);
        return 0;
    }

    eggsfs_warn_print("cannot compute with RS(%d,%d)", eggsfs_data_blocks(parity), eggsfs_parity_blocks(parity));
    return -EINVAL;
}
|
||||
|
||||
// Detects the best parity implementation the CPU supports and allocates the
// RS contexts used by eggsfs_compute_parity. Returns 0 or a negative errno.
int __init eggsfs_rs_init(void) {
    if (rs_has_cpu_level_core(RS_CPU_GFNI)) {
        eggsfs_info_print("picking GFNI");
        eggsfs_rs_cpu_level = RS_CPU_GFNI;
    } else if (rs_has_cpu_level_core(RS_CPU_AVX2)) {
        eggsfs_info_print("picking AVX2");
        eggsfs_rs_cpu_level = RS_CPU_AVX2;
    } else {
        eggsfs_warn_print("picking scalar execution -- this will be slow.");
        eggsfs_rs_cpu_level = RS_CPU_SCALAR;
    }

    int err;
    err = rs_init_4_4();
    if (err < 0) { return err; }
    err = rs_init_10_4();
    if (err < 0) {
        // BUG FIX: also clear the pointer after undoing the first allocation,
        // so a later eggsfs_rs_exit() can't double-free it.
        kfree(rs_4_4);
        rs_4_4 = NULL;
        return err;
    }

    return 0;
}
|
||||
|
||||
// Frees the RS contexts allocated in eggsfs_rs_init(). kfree(NULL) is a
// no-op, so this is safe even if init only partially succeeded.
void __cold eggsfs_rs_exit(void) {
    kfree(rs_4_4);
    kfree(rs_10_4);
}
|
||||
|
||||
#include "gf_tables.c"
|
||||
@@ -0,0 +1,31 @@
|
||||
#ifndef _EGGSFS_RS_H
#define _EGGSFS_RS_H

#include <linux/kernel.h>

// A "parity" byte packs the Reed-Solomon geometry: low nibble = number of
// data blocks (D), high nibble = number of parity blocks (P).

static inline u8 eggsfs_data_blocks(u8 parity) {
    return parity & 0x0F;
}

static inline u8 eggsfs_parity_blocks(u8 parity) {
    return parity >> 4;
}

// Total number of blocks, D + P.
static inline u8 eggsfs_blocks(u8 parity) {
    return eggsfs_data_blocks(parity) + eggsfs_parity_blocks(parity);
}

// Inverse of the two accessors above: pack (D, P) into one byte.
static inline u8 eggsfs_mk_parity(u8 data, u8 parity) {
    return (parity << 4) | data;
}

// len(data) = eggsfs_data_blocks(parity)
// len(out) = eggsfs_parity_blocks(parity)
//
// You _must_ wrap this with kernel_fpu_begin()/kernel_fpu_end()!
int eggsfs_compute_parity(u8 parity, ssize_t size, const char** data, char** out);

int __init eggsfs_rs_init(void);
void __cold eggsfs_rs_exit(void);

#endif
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../cpp/rs/rs_core.c
|
||||
@@ -0,0 +1,64 @@
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#include "bincode.h"
|
||||
#include "shuckle.h"
|
||||
#include "common.h"
|
||||
|
||||
// Writes the 9-byte shuckle request header into `buf`: little-endian
// protocol version, little-endian length, then the kind byte. The length
// written on the wire is req_len + 1 because it counts the kind byte —
// automatically include the kind, much nicer for the caller.
void eggsfs_write_shuckle_req_header(char* buf, u32 req_len, u8 req_kind) {
    char* cursor = buf;

    put_unaligned_le32(EGGSFS_SHUCKLE_REQ_PROTOCOL_VERSION, cursor);
    cursor += 4;

    put_unaligned_le32(req_len + 1, cursor);
    cursor += 4;

    *(u8*)cursor = req_kind;
}
|
||||
|
||||
// Parses the 9-byte shuckle response header from `buf`. On success fills
// *resp_len (excluding the kind byte) and *resp_kind and returns 0; returns
// -EINVAL on a protocol mismatch or a zero-length response.
int eggsfs_read_shuckle_resp_header(char* buf, u32* resp_len, u8* resp_kind) {
    char* cursor = buf;

    u32 protocol = get_unaligned_le32(cursor);
    cursor += 4;
    if (protocol != EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION) {
        eggsfs_debug_print("bad shuckle protocol, expected 0x%08x, got 0x%08x", EGGSFS_SHUCKLE_RESP_PROTOCOL_VERSION, protocol);
        return -EINVAL;
    }

    *resp_len = get_unaligned_le32(cursor);
    cursor += 4;
    if (*resp_len == 0) {
        eggsfs_debug_print("unexpected zero-length shuckle response (the kind should at least be there)");
        return -EINVAL;
    }

    *resp_kind = *(u8*)cursor;
    eggsfs_debug_print("resp_len=%d, resp_kind=0x%02x", *resp_len, (int)*resp_kind);

    (*resp_len)--; // exclude the kind, it's much nicer for the caller
    return 0;
}
|
||||
|
||||
// Parses `shuckle_addr` ("a.b.c.d:port"), creates a TCP socket and connects
// it. On success *sock is the connected socket and 0 is returned; on
// failure a negative errno is returned and the socket (if created) released.
int eggsfs_create_shuckle_socket(const char* shuckle_addr, struct socket** sock) {
    int err;
    const char* addr_end;
    u16 port;

    // parse device, which is the shuckle address in 0.0.0.0:0 form (ipv4, port)
    struct sockaddr_in addr;
    addr.sin_family = AF_INET;
    if (in4_pton(shuckle_addr, -1, (u8*)&addr.sin_addr, ':', &addr_end) != 1) {
        err = -EINVAL;
        goto out_err;
    }
    // addr_end is assumed to point at the ':' delimiter, so the port
    // starts right after it — TODO confirm in4_pton guarantees this here.
    err = kstrtou16(addr_end+1, 10, &port);
    if (err < 0) { goto out_err; }
    addr.sin_port = htons(port);
    eggsfs_debug_print("parsed shuckle addr %pI4:%d", &addr.sin_addr, ntohs(addr.sin_port));

    // create socket
    err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
    if (err < 0) { goto out_err; }

    // connect
    err = kernel_connect(*sock, (struct sockaddr*)&addr, sizeof(addr), 0);
    if (err < 0) { goto out_connect; }
    eggsfs_debug_print("connected to shuckle");

    return 0;

out_connect:
    sock_release(*sock);
out_err:
    eggsfs_debug_print("failed err=%d", err);
    return err;
}
|
||||
@@ -0,0 +1,15 @@
|
||||
#ifndef _EGGSFS_SHUCKLE_H
#define _EGGSFS_SHUCKLE_H

#include <linux/kernel.h>
#include <linux/inet.h>

// Shuckle wire format: each message starts with a little-endian protocol
// version, a little-endian length, and a one-byte message kind.

#define EGGSFS_SHUCKLE_REQ_HEADER_SIZE (4 + 4 + 1) // protocol + len + kind
// `req_len` excludes the kind byte; the kind is counted on the wire for you.
void eggsfs_write_shuckle_req_header(char* buf, u32 req_len, u8 req_kind);

#define EGGSFS_SHUCKLE_RESP_HEADER_SIZE (4 + 4 + 1) // protocol + len + kind
// On success *resp_len excludes the kind byte. Returns -EINVAL on bad header.
int eggsfs_read_shuckle_resp_header(char* buf, u32* resp_len, u8* resp_kind);

// Creates and connects a TCP socket to the "a.b.c.d:port" address.
int eggsfs_create_shuckle_socket(const char* shuckle_addr, struct socket** sock);

#endif
|
||||
+95
@@ -0,0 +1,95 @@
|
||||
#include "skb.h"
|
||||
|
||||
#include <linux/highmem.h>
|
||||
|
||||
#include "block.h"
|
||||
#include "common.h"
|
||||
|
||||
// Copies up to `len` bytes starting at `offset` from a (possibly
// fragmented) skb into the linear buffer `dstp`. Returns the number of
// bytes actually copied, which can be less than `len` if the skb ends
// first.
u32 eggsfs_skb_copy(void* dstp, struct sk_buff* skb, u32 offset, u32 len) {
    struct skb_seq_state seq;
    u32 consumed = 0;
    char* dst = dstp;

    // Clamp the sequential-read window to the end of the skb.
    skb_prepare_seq_read(skb, offset, min(offset + len, skb->len), &seq);
    for (;;) {
        const u8* ptr;
        int avail = skb_seq_read(consumed, &ptr, &seq);
        // avail == 0 signals end of data; skb_seq_read tears down the
        // sequential state itself in that case, so no abort is needed.
        if (avail == 0) { return consumed; }
        BUG_ON(avail < 0);
        // avail _can_ exceed the upper bound here
        int actual_avail = min((u32)avail, len-consumed);
        memcpy(dst + consumed, ptr, actual_avail);
        consumed += actual_avail;
        if (actual_avail < avail) {
            // We stopped inside a fragment: abort the sequential read
            // explicitly before bailing out.
            skb_abort_seq_read(&seq);
            return consumed;
        }
    }
    BUG();
}
|
||||
|
||||
#if 0
|
||||
|
||||
u32 eggsfs_skb_copy_to_pages(struct eggsfs_block_request* req, struct sk_buff* skb, u32 offset) {
|
||||
struct skb_seq_state seq;
|
||||
u32 total_consumed = 0;
|
||||
struct page* page;
|
||||
void* page_ptr;
|
||||
u32 page_offset = req->page_offset;
|
||||
|
||||
if (WARN_ON(!req->bytes_left)) { return 0; }
|
||||
|
||||
skb_prepare_seq_read(skb, offset, min(offset + req->bytes_left, skb->len), &seq);
|
||||
|
||||
BUG_ON(list_empty(&req->pages));
|
||||
page = list_first_entry(&req->pages, struct page, lru);
|
||||
page_ptr = kmap_atomic(page);
|
||||
|
||||
while (total_consumed < req->bytes_left) {
|
||||
const u8* ptr;
|
||||
int avail = skb_seq_read(total_consumed, &ptr, &seq);
|
||||
printk(KERN_INFO "eggsfs: skb_seq_read total_consumed=%u page_offset=%u avail=%d bytes_left=%u\n", total_consumed, page_offset, avail, req->bytes_left);
|
||||
if (avail == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
while (avail) {
|
||||
// FIXME: is the last one needed?
|
||||
u32 this_len = min3((u32)avail, (u32)PAGE_SIZE - page_offset, req->bytes_left - total_consumed);
|
||||
printk(KERN_INFO "eggsfs: copy to page total_consumed=%u avail=%d this_len=%d page_offset=%u\n", total_consumed, avail, this_len, page_offset);
|
||||
req->crc32 = eggsfs_crc(req->crc32, ptr, this_len);
|
||||
memcpy((char*)page_ptr + page_offset, ptr, this_len);
|
||||
ptr += this_len;
|
||||
page_offset += this_len;
|
||||
avail -= this_len;
|
||||
total_consumed += this_len;
|
||||
|
||||
if (page_offset >= PAGE_SIZE) {
|
||||
BUG_ON(page_offset > PAGE_SIZE);
|
||||
|
||||
kunmap_atomic(page_ptr);
|
||||
list_rotate_left(&req->pages);
|
||||
|
||||
page = list_first_entry(&req->pages, struct page, lru);
|
||||
page_ptr = kmap_atomic(page);
|
||||
page_offset = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
skb_abort_seq_read(&seq);
|
||||
|
||||
req->bytes_left -= total_consumed;
|
||||
req->page_offset = page_offset;
|
||||
|
||||
if (!req->bytes_left && page_offset) {
|
||||
// partially filled page at the end
|
||||
memset(page_ptr + page_offset, 0, PAGE_SIZE - page_offset);
|
||||
list_rotate_left(&req->pages);
|
||||
}
|
||||
|
||||
kunmap_atomic(page_ptr);
|
||||
|
||||
return total_consumed;
|
||||
}
|
||||
#endif
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
#ifndef _EGGSFS_SKB_H
#define _EGGSFS_SKB_H

#include <linux/skbuff.h>

struct eggsfs_block_request;

// TODO possibly move this one to blocksimple.c
// Copies up to `len` bytes at `offset` from `skb` into `dstp`; returns the
// number of bytes actually copied.
u32 eggsfs_skb_copy(void* dstp, struct sk_buff* skb, u32 offset, u32 len);

#if 0
// TODO remove this thing below
u32 eggsfs_skb_copy_to_pages(struct eggsfs_block_request* req, struct sk_buff* skb, u32 offset);
#endif

#endif
|
||||
+324
@@ -0,0 +1,324 @@
|
||||
|
||||
#include "bincode.h"
|
||||
#include "common.h"
|
||||
#include "span.h"
|
||||
#include "metadata.h"
|
||||
#include "rs.h"
|
||||
#include "latch.h"
|
||||
#include "crc.h"
|
||||
#include "err.h"
|
||||
|
||||
// Searches the per-file span rb-tree (keyed by [start, end) intervals) for
// the span containing `offset`. Returns NULL when nothing cached covers it.
static struct eggsfs_span* eggsfs_lookup_span(struct rb_root* spans, u64 offset) {
    struct rb_node* cur = spans->rb_node;
    while (cur != NULL) {
        struct eggsfs_span* candidate = container_of(cur, struct eggsfs_span, node);
        if (offset < candidate->start) {
            cur = cur->rb_left;
            continue;
        }
        if (offset >= candidate->end) {
            cur = cur->rb_right;
            continue;
        }
        // start <= offset < end: found it.
        eggsfs_debug_print("off=%llu span=%p", offset, candidate);
        return candidate;
    }
    eggsfs_debug_print("off=%llu no_span", offset);
    return NULL;
}
|
||||
|
||||
static bool eggsfs_insert_span(struct rb_root* spans, struct eggsfs_span* span) {
|
||||
struct rb_node** new = &(spans->rb_node);
|
||||
struct rb_node* parent = NULL;
|
||||
while (*new) {
|
||||
struct eggsfs_span* this = container_of(*new, struct eggsfs_span, node);
|
||||
parent = *new;
|
||||
if (span->end <= this->start) { new = &((*new)->rb_left); }
|
||||
else if (span->start >= this->end) { new = &((*new)->rb_right); }
|
||||
else {
|
||||
// TODO: be loud if there are overlaping spans
|
||||
eggsfs_debug_print("span=%p (%llu-%llu) already present", span, span->start, span->end);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
rb_link_node(&span->node, parent, new);
|
||||
rb_insert_color(&span->node, spans);
|
||||
eggsfs_debug_print("span=%p (%llu-%llu) inserted", span, span->start, span->end);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void eggsfs_free_span(struct eggsfs_span* span) {
|
||||
if (span->storage_class == EGGSFS_INLINE_STORAGE) {
|
||||
kfree(EGGSFS_INLINE_SPAN(span));
|
||||
} else {
|
||||
kfree(EGGSFS_BLOCK_SPAN(span));
|
||||
}
|
||||
}
|
||||
|
||||
struct eggsfs_get_span_ctx {
|
||||
struct list_head spans;
|
||||
int err;
|
||||
};
|
||||
|
||||
// Parser callback: one block-backed span in a FILE_SPANS response.
// Allocates an eggsfs_block_span, fills in its geometry and appends it to
// the context's list; the per-block details arrive later via
// eggsfs_file_spans_cb_block. On error sets ctx->err, which turns all
// subsequent callbacks into no-ops.
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs) {
    eggsfs_debug_print("offset=%llu size=%u crc=%08x storage_class=%d parity=%d stripes=%d cell_size=%u", offset, size, crc, storage_class, parity, stripes, cell_size);

    struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
    if (ctx->err) { return; }

    // BUG FIX: validate the geometry *before* allocating -- the old order
    // leaked the freshly kmalloc'd span on this error path.
    if (eggsfs_data_blocks(parity) > EGGSFS_MAX_DATA || eggsfs_parity_blocks(parity) > EGGSFS_MAX_PARITY) {
        // BUG FIX: the P half of this message used to print
        // eggsfs_data_blocks() instead of eggsfs_parity_blocks().
        eggsfs_warn_print("D=%d > %d || P=%d > %d", eggsfs_data_blocks(parity), EGGSFS_MAX_DATA, eggsfs_parity_blocks(parity), EGGSFS_MAX_PARITY);
        ctx->err = -EIO;
        return;
    }

    struct eggsfs_block_span* span = kmalloc(sizeof(struct eggsfs_block_span), GFP_KERNEL);
    if (!span) { ctx->err = -ENOMEM; return; }

    span->span.start = offset;
    span->span.end = offset + size;
    span->span.storage_class = storage_class;

    xa_init(&span->pages);
    int i;
    for (i = 0; i < stripes; i++) {
        eggsfs_latch_init(&span->stripe_latches[i]);
    }
    span->cell_size = cell_size;
    // NOTE(review): `stripes` is assumed to fit the 15-element arrays in
    // struct eggsfs_block_span -- TODO confirm the parser enforces this.
    memcpy(span->stripes_crc, stripes_crcs, sizeof(uint32_t)*stripes);
    span->stripes = stripes;
    span->parity = parity;

    eggsfs_debug_print("adding normal span");
    list_add_tail(&span->span.lru, &ctx->spans);
}
|
||||
|
||||
void eggsfs_file_spans_cb_block(
|
||||
void* data, int block_ix,
|
||||
u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
|
||||
u64 block_id, u32 crc
|
||||
) {
|
||||
struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
|
||||
if (ctx->err) { return; }
|
||||
|
||||
struct eggsfs_span* span = list_last_entry(&ctx->spans, struct eggsfs_span, lru);
|
||||
struct eggsfs_block_span* block_span = EGGSFS_BLOCK_SPAN(span);
|
||||
|
||||
struct eggsfs_block* block = &block_span->blocks[block_ix];
|
||||
block->bs.id = bs_id;
|
||||
block->bs.ip1 = ip1;
|
||||
block->bs.port1 = port1;
|
||||
block->bs.ip2 = ip2;
|
||||
block->bs.port2 = port2;
|
||||
block->id = block_id;
|
||||
block->crc = crc;
|
||||
}
|
||||
|
||||
// Parser callback: one inline span (body stored directly in the metadata
// response rather than in block services). Copies the body and queues the
// span on the context's list.
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body) {
    eggsfs_debug_print("offset=%llu size=%u len=%u body=%*pE", offset, size, len, len, body);

    struct eggsfs_get_span_ctx* ctx = (struct eggsfs_get_span_ctx*)data;
    if (ctx->err) { return; }

    struct eggsfs_inline_span* ispan = kmalloc(sizeof(struct eggsfs_inline_span), GFP_KERNEL);
    if (!ispan) {
        ctx->err = -ENOMEM;
        return;
    }

    ispan->span.storage_class = EGGSFS_INLINE_STORAGE;
    ispan->span.start = offset;
    ispan->span.end = offset + size;
    memcpy(ispan->body, body, len);
    ispan->len = len;

    eggsfs_debug_print("adding inline span");
    list_add_tail(&ispan->span.lru, &ctx->spans);
}
|
||||
|
||||
// Returns the (cached or freshly fetched) span containing `offset` for the
// given file inode, or an ERR_PTR. Readers use a seqcount so the common
// cache-hit path is lockless; misses serialize on spans_wlock, fetch the
// spans from the shard, and publish them under a seqcount write section.
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset) {
    unsigned seq;
    struct eggsfs_span* span;
    struct eggsfs_inode_file* file = &enode->file;

    eggsfs_debug_print("ino=%016lx, pid=%d, mu=%p, off=%llu getting span", enode->inode.i_ino, get_current()->pid, &file->spans_wlock, offset);

retry: // seqlock protects against spurious NULLs, if we get a span it is valid
    seq = read_seqcount_begin(&file->spans_seqcount);
    span = eggsfs_lookup_span(&file->spans, offset);
    if (likely(span)) { return span; }
    // A concurrent writer may have caused a miss; re-check before slow path.
    if (read_seqcount_retry(&file->spans_seqcount, seq)) { goto retry; }

    int err = mutex_lock_killable(&file->spans_wlock);
    if (err) { return ERR_PTR(err); }

    span = eggsfs_lookup_span(&file->spans, offset);
    if (!span) { // somebody might have gotten here first
        struct eggsfs_span* tmp;
        struct eggsfs_get_span_ctx ctx = { .err = 0, };
        INIT_LIST_HEAD(&ctx.spans);
        // Ask the shard for the spans around `offset`; the callbacks above
        // accumulate them into ctx.spans.
        int err = eggsfs_error_to_linux(eggsfs_shard_file_spans(
            (struct eggsfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, offset, &ctx
        ));
        err = err ?: ctx.err;
        if (unlikely(err)) {
            eggsfs_debug_print("failed to get file spans err=%d", err);
            // Free whatever was accumulated before the failure.
            list_for_each_entry_safe(span, tmp, &ctx.spans, lru) {
                list_del(&span->lru);
                eggsfs_free_span(span);
            }
            span = ERR_PTR(err);
            goto out;
        }

        // Need to disable preemption around seqcount_t write critical section,
        // see docs for seqcount_t.
        preempt_disable();
        write_seqcount_begin(&file->spans_seqcount);
        list_for_each_entry_safe(span, tmp, &ctx.spans, lru) {
            eggsfs_debug_print("inserting span start=%llu, end=%llu", span->start, span->end);
            if (!eggsfs_insert_span(&file->spans, span)) {
                // span already cached
                list_del(&span->lru);
                eggsfs_free_span(span);
            } else {
                // freshly inserted; nothing more to do here for now
            }
        }
        write_seqcount_end(&file->spans_seqcount);
        preempt_enable();

        // Look up again: the fetched batch should now contain our offset
        // (NULL here means the offset is past EOF).
        span = eggsfs_lookup_span(&file->spans, offset);

        // TODO: add to reclaimer LRU _after_ we've looked it up
    }

out:
    mutex_unlock(&file->spans_wlock);
    return span;
}
|
||||
|
||||
// Returns the cached page at index `page_ix` within the span, fetching and
// caching the whole stripe containing it on a miss. Returns an ERR_PTR on
// failure. A per-stripe latch ensures only one task fetches a given stripe;
// losers wait and then retry the cache lookup.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix) {
    // this should be guaranteed by the caller but we rely on it below, so let's check
    if (span->cell_size%PAGE_SIZE != 0) {
        eggsfs_warn_print("cell_size=%u, PAGE_SIZE=%lu, span->cell_size%%PAGE_SIZE=%lu", span->cell_size, PAGE_SIZE, span->cell_size%PAGE_SIZE);
        return ERR_PTR(-EIO);
    }

    struct page* page;
    u32 start_page, end_page, curr_page;
    int err = 0;

again:
    page = xa_load(&span->pages, page_ix);
    if (unlikely(page != NULL)) { return page; }

    // We need to load the stripe
    int D = eggsfs_data_blocks(span->parity);
    u32 span_offset = page_ix * PAGE_SIZE;
    u32 stripe = span_offset / (span->cell_size*D);
    // TODO better error?
    // BUG FIX: `stripe` is a 0-based index, so stripe == span->stripes is
    // already out of range -- the old `>` comparison let it through and then
    // indexed one past the end of stripe_latches[]/stripes_crc[] below.
    if (stripe >= span->stripes) {
        eggsfs_warn_print("span_offset=%u, stripe=%u, stripes=%u", span_offset, stripe, span->stripes);
        return ERR_PTR(-EIO);
    }

    // Page range [start_page, end_page) covered by this stripe's data cells.
    start_page = (span->cell_size/PAGE_SIZE)*D*stripe;
    end_page = (span->cell_size/PAGE_SIZE)*D*((int)stripe + 1);

    int seqno;
    // NOTE(review): try_acquire/wait appear to be macros writing `seqno`
    // (it is passed by name, not address) -- confirm against latch.h.
    if (!eggsfs_latch_try_acquire(&span->stripe_latches[stripe], seqno)) {
        // Someone else is fetching this stripe: wait, then re-check cache.
        int err = eggsfs_latch_wait_killable(&span->stripe_latches[stripe], seqno);
        if (err) { return ERR_PTR(err); }
        goto again;
    }

    struct eggsfs_block_socket* socks[EGGSFS_MAX_DATA];
    memset(socks, 0, sizeof(struct eggsfs_block_socket*)*EGGSFS_MAX_DATA);
    struct eggsfs_fetch_block_request* reqs[EGGSFS_MAX_DATA];
    memset(reqs, 0, sizeof(struct eggsfs_fetch_block_request*)*EGGSFS_MAX_DATA);

    // get block services sockets
    int i;
    for (i = 0; i < D; i++) {
        socks[i] = eggsfs_get_fetch_block_socket(&span->blocks[i].bs);
        if (IS_ERR(socks[i])) {
            err = PTR_ERR(socks[i]);
            // BUG FIX: clear the slot so the cleanup loop below doesn't
            // call eggsfs_put_fetch_block_socket() on an ERR_PTR value.
            socks[i] = NULL;
            goto out_err;
        }
    }

    // fetch them all at the right offset
    u32 block_offset = stripe * span->cell_size;
    for (i = 0; i < D; i++) {
        struct eggsfs_block* block = &span->blocks[i];
        reqs[i] = eggsfs_fetch_block(socks[i], block->bs.id, block->id, block_offset, span->cell_size);
        if (IS_ERR(reqs[i])) {
            err = PTR_ERR(reqs[i]);
            // BUG FIX: same as above -- don't leave an ERR_PTR for cleanup.
            reqs[i] = NULL;
            goto out_err;
        }
    }

    // Wait for all the block requests
    for (i = 0; i < D; i++) {
        struct eggsfs_fetch_block_request* req = reqs[i];
        err = wait_for_completion_killable(&req->comp);
        if (err) { goto out_err; }
        err = req->err;
        if (err) {
            eggsfs_debug_print("request failed: %d", err);
            goto out_err;
        }
    }

    // Store all the pages into the xarray
    curr_page = start_page;
    u32 crc = 0;
    for (i = 0; i < D; i++) {
        struct eggsfs_fetch_block_request* req = reqs[i];
        // Accumulate the whole-stripe CRC from the per-block CRCs.
        crc = eggsfs_crc32c_append(crc, req->crc, span->cell_size);
        struct page* page;
        list_for_each_entry(page, &req->pages, lru) {
            struct page* old_page = xa_store(&span->pages, curr_page, page, GFP_KERNEL);
            if (IS_ERR(old_page)) {
                err = PTR_ERR(old_page);
                eggsfs_debug_print("xa_store failed: %d", err);
                goto out_err;
            }
            BUG_ON(old_page != NULL); // the latch protects against this
            curr_page++;
        }
    }

    // Check the crc
    if (crc != span->stripes_crc[stripe]) {
        err = -EIO;
        // BUG FIX: de-duplicated "failed failed" in the message.
        eggsfs_debug_print("crc check failed, expected %08x, got %08x", span->stripes_crc[stripe], crc);
        goto out_err;
    }

    // OK, we're good, now take ownership of the pages by emptying the pages in the requests
    // so that they won't be free by `eggsfs_put_fetch_block_request`
    for (i = 0; i < D; i++) {
        struct list_head* pages = &reqs[i]->pages;
        while (!list_empty(pages)) {
            list_del(&lru_to_page(pages)->lru);
        }
    }

    page = xa_load(&span->pages, page_ix);
    BUG_ON(page == NULL);

out:
    // Unlock and free everything
    eggsfs_latch_release(&span->stripe_latches[stripe], seqno);
    for (i = 0; i < D; i++) {
        if (reqs[i] == NULL) { continue; }
        eggsfs_put_fetch_block_request(reqs[i]);
    }
    for (i = 0; i < D; i++) {
        if (socks[i] == NULL) { continue; }
        eggsfs_put_fetch_block_socket(socks[i]);
    }
    return err == 0 ? page : ERR_PTR(err);

out_err:
    eggsfs_debug_print("getting span page failed, err=%d", err);
    // we might have partially written the stripe, forget about them
    for (curr_page = start_page; curr_page < end_page; curr_page++) {
        xa_erase(&span->pages, curr_page);
    }
    goto out;
}
|
||||
+69
@@ -0,0 +1,69 @@
|
||||
#ifndef _EGGSFS_SPANSIMPLE_H
#define _EGGSFS_SPANSIMPLE_H

#include <linux/kernel.h>

#include "inode.h"

// Common header of every cached span: the byte interval [start, end) of the
// file it covers, plus linkage for lookup and eviction.
struct eggsfs_span {
    struct rb_node node; // to look them up
    struct list_head lru; // to decide what to evict
    u64 start;
    u64 end;
    u8 storage_class; // used to determine which type of span this is enclosed in
};

// Span whose body is stored inline in the metadata (at most 255 bytes).
struct eggsfs_inline_span {
    struct eggsfs_span span;
    u8 len;       // number of valid bytes in `body`
    u8 body[255];
};

// Downcast to the inline variant; BUGs if the storage class disagrees.
#define EGGSFS_INLINE_SPAN(_span) ({ \
    BUG_ON(_span->storage_class != EGGSFS_INLINE_STORAGE); \
    container_of(_span, struct eggsfs_inline_span, span); \
})

// One block of a block-backed span: its id/crc plus the block service
// that stores it.
struct eggsfs_block {
    u64 id;
    struct eggsfs_block_service bs;
    u32 crc;
};

// Span stored as erasure-coded blocks on block services.
struct eggsfs_block_span {
    struct eggsfs_span span;
    struct xarray pages; // indexed by page number
    struct eggsfs_block blocks[EGGSFS_MAX_BLOCKS];
    // NOTE(review): 15 appears to be the max stripes per span -- TODO
    // confirm and name the constant.
    struct eggsfs_latch stripe_latches[15]; // one fetch latch per stripe
    u32 cell_size;          // bytes per cell; must be a multiple of PAGE_SIZE
    u32 stripes_crc[15];    // expected CRC of each stripe's data
    u8 stripes;
    u8 parity;              // packed RS geometry, see rs.h
};

// Downcast to the block-backed variant; BUGs on empty/inline storage.
#define EGGSFS_BLOCK_SPAN(_span) ({ \
    BUG_ON(_span->storage_class == EGGSFS_EMPTY_STORAGE || _span->storage_class == EGGSFS_INLINE_STORAGE); \
    container_of(_span, struct eggsfs_block_span, span); \
})

// Fetches and caches the span if necessary. `offset` need not be the actual
// span offset, the span containing the offset will be returned.
//
// If the offset is out of bounds, NULL will be returned. Errors might
// be returned if we fail to fetch the span.
struct eggsfs_span* eggsfs_get_span(struct eggsfs_inode* enode, u64 offset);

// The page_ix is the page number inside the span.
struct page* eggsfs_get_span_page(struct eggsfs_block_span* span, u32 page_ix);

// Callbacks for the FILE_SPANS response parser; `data` is a
// struct eggsfs_get_span_ctx (see span.c).
void eggsfs_file_spans_cb_span(void* data, u64 offset, u32 size, u32 crc, u8 storage_class, u8 parity, u8 stripes, u32 cell_size, const uint32_t* stripes_crcs);
void eggsfs_file_spans_cb_block(
    void* data, int block_ix,
    // block service stuff
    u64 bs_id, u32 ip1, u16 port1, u32 ip2, u16 port2,
    // block stuff
    u64 block_id, u32 crc
);
void eggsfs_file_spans_cb_inline_span(void* data, u64 offset, u32 size, u8 len, const char* body);

#endif
|
||||
Executable
+20
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
|
||||
set -eu -o pipefail
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
|
||||
# qemu-system-x86_64 -s -enable-kvm -cpu host -m 64G -drive "file=${SCRIPT_DIR}/ubuntu-20.04.qcow2,if=virtio" -nic user,hostfwd=tcp::2222-:22,model=virtio-net-pci -smp 50 -virtfs local,path=/data/home/fmazzol/src/eggsfs/kmod/eggs-data,mount_tag=eggs-data,security_model=mapped,id=eggs-data -vnc :1 -monitor stdio
|
||||
|
||||
qemu-system-x86_64 \
|
||||
-machine accel=kvm,type=q35 \
|
||||
-enable-kvm \
|
||||
-cpu host \
|
||||
-kernel /data/home/fmazzol/src/eggsfs/kmod/linux-5.4.237/arch/x86/boot/bzImage \
|
||||
-append "root=/dev/sda1 single console=ttyS0 systemd.unit=graphical.target" \
|
||||
-hda ubuntu-20.04.img \
|
||||
-hdb init.img \
|
||||
-m 64G \
|
||||
-smp 50 \
|
||||
-nographic \
|
||||
-nic user,hostfwd=tcp::2222-:22,model=virtio-net-pci \
|
||||
-virtfs local,path=/data/home/fmazzol/src/eggsfs/kmod/eggs-data,mount_tag=eggs-data,security_model=mapped,id=eggs-data
|
||||
+274
@@ -0,0 +1,274 @@
|
||||
#include "super.h"

#include <linux/inet.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include "common.h"
#include "inode.h"
#include "metadata.h"
#include "namei.h"
#include "net.h"
#include "shuckle.h"
#include "bincode.h"
#include "sysfs.h"
#include "err.h"
#include "rs.h"

// Tears down a per-mount info struct: releases the shard socket and frees
// the allocation. Safe only on a fully- or partially-initialized struct
// whose socket init already ran (see eggsfs_init_fs_info error paths).
static void eggsfs_free_fs_info(struct eggsfs_fs_info* info) {
    eggsfs_debug_print("info=%p", info);
    eggsfs_net_shard_free_socket(&info->sock);
    kfree(info);
}

// Allocates and fills the per-mount info by talking to the "shuckle"
// registry at `dev_name` over a temporary TCP socket:
//   1. send a SHARDS request, read back exactly 256 shard infos and record
//      each shard's sockaddr/msghdr;
//   2. send a CDC request and record the CDC's sockaddr/msghdr.
// Returns the new info on success, or an ERR_PTR. The shuckle socket is
// always released before returning.
static struct eggsfs_fs_info* eggsfs_init_fs_info(const char* dev_name) {
    int err;

    struct eggsfs_fs_info* info = kmalloc(sizeof(struct eggsfs_fs_info), GFP_KERNEL);
    if (!info) { err = -ENOMEM; goto out; }

    err = eggsfs_init_shard_socket(&info->sock);
    if (err) { goto out_info; }

    struct socket* shuckle_sock;
    err = eggsfs_create_shuckle_socket(dev_name, &shuckle_sock);
    if (err < 0) { goto out_info; }

    struct kvec iov;
    struct msghdr msg = {NULL};

    // The SHARDS request has an empty body, so the wire message is just the
    // request header.
    static_assert(EGGSFS_SHARDS_REQ_SIZE == 0);
    char shuckle_req[EGGSFS_SHUCKLE_REQ_HEADER_SIZE];
    eggsfs_write_shuckle_req_header(shuckle_req, 0, EGGSFS_SHUCKLE_SHARDS);
    int written_so_far;
    // kernel_sendmsg may send partially; loop until the whole header is out.
    for (written_so_far = 0; written_so_far < sizeof(shuckle_req);) {
        iov.iov_base = shuckle_req + written_so_far;
        iov.iov_len = sizeof(shuckle_req) - written_so_far;
        int written = kernel_sendmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len);
        if (written < 0) { err = written; goto out_sock; }
        written_so_far += written;
    }

    char shards_resp_header[EGGSFS_SHUCKLE_RESP_HEADER_SIZE + 2]; // + 2 = list len
    int read_so_far;
    for (read_so_far = 0; read_so_far < sizeof(shards_resp_header);) {
        iov.iov_base = shards_resp_header + read_so_far;
        iov.iov_len = sizeof(shards_resp_header) - read_so_far;
        int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
        if (read < 0) { err = read; goto out_sock; }
        read_so_far += read;
    }
    u32 shuckle_resp_len;
    u8 shuckle_resp_kind;
    err = eggsfs_read_shuckle_resp_header(shards_resp_header, &shuckle_resp_len, &shuckle_resp_kind);
    if (err < 0) { goto out_sock; }
    // Sanity-check the response: exactly one shard info per possible shard id.
    u16 shard_info_len = get_unaligned_le16(shards_resp_header + sizeof(shards_resp_header) - 2);
    if (shard_info_len != 256) {
        eggsfs_info_print("expected 256 shard infos, got %d", shard_info_len);
        err = -EIO; goto out_sock;
    }
    if (shuckle_resp_len != 2 + EGGSFS_SHARD_INFO_SIZE*256) {
        eggsfs_info_print("expected size of %d, got %d", 2 + EGGSFS_SHARD_INFO_SIZE*256, shuckle_resp_len);
        err = -EIO; goto out_sock;
    }

    int shid;
    for (shid = 0; shid < 256; shid++) {
        char shard_info_resp[EGGSFS_SHARD_INFO_SIZE];
        for (read_so_far = 0; read_so_far < EGGSFS_SHARD_INFO_SIZE;) {
            iov.iov_base = shard_info_resp + read_so_far;
            iov.iov_len = sizeof(shard_info_resp) - read_so_far;
            int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
            if (read < 0) { err = read; goto out_sock; }
            read_so_far += read;
        }
        // Decode one bincode-encoded shard info (ip, port, last_seen).
        struct eggsfs_bincode_get_ctx ctx = {
            .buf = shard_info_resp,
            .end = shard_info_resp + sizeof(shard_info_resp),
            .err = 0,
        };
        eggsfs_shard_info_get_start(&ctx, start);
        eggsfs_shard_info_get_ip(&ctx, start, shard_ip);
        eggsfs_shard_info_get_port(&ctx, shard_ip, shard_port);
        eggsfs_shard_info_get_last_seen(&ctx, shard_port, last_seen);
        eggsfs_shard_info_get_end(&ctx, last_seen, end);
        eggsfs_shard_info_get_finish(&ctx, end);
        if (ctx.err != 0) { err = eggsfs_error_to_linux(ctx.err); goto out_sock; }

        // Pre-build the sockaddr/msghdr we'll use for UDP requests to this
        // shard (see the note in super.h about copying the msghdr).
        struct sockaddr_in* addr = &info->shards_addrs[shid];
        struct msghdr* hdr = &info->shards_msghdrs[shid];

        addr->sin_addr.s_addr = htonl(shard_ip.x);
        addr->sin_family = AF_INET;
        addr->sin_port = htons(shard_port.x);

        eggsfs_debug_print("shard %d has addr %pI4:%d", shid, &addr->sin_addr, ntohs(addr->sin_port));

        hdr->msg_name = addr;
        hdr->msg_namelen = sizeof(struct sockaddr_in);
        hdr->msg_control = NULL;
        hdr->msg_controllen = 0;
        hdr->msg_flags = 0;
    }

    // Same dance for the CDC (cross-directory coordinator): empty-bodied
    // request, fixed-size response.
    static_assert(EGGSFS_CDC_REQ_SIZE == 0);
    eggsfs_write_shuckle_req_header(shuckle_req, EGGSFS_SHARDS_REQ_SIZE, EGGSFS_SHUCKLE_CDC);
    for (written_so_far = 0; written_so_far < sizeof(shuckle_req);) {
        iov.iov_base = shuckle_req + written_so_far;
        iov.iov_len = sizeof(shuckle_req) - written_so_far;
        int written = kernel_sendmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len);
        if (written < 0) { err = written; goto out_sock; }
        written_so_far += written;
    }

    char cdc_resp_header[EGGSFS_SHUCKLE_RESP_HEADER_SIZE];
    for (read_so_far = 0; read_so_far < sizeof(cdc_resp_header);) {
        iov.iov_base = cdc_resp_header + read_so_far;
        iov.iov_len = sizeof(cdc_resp_header) - read_so_far;
        int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
        if (read < 0) { err = read; goto out_sock; }
        read_so_far += read;
    }
    err = eggsfs_read_shuckle_resp_header(cdc_resp_header, &shuckle_resp_len, &shuckle_resp_kind);
    if (err < 0) { goto out_sock; }
    if (shuckle_resp_len != EGGSFS_CDC_RESP_SIZE) {
        eggsfs_debug_print("expected size of %d, got %d", EGGSFS_CDC_RESP_SIZE, shuckle_resp_len);
        err = -EINVAL; goto out_sock;
    }
    {
        char cdc_resp[EGGSFS_CDC_RESP_SIZE];
        for (read_so_far = 0; read_so_far < sizeof(cdc_resp);) {
            iov.iov_base = (char*)&cdc_resp + read_so_far;
            iov.iov_len = sizeof(cdc_resp) - read_so_far;
            int read = kernel_recvmsg(shuckle_sock, &msg, &iov, 1, iov.iov_len, 0);
            if (read < 0) { err = read; goto out_sock; }
            read_so_far += read;
        }
        struct eggsfs_bincode_get_ctx ctx = {
            .buf = cdc_resp,
            .end = cdc_resp + sizeof(cdc_resp),
            .err = 0,
        };
        eggsfs_cdc_resp_get_start(&ctx, start);
        eggsfs_cdc_resp_get_ip(&ctx, start, cdc_ip);
        eggsfs_cdc_resp_get_port(&ctx, cdc_ip, cdc_port);
        eggsfs_cdc_resp_get_last_seen(&ctx, cdc_port, last_seen);
        eggsfs_cdc_resp_get_end(&ctx, last_seen, end);
        eggsfs_cdc_resp_get_finish(&ctx, end);
        if (ctx.err != 0) { err = eggsfs_error_to_linux(ctx.err); goto out_sock; }

        struct sockaddr_in* addr = &info->cdc_addr;
        struct msghdr* hdr = &info->cdc_msghdr;

        addr->sin_addr.s_addr = htonl(cdc_ip.x);
        addr->sin_family = AF_INET;
        addr->sin_port = htons(cdc_port.x);

        eggsfs_debug_print("CDC has addr %pI4:%d", &addr->sin_addr, ntohs(addr->sin_port));

        hdr->msg_name = addr;
        hdr->msg_namelen = sizeof(struct sockaddr_in);
        hdr->msg_control = NULL;
        hdr->msg_controllen = 0;
        hdr->msg_flags = 0;
    }

    // The shuckle connection is only needed at mount time.
    sock_release(shuckle_sock);

    eggsfs_info_print("mount successful");

    return info;

out_sock:
    sock_release(shuckle_sock);
out_info:
    eggsfs_free_fs_info(info);
out:
    return ERR_PTR(err);
}
|
||||
|
||||
// Super block teardown hook: frees the per-mount info built at mount time.
static void eggsfs_put_super(struct super_block* sb) {
    eggsfs_debug_print("sb=%p", sb);
    eggsfs_free_fs_info(sb->s_fs_info);
    sb->s_fs_info = NULL;
}

static const struct super_operations eggsfs_super_ops = {
    .alloc_inode = eggsfs_inode_alloc,
    .evict_inode = eggsfs_inode_evict,
    .free_inode = eggsfs_inode_free,

    .put_super = eggsfs_put_super,
};

// Mount entry point: builds fs_info from the shuckle registry named by
// `dev_name`, creates an anonymous superblock, fetches the root inode and
// its attributes, and hands back the root dentry.
//
// NOTE(review): on the out_sb paths s_root is still NULL, so
// deactivate_locked_super() will not invoke put_super and the fallthrough to
// eggsfs_free_fs_info() is what frees `info` — confirm this stays true if
// an error path is ever added after d_make_root() succeeds, otherwise info
// would be freed twice.
static struct dentry* eggsfs_mount(struct file_system_type* fs_type, int flags, const char* dev_name, void* data) {
    int err;

    eggsfs_info_print("mounting at %s", dev_name);
    eggsfs_debug_print("fs_type=%p flags=%d dev_name=%s data=%p", fs_type, flags, dev_name, data);

    struct eggsfs_fs_info* info = eggsfs_init_fs_info(dev_name);
    if (IS_ERR(info)) { err = PTR_ERR(info); goto out_err; }

    struct super_block* sb = sget(fs_type, NULL, set_anon_super, flags, NULL);
    if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out_info; }

    sb->s_fs_info = info;

    sb->s_flags = SB_NOSUID | SB_NODEV | SB_NOEXEC | SB_NOATIME | SB_NODIRATIME;
    sb->s_iflags = SB_I_NOEXEC | SB_I_NODEV;

    sb->s_op = &eggsfs_super_ops;
    sb->s_d_op = &eggsfs_dentry_ops;

    struct inode* root = eggsfs_get_inode(sb, EGGSFS_ROOT_INODE);
    if (IS_ERR(root)) { err = PTR_ERR(root); goto out_sb; }

    struct eggsfs_inode* root_enode = EGGSFS_I(root);

    // Fetch root attributes eagerly: the policies below are required to
    // write any file, so a mount without them is useless.
    err = eggsfs_do_getattr(root_enode);
    if (err) { goto out_sb; }

    BUG_ON(
        !root_enode->block_policies.len || !root_enode->span_policies.len || !root_enode->target_stripe_size
    );

    sb->s_root = d_make_root(root);
    if (!sb->s_root) { err = -ENOMEM; goto out_sb; }

    sb->s_flags |= SB_ACTIVE;

    return dget(sb->s_root);

out_sb:
    deactivate_locked_super(sb);
out_info:
    eggsfs_free_fs_info(info);
out_err:
    eggsfs_debug_print("failed err=%d", err);
    return ERR_PTR(err);
}
|
||||
|
||||
// Unmount hook; the superblock is anonymous (no backing device), so the
// generic anon teardown suffices (put_super frees our fs_info).
static void eggsfs_kill_sb(struct super_block* sb) {
    eggsfs_debug_print("sb=%p", sb);
    kill_anon_super(sb);
}

static struct file_system_type eggsfs_fs_type = {
    .owner = THIS_MODULE,
    .name = "eggsfs",
    .mount = eggsfs_mount,
    .kill_sb = eggsfs_kill_sb,
    // TODO is FS_RENAME_DOES_D_MOVE needed?
    .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("eggsfs");

// Registers the filesystem type; called from module init.
int __init eggsfs_fs_init(void) {
    return register_filesystem(&eggsfs_fs_type);
}

// Unregisters the filesystem type; called from module exit.
void __cold eggsfs_fs_exit(void) {
    unregister_filesystem(&eggsfs_fs_type);
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
#ifndef _EGGSFS_SUPER_H
#define _EGGSFS_SUPER_H

#include <linux/inet.h>

#include "net.h"

// Per-mount state, hung off super_block->s_fs_info. Populated once at mount
// time from the shuckle registry (see eggsfs_init_fs_info) and read-only
// afterwards.
struct eggsfs_fs_info {
    struct eggsfs_shard_socket sock;

    // NB: ->msg_iter is used by `kernel_sendmsg`, so when you want to use the `struct msghhdr`
    // stored here you should copy it to some local structure, and then use it, otherwise
    // multiple users might step on each others' toes.

    struct sockaddr_in shards_addrs[256]; // one address per shard id
    struct msghdr shards_msghdrs[256];    // prebuilt headers pointing at shards_addrs
    struct sockaddr_in cdc_addr;
    struct msghdr cdc_msghdr;
};

int __init eggsfs_fs_init(void);
void __cold eggsfs_fs_exit(void);

#endif
|
||||
|
||||
Executable
+4
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
set -eu -o pipefail
|
||||
|
||||
rsync -vaxAXL --include '*.py' --include '*.c' --include '*.h' --include 'Makefile' --exclude '*' ./ uovo:eggs-kmod/
|
||||
+106
@@ -0,0 +1,106 @@
|
||||
#include <linux/sysctl.h>

#include "dir.h"
#include "span.h"

// How many times a metadata request is retried before giving up; tunable
// via /proc/sys/fs/eggsfs/max_request_retries.
int eggsfs_max_request_retries = 3;

// Span-cache reclamation knobs, disabled until reclamation is implemented
// (see the module's outstanding-issues list).
#if 0
bool eggsfs_reclaim_span(void) { return false; }

static int eggsfs_drop_spancache_var;

// Write-only sysctl: any write drops the whole span cache.
static int eggsfs_drop_spancache_sysctl(struct ctl_table* table, int write, void __user* buffer, size_t* len, loff_t* ppos) {
    int ret;
    ret = proc_dointvec_minmax(table, write, buffer, len, ppos);
    if (ret)
        return ret;
    if (write) {
        int n_spans = 0;
        while (eggsfs_reclaim_span()) { ++n_spans; }
        printk(KERN_INFO "eggsfs: reclaimed %d spans\n", n_spans);
    }
    return 0;
}
#endif

// Entry for an unsigned-long tunable named after its variable.
#define EGGSFS_CTL_ULONG(_name) \
    { \
        .procname = #_name, \
        .data = &eggsfs_##_name, \
        .maxlen = sizeof(eggsfs_##_name), \
        .mode = 0644, \
        .proc_handler = proc_doulongvec_minmax, \
    }

// Entry for a jiffies-backed tunable exposed in milliseconds ("_ms" suffix).
#define EGGSFS_CTL_INT_TIME(_name) \
    { \
        .procname = #_name "_ms", \
        .data = &eggsfs_##_name, \
        .maxlen = sizeof(eggsfs_##_name), \
        .mode = 0644, \
        .proc_handler = proc_dointvec_ms_jiffies, \
    }

static struct ctl_table eggsfs_cb_sysctls[] = {
    {
        .procname = "max_request_retries",
        .data = &eggsfs_max_request_retries,
        .maxlen = sizeof(eggsfs_max_request_retries),
        .mode = 0644,
        .proc_handler = proc_dointvec,
    },

    EGGSFS_CTL_INT_TIME(dir_refresh_time),

#if 0
    EGGSFS_CTL_ULONG(spancache_max_size_async),
    EGGSFS_CTL_ULONG(spancache_min_avail_mem_async),
    EGGSFS_CTL_ULONG(spancache_max_size_sync),
    EGGSFS_CTL_ULONG(spancache_min_avail_mem_sync),

    {
        .procname = "drop_spancache",
        .data = &eggsfs_drop_spancache_var,
        .maxlen = sizeof(int),
        .mode = 0200,
        .proc_handler = eggsfs_drop_spancache_sysctl,
    },
#endif

    {}
};

// /proc/sys/fs/eggsfs/ directory.
static struct ctl_table eggsfs_cb_sysctl_dir[] = {
    {
        .procname = "eggsfs",
        .mode = 0555,
        .child = eggsfs_cb_sysctls,
    },
    { }
};

// /proc/sys/fs/ anchor for the table above.
static struct ctl_table eggsfs_cb_sysctl_root[] = {
    {
        .procname = "fs",
        .mode = 0555,
        .child = eggsfs_cb_sysctl_dir,
    },
    { }
};

static struct ctl_table_header* eggsfs_callback_sysctl_table;

// Registers the sysctl tree; called from module init.
int __init eggsfs_sysctl_init(void) {
    eggsfs_callback_sysctl_table = register_sysctl_table(eggsfs_cb_sysctl_root);
    if (eggsfs_callback_sysctl_table == NULL) {
        return -ENOMEM;
    }
    return 0;
}

// Unregisters the sysctl tree; called from module exit.
void __cold eggsfs_sysctl_exit(void) {
    unregister_sysctl_table(eggsfs_callback_sysctl_table);
    eggsfs_callback_sysctl_table = NULL;
}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef _EGGSFS_SYSCTL_H
#define _EGGSFS_SYSCTL_H

#include <linux/init.h>

// Registers/unregisters the eggsfs sysctl tree (/proc/sys/fs/eggsfs).
int __init eggsfs_sysctl_init(void);
void __cold eggsfs_sysctl_exit(void);

#endif
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
#include "sysfs.h"

#include <linux/kobject.h>
#include <linux/fs.h>

#include "namei.h"

// Root kset for /sys/fs/eggsfs.
static struct kset* eggsfs_kset;

// A read-only sysfs attribute backed by a per-CPU u64 counter.
struct eggsfs_sysfs_counter {
    struct kobj_attribute kobj_attr;
    u64 __percpu * ptr;
};

// show(): sums the per-CPU counter and prints it as a decimal.
static ssize_t eggsfs_sysfs_counter_show(struct kobject* kobj, struct kobj_attribute* attr, char* buf) {
    struct eggsfs_sysfs_counter* ettr = container_of(attr, struct eggsfs_sysfs_counter, kobj_attr);
    u64 val = eggsfs_counter_get(ettr->ptr);
    return snprintf(buf, PAGE_SIZE, "%llu\n", val);
}

// store(): counters are read-only; refuse writes (attrs are S_IRUGO anyway).
static ssize_t eggsfs_sysfs_counter_store(struct kobject *kobj, struct kobj_attribute *a, const char *buf, size_t count) {
    return -EPERM;
}

// All stat attributes are always visible; keep their declared mode.
static umode_t eggsfs_stat_visible(struct kobject *kobj, struct attribute *attr, int unused) {
    return attr->mode;
}

#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
    { \
        .attr = { .name = __stringify(_name), .mode = _mode }, \
        .show = _show, \
        .store = _store, \
    }

// Declares the sysfs wrapper for the per-CPU counter `eggsfs_stat_<name>`.
#define EGGSFS_SYSFS_COUNTER(_name) \
    static struct eggsfs_sysfs_counter eggsfs_sysfs_counter_##_name = { \
        .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, eggsfs_sysfs_counter_show, eggsfs_sysfs_counter_store), \
        .ptr = &eggsfs_stat_##_name, \
    }

#define EGGSFS_SYSFS_COUNTER_PTR(_name) (&eggsfs_sysfs_counter_##_name.kobj_attr.attr)

EGGSFS_SYSFS_COUNTER(dir_revalidations);

static struct attribute* eggsfs_stat_attrs[] = {
    EGGSFS_SYSFS_COUNTER_PTR(dir_revalidations),
    NULL
};

// Exposed as /sys/fs/eggsfs/stats/*.
static const struct attribute_group eggsfs_stat_attr_group = {
    .name = "stats",
    .is_visible = eggsfs_stat_visible,
    .attrs = eggsfs_stat_attrs,
};

// Creates /sys/fs/eggsfs and its stats group; called from module init.
int __init eggsfs_sysfs_init(void) {
    int err;

    eggsfs_kset = kset_create_and_add("eggsfs", NULL, fs_kobj);
    if (!eggsfs_kset) { return -ENOMEM; }

    err = sysfs_create_group(&eggsfs_kset->kobj, &eggsfs_stat_attr_group);
    if (err) { goto out_unreg; }

    return 0;

out_unreg:
    kset_unregister(eggsfs_kset);
    return err;
}

// Tears down /sys/fs/eggsfs; called from module exit.
void __cold eggsfs_sysfs_exit(void) {
    kset_unregister(eggsfs_kset);
}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef _EGGSFS_SYSFS_H
#define _EGGSFS_SYSFS_H

#include <linux/init.h>

// Creates/destroys the /sys/fs/eggsfs hierarchy.
int __init eggsfs_sysfs_init(void);
void __cold eggsfs_sysfs_exit(void);

#endif
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
// Instantiates the tracepoint bodies declared in trace.h: defining
// CREATE_TRACE_POINTS before the include must happen in exactly one
// translation unit.
#define CREATE_TRACE_POINTS
#include "trace.h"
||||
|
||||
+423
@@ -0,0 +1,423 @@
|
||||
#undef TRACE_SYSTEM
#define TRACE_SYSTEM eggsfs

// Standard tracepoint guard: the body must remain re-readable when
// TRACE_HEADER_MULTI_READ is set (trace/define_trace.h re-includes us).
// Fix: the tested macro previously was `_TRACE_EGGFS_H` (typo, missing "S")
// while the defined one was `_TRACE_EGGSFS_H`, so the guard never engaged
// and a second plain include would hit redefinitions.
#if !defined(_TRACE_EGGSFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EGGSFS_H

#include <linux/tracepoint.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <net/udp.h>
#include <net/sock.h>
#include <linux/socket.h>
#include <linux/pagemap.h>
#include <net/inet_common.h>

#include "inode.h"
#include "rs.h"
|
||||
|
||||
// Make the LOOKUP_* flag values visible to the tracing infrastructure so
// the __print_flags() output below decodes them symbolically.
TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
TRACE_DEFINE_ENUM(LOOKUP_PARENT);
TRACE_DEFINE_ENUM(LOOKUP_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_RCU);
TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_OPEN);
TRACE_DEFINE_ENUM(LOOKUP_CREATE);
TRACE_DEFINE_ENUM(LOOKUP_EXCL);
TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
TRACE_DEFINE_ENUM(LOOKUP_ROOT);
TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
TRACE_DEFINE_ENUM(LOOKUP_DOWN);

// Renders a lookup-flags bitmask as "FOLLOW|DIRECTORY|..." in trace output.
#define show_lookup_flags(flags) \
    __print_flags(flags, "|", \
        { LOOKUP_FOLLOW, "FOLLOW" }, \
        { LOOKUP_DIRECTORY, "DIRECTORY" }, \
        { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
        { LOOKUP_PARENT, "PARENT" }, \
        { LOOKUP_REVAL, "REVAL" }, \
        { LOOKUP_RCU, "RCU" }, \
        { LOOKUP_NO_REVAL, "NO_REVAL" }, \
        { LOOKUP_OPEN, "OPEN" }, \
        { LOOKUP_CREATE, "CREATE" }, \
        { LOOKUP_EXCL, "EXCL" }, \
        { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
        { LOOKUP_JUMPED, "JUMPED" }, \
        { LOOKUP_ROOT, "ROOT" }, \
        { LOOKUP_EMPTY, "EMPTY" }, \
        { LOOKUP_DOWN, "DOWN" })
||||
|
||||
|
||||
// Template generators for the common tracepoint shapes used below. Each one
// expands to a full TRACE_EVENT named eggsfs_<name>.

// Event that records just an inode number.
#define EGGSFS_TRACE_EVENT_inode(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* inode), \
        TP_ARGS(inode), \
        TP_STRUCT__entry( \
            __field(u64, ino) \
        ), \
        TP_fast_assign( \
            __entry->ino = inode->i_ino; \
        ), \
        TP_printk("enode=%#llx", __entry->ino) \
    )

// Inode number plus a result/error code.
#define EGGSFS_TRACE_EVENT_inode_ret(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* inode, int err), \
        TP_ARGS(inode, err), \
        TP_STRUCT__entry( \
            __field(u64, ino) \
            __field(int, err) \
        ), \
        TP_fast_assign( \
            __entry->ino = inode->i_ino; \
            __entry->err = err; \
        ), \
        TP_printk("enode=%#llx err=%d", __entry->ino, __entry->err) \
    )

// Parent directory inode plus the dentry name being operated on.
#define EGGSFS_TRACE_EVENT_dir_dentry(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* dir, struct dentry* dentry), \
        TP_ARGS(dir, dentry), \
        TP_STRUCT__entry( \
            __field(u64, dir) \
            __string(name, dentry->d_name.name) \
        ), \
        TP_fast_assign( \
            __entry->dir = dir->i_ino; \
            __assign_str(name, dentry->d_name.name); \
        ), \
        TP_printk("dir=%#llx name=%s", __entry->dir, __get_str(name)) \
    )

// As above, plus a result/error code.
#define EGGSFS_TRACE_EVENT_dir_dentry_ret(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* dir, struct dentry* dentry, int err), \
        TP_ARGS(dir, dentry, err), \
        TP_STRUCT__entry( \
            __field(u64, dir) \
            __field(int, err) \
            __string(name, dentry->d_name.name) \
        ), \
        TP_fast_assign( \
            __entry->dir = dir->i_ino; \
            __entry->err = err; \
            __assign_str(name, dentry->d_name.name); \
        ), \
        TP_printk("dir=%llx name=%s err=%d", __entry->dir, __get_str(name), __entry->err) \
    )

// Directory, dentry name, and the target inode (0 if a negative dentry).
#define EGGSFS_TRACE_EVENT_dir_dentry_inode(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* dir, struct dentry* dentry, struct inode* inode), \
        TP_ARGS(dir, dentry, inode), \
        TP_STRUCT__entry( \
            __field(u64, dir) \
            __field(u64, ino) \
            __string(name, dentry->d_name.name) \
        ), \
        TP_fast_assign( \
            __entry->dir = dir->i_ino; \
            __entry->ino = inode ? inode->i_ino : 0; \
            __assign_str(name, dentry->d_name.name); \
        ), \
        TP_printk("dir=%llx name=%s ino=%llx", __entry->dir, __get_str(name), __entry->ino) \
    )

// As above, plus a result/error code.
#define EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(_name) \
    TRACE_EVENT(eggsfs_##_name, \
        TP_PROTO(struct inode* dir, struct dentry* dentry, struct inode* inode, int err), \
        TP_ARGS(dir, dentry, inode, err), \
        TP_STRUCT__entry( \
            __field(u64, dir) \
            __field(u64, ino) \
            __field(int, err) \
            __string(name, dentry->d_name.name) \
        ), \
        TP_fast_assign( \
            __entry->dir = dir->i_ino; \
            __entry->ino = inode ? inode->i_ino : 0; \
            __entry->err = err; \
            __assign_str(name, dentry->d_name.name); \
        ), \
        TP_printk("dir=%llx name=%s ino=%llx err=%d", __entry->dir, __get_str(name), __entry->ino, __entry->err) \
    )
||||
|
||||
// Concrete events, grouped by the source file that fires them.

// dcache
EGGSFS_TRACE_EVENT_inode(dcache_delete_inode);
EGGSFS_TRACE_EVENT_inode(dcache_invalidate_dir);
EGGSFS_TRACE_EVENT_dir_dentry(dcache_invalidate_neg_entry);
EGGSFS_TRACE_EVENT_dir_dentry_inode(dcache_delete_entry);
EGGSFS_TRACE_EVENT_dir_dentry_inode(dcache_invalidate_entry);

// inode.c
EGGSFS_TRACE_EVENT_inode(vfs_getattr_enter);
EGGSFS_TRACE_EVENT_inode(vfs_getattr_lock);
EGGSFS_TRACE_EVENT_inode_ret(vfs_getattr_exit);

// dir.c
EGGSFS_TRACE_EVENT_inode(vfs_opendir_enter);
EGGSFS_TRACE_EVENT_inode_ret(vfs_opendir_exit);

EGGSFS_TRACE_EVENT_inode(vfs_closedir_enter);
EGGSFS_TRACE_EVENT_inode_ret(vfs_closedir_exit);

// namei.c
EGGSFS_TRACE_EVENT_dir_dentry(vfs_lookup_enter);
EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_lookup_exit);

EGGSFS_TRACE_EVENT_dir_dentry(vfs_mkdir_enter);
EGGSFS_TRACE_EVENT_dir_dentry_inode_ret(vfs_mkdir_exit);

EGGSFS_TRACE_EVENT_dir_dentry_inode(vfs_rmdir_enter);
EGGSFS_TRACE_EVENT_dir_dentry_ret(vfs_rmdir_exit);

EGGSFS_TRACE_EVENT_dir_dentry_inode(vfs_unlink_enter);
EGGSFS_TRACE_EVENT_dir_dentry_ret(vfs_unlink_exit);
||||
|
||||
TRACE_EVENT(eggsfs_vfs_rename_enter,
|
||||
TP_PROTO(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry),
|
||||
TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, old_dir_ino)
|
||||
__field(u64, new_dir_ino)
|
||||
__field(u64, ino)
|
||||
__string(old_parent, old_dentry->d_parent->d_name.name)
|
||||
__string(old_name, old_dentry->d_name.name)
|
||||
__string(new_parent, new_dentry->d_parent->d_name.name)
|
||||
__string(new_name, new_dentry->d_name.name)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->old_dir_ino = old_dir->i_ino;
|
||||
__entry->new_dir_ino = new_dir->i_ino;
|
||||
__entry->ino = old_dentry->d_inode->i_ino;
|
||||
__assign_str(old_parent, old_dentry->d_parent->d_name.name);
|
||||
__assign_str(old_name, old_dentry->d_name.name)
|
||||
__assign_str(new_parent, new_dentry->d_parent->d_name.name);
|
||||
__assign_str(new_name, new_dentry->d_name.name)
|
||||
),
|
||||
TP_printk("ino=%lld old_dir=%lld old_name=%s/%s -> new_dir=%lld new_name=%s/%s", __entry->ino, __entry->old_dir_ino, __get_str(old_parent), __get_str(old_name), __entry->new_dir_ino, __get_str(new_parent), __get_str(new_name))
|
||||
);
|
||||
|
||||
TRACE_EVENT(eggsfs_vfs_rename_exit,
|
||||
TP_PROTO(struct inode* old_dir, struct dentry* old_dentry, struct inode* new_dir, struct dentry* new_dentry, int err),
|
||||
TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, err),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, old_dir_ino)
|
||||
__field(u64, new_dir_ino)
|
||||
__field(u64, ino)
|
||||
__field(int, err)
|
||||
__string(old_parent, old_dentry->d_parent->d_name.name)
|
||||
__string(old_name, old_dentry->d_name.name)
|
||||
__string(new_parent, new_dentry->d_parent->d_name.name)
|
||||
__string(new_name, new_dentry->d_name.name)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->old_dir_ino = old_dir->i_ino;
|
||||
__entry->new_dir_ino = new_dir->i_ino;
|
||||
__entry->ino = old_dentry->d_inode->i_ino;
|
||||
__entry->err = err;
|
||||
__assign_str(old_parent, old_dentry->d_parent->d_name.name);
|
||||
__assign_str(old_name, old_dentry->d_name.name)
|
||||
__assign_str(new_parent, new_dentry->d_parent->d_name.name);
|
||||
__assign_str(new_name, new_dentry->d_name.name)
|
||||
),
|
||||
TP_printk("ino=%lld old_dir=%lld old_name=%s/%s -> new_dir=%lld new_name=%s/%s err=%d", __entry->ino, __entry->old_dir_ino, __get_str(old_parent), __get_str(old_name), __entry->new_dir_ino, __get_str(new_parent), __get_str(new_name), __entry->err)
|
||||
);
|
||||
|
||||
|
||||
// Fired when a metadata UDP request is sent; records the destination
// address (copied from the msghdr), the request id, payload length, the
// target shard, and the request kind.
TRACE_EVENT(eggsfs_metadata_request_enter,
    TP_PROTO(struct msghdr* msg, u64 req_id, u32 len, s16 shard_id, u8 kind),
    TP_ARGS(msg, req_id, len, shard_id, kind),

    TP_STRUCT__entry(
        __array(u8, addr, sizeof(struct sockaddr_in))
        __field(u64, req_id)
        __field(u32, len)
        __field(s16, shard_id) // -1 is used for CDC
        __field(u8, kind)
    ),
    TP_fast_assign(
        memcpy(__entry->addr, msg->msg_name, sizeof(struct sockaddr_in));
        __entry->req_id = req_id;
        __entry->len = len;
        __entry->shard_id = shard_id;
        __entry->kind = kind;
    ),
    TP_printk("dst=%pISp req_id=%llu shard_id=%d kind=%d len=%u", __entry->addr, __entry->req_id, __entry->shard_id, (int)__entry->kind, __entry->len)
);

// Fired when a metadata request completes (possibly after retries).
TRACE_EVENT(eggsfs_metadata_request_exit,
    TP_PROTO(u64 req_id, u32 n_attempts, u32 resp_len, int error),
    TP_ARGS(req_id, n_attempts, resp_len, error),

    TP_STRUCT__entry(
        __field(u64, req_id)
        __field(u32, n_attempts)
        __field(u32, resp_len)
        __field(int, error)
    ),
    TP_fast_assign(
        __entry->req_id = req_id;
        __entry->n_attempts = n_attempts;
        __entry->resp_len = resp_len;
        __entry->error = error;
    ),
    TP_printk("req_id=%llu n_attempts=%u resp_len=%u error=%d", __entry->req_id, __entry->n_attempts, __entry->resp_len, __entry->error)
);

// Inode lookup entry/exit pair (eggsfs_get_inode).
TRACE_EVENT(eggsfs_get_inode_enter,
    TP_PROTO(u64 ino),
    TP_ARGS(ino),

    TP_STRUCT__entry(
        __field(u64, ino)
    ),
    TP_fast_assign(
        __entry->ino = ino;
    ),
    TP_printk("ino=%lld", __entry->ino)
);

TRACE_EVENT(eggsfs_get_inode_exit,
    TP_PROTO(u64 ino, struct inode* inode, bool new, int error),
    TP_ARGS(ino, inode, new, error),

    TP_STRUCT__entry(
        __field(u64, ino)
        __field(struct inode*, inode)
        __field(bool, new)
        __field(int, error)
    ),
    TP_fast_assign(
        __entry->ino = ino;
        __entry->inode = inode;
        __entry->new = new;
        __entry->error = error;
    ),
    TP_printk("ino=%lld inode=%p new=%d error=%d", __entry->ino, __entry->inode, __entry->new, __entry->error)
);
|
||||
|
||||
TRACE_EVENT(eggsfs_dentry_handle_enoent,
|
||||
TP_PROTO(struct dentry* dentry),
|
||||
TP_ARGS(dentry),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, ino)
|
||||
__string(parent, dentry->d_parent->d_name.name)
|
||||
__string(name, dentry->d_name.name)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->ino = dentry->d_inode ? dentry->d_inode->i_ino : 0;
|
||||
__assign_str(name, dentry->d_parent->d_name.name);
|
||||
__assign_str(name, dentry->d_name.name);
|
||||
),
|
||||
TP_printk("ino=%lld name=%s/%s", __entry->ino, __get_str(parent), __get_str(name))
|
||||
);
|
||||
|
||||
TRACE_EVENT(eggsfs_block_write_enter,
|
||||
TP_PROTO(u64 block_service_id, u64 block_id, u32 size),
|
||||
TP_ARGS(block_service_id, block_id, size),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, block_service_id)
|
||||
__field(u64, block_id)
|
||||
__field(u32, size)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->block_service_id = block_service_id;
|
||||
__entry->block_id = block_id;
|
||||
__entry->size = size;
|
||||
),
|
||||
TP_printk("block_service=%016llx block_id=%016llx size=%u", __entry->block_service_id, __entry->block_id, __entry->size)
|
||||
);
|
||||
|
||||
TRACE_EVENT(eggsfs_block_write_exit,
|
||||
TP_PROTO(u64 block_service_id, u64 block_id, int err),
|
||||
TP_ARGS(block_service_id, block_id, err),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, block_service_id)
|
||||
__field(u64, block_id)
|
||||
__field(int, err)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->block_service_id = block_service_id;
|
||||
__entry->block_id = block_id;
|
||||
__entry->err = err;
|
||||
),
|
||||
TP_printk("block_service=%016llx block_id=%016llx err=%u", __entry->block_service_id, __entry->block_id, __entry->err)
|
||||
);
|
||||
|
||||
// Span flush entry: records the file, span offset/size, and the RS layout
// (parity byte is decoded into data/parity block counts by TP_printk via
// the helpers from rs.h).
TRACE_EVENT(eggsfs_span_flush_enter,
    TP_PROTO(u64 file_id, u64 offset, u32 size, u8 parity, u8 stripes, u8 storage_class),
    TP_ARGS( file_id, offset, size, parity, stripes, storage_class),

    TP_STRUCT__entry(
        __field(u64, file_id)
        __field(u64, offset)
        __field(u32, size)
        __field(u8, parity)
        __field(u8, stripes)
        __field(u8, storage_class)
    ),
    TP_fast_assign(
        __entry->file_id = file_id;
        __entry->offset = offset;
        __entry->size = size;
        __entry->parity = parity;
        __entry->stripes = stripes;
        __entry->storage_class = storage_class;
    ),
    TP_printk("file_id=%016llx offset=%llu size=%u parity=RS(%u,%u) stripes=%u, storage_class=%u", __entry->file_id, __entry->offset, __entry->size, eggsfs_data_blocks(__entry->parity), eggsfs_parity_blocks(__entry->parity), (int)__entry->stripes, (int)__entry->storage_class)
);

// Span flush completion with the resulting error code.
TRACE_EVENT(eggsfs_span_flush_exit,
    TP_PROTO(u64 file_id, u64 offset, u32 size, int err),
    TP_ARGS( file_id, offset, size, err),

    TP_STRUCT__entry(
        __field(u64, file_id)
        __field(u64, offset)
        __field(u32, size)
        __field(int, err)
    ),
    TP_fast_assign(
        __entry->file_id = file_id;
        __entry->offset = offset;
        __entry->size = size;
        __entry->err = err;
    ),
    TP_printk("file_id=%016llx offset=%llu size=%u err=%d", __entry->file_id, __entry->offset, __entry->size, __entry->err)
);

// Fired when a span is inserted into a file's span cache.
TRACE_EVENT(eggsfs_span_add,
    TP_PROTO(u64 file_id, u64 offset),
    TP_ARGS( file_id, offset),

    TP_STRUCT__entry(
        __field(u64, file_id)
        __field(u64, offset)
    ),
    TP_fast_assign(
        __entry->file_id = file_id;
        __entry->offset = offset;
    ),
    TP_printk("file_id=%016llx offset=%llu", __entry->file_id, __entry->offset)
);
|
||||
|
||||
/* NOTE(review): guard spells _TRACE_EGGFS_H (not EGGSFS) — confirm it
 * matches the #ifndef at the top of this header. */
#endif /* _TRACE_EGGFS_H */

/* Standard tracepoint trailer: tell define_trace.h to re-include this
 * file ("trace.h" in the current directory) to generate the event code. */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace

#include <trace/define_trace.h>
|
||||
|
||||
Executable
+76
@@ -0,0 +1,76 @@
|
||||
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
|
||||
|
||||
/* Report invalid command-line usage on stderr and exit with status 2. */
static void bad_usage() {
    fputs("bad usage\n", stderr);
    exit(2);
}
|
||||
|
||||
int main(int argc, const char** argv) {
|
||||
ssize_t file_size = -1;
|
||||
ssize_t buf_size = -1; // if -1, all in one go
|
||||
const char* filename = NULL;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "-buf-size") == 0) {
|
||||
if (i+1 >= argc) { bad_usage(); } i++;
|
||||
buf_size = strtoull(argv[i], NULL, 0);
|
||||
if (buf_size == ULLONG_MAX) {
|
||||
perror("bad bad_size");
|
||||
exit(1);
|
||||
}
|
||||
} else if (strcmp(argv[i], "-size") == 0) {
|
||||
if (i+1 >= argc) { bad_usage(); } i++;
|
||||
file_size = strtoull(argv[i], NULL, 0);
|
||||
if (file_size == ULLONG_MAX) {
|
||||
perror("bad bad_size");
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
if (filename != NULL) { bad_usage(); }
|
||||
filename = argv[i];
|
||||
}
|
||||
}
|
||||
if (buf_size < 0) { buf_size = file_size; }
|
||||
if (file_size < 0 || filename == NULL) { bad_usage(); }
|
||||
|
||||
printf("writing %ld bytes with bufsize %ld to %s\n", file_size, buf_size, filename);
|
||||
|
||||
int fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666);
|
||||
if (fd < 0) {
|
||||
perror("open");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
unsigned char* buffer = malloc(buf_size);
|
||||
if (buffer == NULL) {
|
||||
perror("malloc");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
while (file_size > 0) {
|
||||
ssize_t res = write(fd, buffer, file_size > buf_size ? buf_size : file_size);
|
||||
if (res < 0) {
|
||||
perror("write");
|
||||
exit(1);
|
||||
}
|
||||
file_size -= res;
|
||||
}
|
||||
|
||||
printf("finished writing, will now close\n");
|
||||
|
||||
if (close(fd) < 0) {
|
||||
perror("close");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("done.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -3,26 +3,28 @@ set -eu -o pipefail
|
||||
|
||||
echo "$(tput bold)building requisites$(tput sgr0)"
./cpp/build.py alpine rs crc32c # build libs for go
(cd go/msgs && go generate ./...)

echo "$(tput bold)go tests$(tput sgr0)"
(cd go && go test ./...)

./cpp/tests.sh

(cd kmod && make bincode_tests && ./bincode_tests)

(cd go/eggstests && go build .)

# Quote $(pwd) everywhere below so a checkout path containing whitespace
# does not word-split into multiple arguments (shellcheck SC2046).
echo "$(tput bold)integration tests$(tput sgr0)"
set -x
./go/eggstests/eggstests -repo-dir "$(pwd)"
set +x

echo "$(tput bold)integration tests, sanitized, packet drop$(tput sgr0)"
set -x
./go/eggstests/eggstests -repo-dir "$(pwd)" -build-type sanitized -outgoing-packet-drop 0.1 -short
set +x

echo "$(tput bold)integration tests, valgrind$(tput sgr0)"
set -x
./go/eggstests/eggstests -repo-dir "$(pwd)" -build-type valgrind -short
set +x
|
||||
|
||||
Reference in New Issue
Block a user