First version of kernel module

Initial version mostly by Pawel, but with many changes in between.

Big outstanding issues:

* span cache reclamation (unbounded memory otherwise...)
* bad block service detection and workarounds
* corrupted block detection and workarounds

Co-authored-by: Paweł Dziepak <pawel.dziepak@xtxmarkets.com>
Francesco Mazzoli
2022-12-01 15:09:50 +00:00
parent d94b582a4d
commit 6addbdee6a
96 changed files with 16355 additions and 2926 deletions

View File

@@ -16,8 +16,8 @@ if(NOT CMAKE_BUILD_TYPE)
)
endif()
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, got ${CMAKE_BUILD_TYPE}")
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|alpine-debug|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, alpine-debug got ${CMAKE_BUILD_TYPE}")
endif()
set(CMAKE_CXX_STANDARD 20)
@@ -33,13 +33,13 @@ add_compile_options("$<$<CONFIG:valgrind>:-march=haswell;-maes;-mgfni>")
add_compile_options("$<$<NOT:$<CONFIG:valgrind>>:-march=skylake;-mgfni>")
# performance/debug stuff
add_compile_options("$<$<NOT:$<CONFIG:debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug>:-O;-DEGGS_DEBUG>")
add_compile_options("$<$<NOT:$<CONFIG:debug,alpine-debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug,alpine-debug>:-O;-DEGGS_DEBUG>")
# We build the release build statically in Alpine
add_compile_options("$<$<CONFIG:alpine>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine>>:-no-pie>")
add_compile_options("$<$<CONFIG:alpine,alpine-debug>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine,alpine-debug>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine,alpine-debug>>:-no-pie>")
# sanitizer options
set(SANITIZE_OPTIONS "-fsanitize=undefined,address,integer,function;-fno-sanitize-recover=all;-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/ubsan-ignorelist")

View File

@@ -5,7 +5,7 @@ from pathlib import Path
import subprocess
if len(sys.argv) < 2:
print(f'Usage: {sys.argv[0]} release|alpine|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
print(f'Usage: {sys.argv[0]} release|alpine|alpine-debug|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
sys.exit(2)
if len(sys.argv) == 1:
@@ -19,9 +19,9 @@ repo_dir = cpp_dir.parent
build_dir = cpp_dir / 'build' / build_type
build_dir.mkdir(parents=True, exist_ok=True)
if build_type == 'alpine' and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
if build_type in ('alpine', 'alpine-debug') and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
subprocess.run(
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', 'alpine'] + sys.argv[2:],
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', build_type] + sys.argv[2:],
check=True,
)
else:

View File

@@ -264,7 +264,6 @@ struct MakeDirectoryStateMachine {
void createDirectoryInode() {
auto& shardReq = env.needsShard(MAKE_DIRECTORY_CREATE_DIR, state.dirId().shard()).setCreateDirectoryInode();
shardReq.id = state.dirId();
shardReq.info = req.info;
shardReq.ownerId = req.ownerId;
}
@@ -738,6 +737,7 @@ struct SoftUnlinkDirectoryStateMachine {
shardReq.name = req.name;
shardReq.targetId = req.targetId;
shardReq.wasMoved = false;
shardReq.creationTime = req.creationTime;
}
void afterRollback(EggsError err, const ShardRespContainer* resp) {

View File

@@ -3,26 +3,6 @@
std::ostream& operator<<(std::ostream& out, const BincodeBytesRef& x) {
return goLangBytesFmt(out, x.data(), x.size());
/*
out << "b\"";
uint8_t len = x.size();
const uint8_t* data = (const uint8_t*)x.data();
for (int i = 0; i < len; i++) {
uint8_t ch = data[i];
if (isprint(ch)) {
out << ch;
} else if (ch == 0) {
out << "\\0";
} else {
const char cfill = out.fill();
out << std::hex << std::setfill('0');
out << "\\x" << std::setw(2) << (int)ch;
out << std::setfill(cfill) << std::dec;
}
}
out << "\"";
return out;
*/
}
std::ostream& operator<<(std::ostream& out, const BincodeBytes& x) {

View File

@@ -4,6 +4,7 @@
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
#include <iomanip>
// Throwing in static initialization is nasty, and there is no useful stacktrace
// Also use direct syscalls to write the error as iostream might not be initialized
@@ -48,4 +49,27 @@ std::ostream& goLangBytesFmt(std::ostream& out, const char* str, size_t len) {
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes) {
return goLangBytesFmt(out, bytes.str, bytes.len);
}
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* data, size_t len) {
out << "\"";
for (int i = 0; i < len; i++) {
uint8_t ch = data[i];
if (isprint(ch)) {
out << ch;
} else if (ch == 0) {
out << "\\0";
} else {
const char cfill = out.fill();
out << std::hex << std::setfill('0');
out << "\\x" << std::setw(2) << (int)ch;
out << std::setfill(cfill) << std::dec;
}
}
out << "\"";
return out;
}
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes) {
return goLangQuotedStringFmt(out, bytes.str, bytes.len);
}

View File

@@ -52,4 +52,15 @@ struct GoLangBytesFmt {
GoLangBytesFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* str, size_t len);
struct GoLangQuotedStringFmt {
const char* str;
size_t len;
GoLangQuotedStringFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes);
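
For reference, a minimal usage sketch of the new quoted-string formatter (the sample program and string are made up; it assumes the GoLangQuotedStringFmt declarations above are visible through their header):

#include <iostream>
#include <string>
// assumes the GoLangQuotedStringFmt declarations from the header above are in scope
int main() {
    std::string s("ab\x01z", 4);
    // printable bytes pass through, NUL prints as \0, everything else as \xNN,
    // so this prints: "ab\x01z"
    std::cout << GoLangQuotedStringFmt(s.data(), s.size()) << std::endl;
    return 0;
}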

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
add_library(crc32c crc32c.h crc32c.cpp iscsi.hpp)
add_library(crc32c crc32c.h crc32c.c iscsi.h)
add_executable(crc32c-tables tables.cpp iscsi.hpp)
add_executable(crc32c-tables tables.cpp iscsi.h)
add_executable(crc32c-tests tests.cpp)
target_link_libraries(crc32c-tests PRIVATE crc32c)

View File

@@ -1,11 +1,29 @@
#ifndef __KERNEL__
#include "crc32c.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#define kernel_static
typedef uint8_t u8;
typedef uint32_t u32;
#else
#define kernel_static static
#endif
#ifdef __clang__
__attribute__((no_sanitize("integer")))
static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_blocks) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32_fusion_kernel(u32 acc_a, const char* buf, size_t n_blocks) {
size_t stride = n_blocks * 24 + 8;
// Four chunks:
// Chunk A: 0 through stride
@@ -18,8 +36,8 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
__m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
__m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
__m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
uint32_t acc_b = 0;
uint32_t acc_c = 0;
u32 acc_b = 0;
u32 acc_c = 0;
// Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
// k1 == magic(4*128+32-1)
// k2 == magic(4*128-32-1)
@@ -100,16 +118,16 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
stack_a = ~stack_a;
stack_b = ~stack_b;
stack_c = ~stack_c;
uint32_t magic_a = ((uint32_t)0x80000000) >> (bits_a & 31); bits_a >>= 5;
uint32_t magic_b = ((uint32_t)0x80000000) >> (bits_b & 31); bits_b >>= 5;
uint32_t magic_c = ((uint32_t)0x80000000) >> (bits_c & 31); bits_c >>= 5;
u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
bits_a -= bits_b;
bits_b -= bits_c;
for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
for (;;) {
uint32_t low = stack_a & 1;
u32 low = stack_a & 1;
if (!(stack_a >>= 1)) break;
__m128i x = _mm_cvtsi32_si128(magic_a);
uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
@@ -130,13 +148,18 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
x1 = _mm_xor_si128(x1, x5);
uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
// Apply missing <<32 and fold down to 32-bits.
uint32_t crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
return crc;
}
#ifdef __clang__
__attribute__((no_sanitize("integer")))
uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c(u32 crc, const char* buf, size_t length) {
crc = ~crc; // preset to -1 (distinguish leading zeros)
if (length >= 31) {
size_t n_blocks = (length - 16) / 136;
@@ -149,7 +172,7 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
const char* kernel_start = kernel_end - kernel_length;
length -= kernel_start - buf;
for (; buf != kernel_start; ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
if (n_blocks) {
length -= kernel_length;
@@ -161,17 +184,34 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
}
for (; length; --length, ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
return ~crc; // post-invert -- distinguish scaled multiples
}
#include "iscsi.hpp"
#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c_simple(u32 crc, const char* buf, size_t length) {
crc = ~crc; // preset to -1 (distinguish leading zeros)
for (; length >= 8; length -= 8, buf += 8) {
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
}
for (; length; --length, ++buf) {
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
return ~crc; // post-invert -- distinguish scaled multiples
}
#include "iscsi.h"
// In the comments below, multiplication is meant to be modulo ISCSI_POLY.
// Stores x^2^0, ..., x^2^31. Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_POWER_TABLE[31] = {
static u32 CRC_POWER_TABLE[31] = {
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a,
@@ -182,7 +222,7 @@ static uint32_t CRC_POWER_TABLE[31] = {
};
// Stores x^-(2^0), ..., x^-(2^31). Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
static u32 CRC_INVERSE_POWER_TABLE[31] = {
0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96,
0xcedac2cc, 0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916,
@@ -193,7 +233,7 @@ static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
};
// Return x^(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p(size_t n, uint32_t k) {
static u32 x2n_mod_p(size_t n, u32 k) {
// CRC_POWER_TABLE has all powers of two; we can multiply combinations
// of these to achieve any power.
@@ -210,7 +250,7 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
// We walk along the p_is above and keep adding to the result.
//
// Note that 2^2^(31 + k) = 2^2^k TODO clarify
uint32_t p = 1u << 31;
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
// p(x) = p(x) * 2^(k % 31)
@@ -221,8 +261,8 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
}
// Return x^-(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
uint32_t p = 1u << 31;
static u32 x2n_mod_p_inv(size_t n, u32 k) {
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
p = crc32c_mult_mod_p(CRC_INVERSE_POWER_TABLE[k & 0x1F], p);
@@ -231,7 +271,7 @@ static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
return p;
}
uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
kernel_static u32 crc32c_zero_extend(u32 crc, ssize_t zeros) {
if (zeros > 0) {
return ~crc32c_mult_mod_p(x2n_mod_p(zeros, 3), ~crc);
} else {
@@ -239,17 +279,17 @@ uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
}
}
uint32_t crc32c_append(uint32_t crc_a, uint32_t crc_b, size_t len_b) {
kernel_static u32 crc32c_append(u32 crc_a, u32 crc_b, size_t len_b) {
// We need to extend crc_a with len_b*8 zeros (len_b is the number
// of bytes) (without the inversion).
// This amounts to performing `crc_a * x^(len_b*8)`.
return crc32c_mult_mod_p(x2n_mod_p(len_b, 3), crc_a) ^ crc_b;
}
uint32_t crc32c_xor(uint32_t crc_a, uint32_t crc_b, size_t len) {
kernel_static u32 crc32c_xor(u32 crc_a, u32 crc_b, size_t len) {
// We need to extend crc_a with the crc of len*8 bits.
// We could do this in 32 steps rather than 32+32 with a dedicated
// table, but probably doesn't matter.
uint32_t crc_0 = ~crc32c_mult_mod_p(~(uint32_t)0, x2n_mod_p(len, 3));
u32 crc_0 = ~crc32c_mult_mod_p(~(u32)0, x2n_mod_p(len, 3));
return crc_a ^ crc_b ^ crc_0;
}
}
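
As a sanity check of the composition helpers above, a small userspace sketch (it assumes the non-kernel build of this file is linked in, and that crc32c.h declares crc32c, crc32c_append and crc32c_zero_extend):

#include <cassert>
#include <cstdint>
#include <string>
#include "crc32c.h" // assumed to declare crc32c, crc32c_append, crc32c_zero_extend

int main() {
    std::string a = "hello, ", b = "world";
    uint32_t crc_a = crc32c(0, a.data(), a.size());
    uint32_t crc_b = crc32c(0, b.data(), b.size());
    // crc(A||B) can be computed from crc(A), crc(B) and |B| alone
    std::string ab = a + b;
    assert(crc32c_append(crc_a, crc_b, b.size()) == crc32c(0, ab.data(), ab.size()));
    // appending zeros goes through the power table instead of touching the bytes...
    std::string az = a + std::string(16, '\0');
    assert(crc32c_zero_extend(crc_a, 16) == crc32c(0, az.data(), az.size()));
    // ...and can be undone with a negative count (inverse power table)
    assert(crc32c_zero_extend(crc32c_zero_extend(crc_a, 16), -16) == crc_a);
    return 0;
}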

cpp/crc32c/iscsi.h Normal file (24 lines)
View File

@@ -0,0 +1,24 @@
#define ISCSI_POLY 0x82F63B78u
// Return a(x) multiplied by b(x) modulo ISCSI_POLY. For speed, this requires
// that a(x) not be zero.
static u32 crc32c_mult_mod_p(u32 a, u32 b) {
// m goes from x^0 to x^31
u32 m = 1u << 31;
u32 p = 0;
for (;;) {
// If a(x) contains x^n, add b(x)*x^n
if (a & m) {
p ^= b;
// Exit when there are no higher bits in a(x)
if ((a & (m - 1)) == 0) {
break;
}
}
// Go from x^n to x^(n+1)
m >>= 1;
// Go from b(x)*x^n to b(x)*x^(n+1)
b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1;
}
return p;
}
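
A quick check of the multiply helper above (userspace sketch; iscsi.h expects the includer to provide the u32 typedef, exactly as crc32c.c does). In this bit-reflected representation x^0 is 0x80000000, so it acts as the multiplicative identity, and repeated squaring steps from x^(2^i) to x^(2^(i+1)), which is how CRC_POWER_TABLE can be regenerated:

#include <cassert>
#include <cstdint>
typedef uint32_t u32; // iscsi.h relies on the includer providing u32
#include "iscsi.h"

int main() {
    const u32 one = 0x80000000u;   // x^0 in the reflected representation
    assert(crc32c_mult_mod_p(one, 0x12345678u) == 0x12345678u);
    // squaring: x^(2^(i+1)) == (x^(2^i))^2, matching the first CRC_POWER_TABLE entries
    u32 p = 0x40000000u;           // x^1 == CRC_POWER_TABLE[0]
    p = crc32c_mult_mod_p(p, p);   // x^2
    assert(p == 0x20000000u);      //     == CRC_POWER_TABLE[1]
    p = crc32c_mult_mod_p(p, p);   // x^4
    assert(p == 0x08000000u);      //     == CRC_POWER_TABLE[2]
    return 0;
}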

View File

@@ -30,6 +30,7 @@ int main() {
for (int i = 0; i < 1000; i++) {
auto s1 = randString(1 + wyhash64(&rand)%100);
uint32_t crc1 = crc32c(0, (const char*)s1.data(), s1.size());
ASSERT(crc1 == crc32c_append(0, crc1, s1.size()));
auto s2 = randString(1 + wyhash64(&rand)%100);
uint32_t crc2 = crc32c(0, (const char*)s2.data(), s2.size());
std::vector<uint8_t> s = s1;

View File

@@ -1,4 +1,4 @@
add_library(rs rs.h rs.cpp gf.hpp gf_tables.cpp)
add_library(rs rs.h rs.cpp gf_tables.c)
add_executable(rs-tests tests.cpp)
target_link_libraries(rs-tests PRIVATE rs)

View File

@@ -1,110 +0,0 @@
#pragma once
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
extern const uint8_t rs_gf_inv_table[256];
extern const uint8_t rs_gf_log_table[256];
extern const uint8_t rs_gf_exp_table[256];
inline uint8_t gf_inv(uint8_t x) {
return rs_gf_inv_table[x];
}
inline uint8_t gf_mul(uint8_t x, uint8_t y) {
if (x == 0 || y == 0) {
return 0;
}
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
return rs_gf_exp_table[i > 254 ? i - 255 : i];
}
inline void gf_mul_expand_factor(uint8_t x, uint8_t* expanded_x) {
for (int i = 0; i < 16; i++) {
expanded_x[i] = gf_mul(i, x);
}
for (int i = 0; i < 16; i++) {
expanded_x[16 + i] = gf_mul(i << 4, x);
}
}
inline uint8_t gf_mul_expanded(uint8_t x, const uint8_t* expanded_y) {
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}
inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
return _mm256_xor_si256(
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
);
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// to have the final operation be more natural in rs_recover.
__attribute__((noinline))
static bool rs_gf_invert_matrix(uint8_t* in_mat, uint8_t* out_mat, const int n) {
int i, j, k;
uint8_t temp;
// Set out_mat[] to the identity matrix
memset(out_mat, 0, n*n);
for (i = 0; i < n; i++) {
out_mat[i * n + i] = 1;
}
// Inverse
for (i = 0; i < n; i++) {
// Check for 0 in pivot element
if (in_mat[i * n + i] == 0) {
// Find a row with non-zero in current column and swap
for (j = i + 1; j < n; j++) {
if (in_mat[j * n + i]) {
break;
}
}
if (j == n) { // Couldn't find means it's singular
return false;
}
for (k = 0; k < n; k++) { // Swap rows i,j
temp = in_mat[i * n + k];
in_mat[i * n + k] = in_mat[j * n + k];
in_mat[j * n + k] = temp;
temp = out_mat[i * n + k];
out_mat[i * n + k] = out_mat[j * n + k];
out_mat[j * n + k] = temp;
}
}
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
}
for (j = 0; j < n; j++) {
if (j == i) {
continue;
}
temp = in_mat[j * n + i];
for (k = 0; k < n; k++) {
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
}
}
}
return true;
}

View File

@@ -1,7 +1,9 @@
// generated with gf_tables.py
#ifndef __KERNEL__
#include <stdint.h>
#endif
extern const uint8_t rs_gf_exp_table[256] = {
const uint8_t rs_gf_exp_table[256] = {
0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
@@ -20,7 +22,7 @@ extern const uint8_t rs_gf_exp_table[256] = {
0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01,
};
extern const uint8_t rs_gf_log_table[256] = {
const uint8_t rs_gf_log_table[256] = {
0x00, 0xff, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
@@ -39,7 +41,7 @@ extern const uint8_t rs_gf_log_table[256] = {
0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
};
extern const uint8_t rs_gf_inv_table[256] = {
const uint8_t rs_gf_inv_table[256] = {
0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7,
0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2,
0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
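
The three tables above implement GF(2^8) arithmetic via discrete logs (the AES field, which is also what the GFNI instructions multiply in). A tiny consistency check, as a sketch: it links against gf_tables.c and spells out gf_mul the same way rs_core.c does below:

#include <cassert>
#include <cstdint>

// the tables are defined in gf_tables.c
extern const uint8_t rs_gf_inv_table[256];
extern const uint8_t rs_gf_log_table[256];
extern const uint8_t rs_gf_exp_table[256];

// same formula as gf_mul in rs_core.c: x*y = exp[(log x + log y) mod 255]
static uint8_t gf_mul(uint8_t x, uint8_t y) {
    if (x == 0 || y == 0) { return 0; }
    int i = rs_gf_log_table[x] + rs_gf_log_table[y];
    return rs_gf_exp_table[i > 254 ? i - 255 : i];
}

int main() {
    for (int x = 1; x < 256; x++) {
        assert(gf_mul(x, 1) == x);                  // 1 is the multiplicative identity
        assert(gf_mul(x, rs_gf_inv_table[x]) == 1); // inv table really is the inverse
    }
    return 0;
}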

View File

@@ -6,108 +6,10 @@
#include <array>
#include "rs.h"
#include "gf.hpp"
#define die(...) do { fprintf(stderr, __VA_ARGS__); raise(SIGABRT); } while(false)
static void* malloc_or_die(size_t size, const char* what) {
void* ptr = malloc(size);
if (ptr == nullptr) {
die(what);
}
return ptr;
}
struct rs {
uint8_t parity;
// uint8_t[D*B], in column-major.
uint8_t* matrix;
// uint8_t[D*P][32], in column-major. These are the lookup tables
// to perform multiplication quickly.
uint8_t* expanded_matrix;
};
static struct rs* rs_cached[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
static void rs_cauchy_matrix(struct rs* r) {
int D = rs_data_blocks(r->parity);
int B = rs_blocks(r->parity);
uint8_t* matrix = r->matrix;
memset(matrix, 0, D*B);
// Identity in the d*d upper half
for (int i = 0; i < D; i++) {
matrix[D*i + i] = 1;
}
// Fill in the rest using cauchy
for (int col = D; col < B; col++) {
for (int row = 0; row < D; row++) {
matrix[col*D + row] = gf_inv(col ^ row);
}
}
// Scale the columns
for (int col = D; col < B; col++) {
uint8_t factor = gf_inv(matrix[col*D]);
for (int row = 0; row < D; row++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
// Scale the rows
for (int row = 1; row < D; row++) {
uint8_t factor = gf_inv(matrix[D*D + row]);
for (int col = D; col < B; col++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
}
static struct rs* rs_new(uint8_t parity) {
int B = rs_blocks(parity);
int D = rs_data_blocks(parity);
int P = rs_parity_blocks(parity);
struct rs* r = (struct rs*)malloc_or_die(sizeof(struct rs) + B*D + D*P*32, "rs_new\n");
r->parity = parity;
r->matrix = (uint8_t*)(r + 1);
r->expanded_matrix = r->matrix + B*D;
rs_cauchy_matrix(r);
for (int p = 0; p < P; p++) {
for (int d = 0; d < D; d++) {
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
}
}
return r;
}
static void rs_delete(struct rs* r) {
free(r);
}
static std::array<uint32_t, 4> rs_cpuidex(uint32_t function_id, uint32_t subfunction_id) {
uint32_t a, b, c, d;
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
return {a, b, c, d};
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
#define rs_malloc malloc
#define rs_free free
// See `valgrind.h`
static uint64_t rs_valgrind_client_request(uint64_t defaultResult, uint64_t reqID, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) {
@@ -129,20 +31,48 @@ static bool rs_detect_valgrind() {
return rs_valgrind_client_request(0, 0x1001, 0, 0, 0, 0, 0);
}
bool rs_has_cpu_level(rs_cpu_level level) {
const auto cpuid7 = (rs_cpuidex(0, 0)[0] >= 7) ? rs_cpuidex(7, 0) : std::array<uint32_t, 4>{0, 0, 0, 0};
switch (level) {
case RS_CPU_SCALAR:
return true;
case RS_CPU_AVX2:
return cpuid7[1] & (1<<5);
case RS_CPU_GFNI:
return cpuid7[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
static inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
}
#include "rs_core.c"
uint8_t rs_parity(struct rs* r) {
return r->parity;
}
static struct rs* rs_cached[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
bool rs_has_cpu_level(rs_cpu_level level) {
return rs_has_cpu_level_core(level);
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
__attribute__((constructor))
void rs_detect_cpu_level() {
if (rs_has_cpu_level(RS_CPU_GFNI)) {
@@ -172,124 +102,46 @@ struct rs* rs_get(uint8_t parity) {
}
struct rs* r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
if (__builtin_expect(r == nullptr, 0)) {
r = rs_new(parity);
r = rs_new_core(parity);
if (r == nullptr) {
die("could not allocate RS data");
}
struct rs* expected = nullptr;
if (!__atomic_compare_exchange_n(&rs_cached[parity], &expected, r, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// somebody else got to it first
rs_delete(r);
rs_delete_core(r);
r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
}
}
return r;
}
uint8_t rs_parity(struct rs* r) {
return r->parity;
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_scalar_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_scalar(D, P, r, size, data, parity);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_avx2_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_avx2(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_single(struct rs* r, uint64_t i, const uint8_t** data, uint8_t** parity) {
parity[0][i] = 0;
for (int d = 0; d < D; d++) {
parity[0][i] ^= data[d][i];
}
for (int p = 1; p < P; p++) {
const uint8_t* factor = &r->expanded_matrix[D*32*p];
parity[p][i] = 0;
for (int d = 0; d < D; d++, factor += 32) {
parity[p][i] ^= gf_mul_expanded(data[d][i], factor);
}
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_scalar(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
// parity = r->matrix * data
for (uint64_t i = 0; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_avx2(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
__m256i low_nibble_mask = broadcast_u8(0x0f);
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
{
__m256i parity_0 = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
}
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
}
for (int p = 1; p < P; p++) {
__m256i parity_p = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]);
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask));
}
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
}
}
for (uint64_t i = avx_size; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_gfni(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
{
__m256i parity_0 = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
}
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
}
for (int p = 1; p < P; p++) {
__m256i parity_p = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]);
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor));
}
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
}
}
for (uint64_t i = avx_size; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_gfni_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_gfni(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_compute_parity_scalar<D, P>(r, size, data, parity);
rs_compute_parity_scalar_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_AVX2:
rs_compute_parity_avx2<D, P>(r, size, data, parity);
rs_compute_parity_avx2_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_GFNI:
rs_compute_parity_gfni<D, P>(r, size, data, parity);
rs_compute_parity_gfni_tmpl<D, P>(r, size, data, parity);
break;
default:
die("bad cpu_level %d\n", l);
@@ -301,101 +153,32 @@ void rs_compute_parity(struct rs* r, uint64_t size, const uint8_t** data, uint8_
rs_compute_parity_funcs[r->parity](r, size, data, parity);
}
template<int D>
static void rs_recover_matmul_single(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul(have_to_want[j], have[j][i]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_scalar_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_scalar(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_single_expanded(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want_expanded) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_avx2_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_avx2(D, size, have, want, mat);
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_scalar(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
uint8_t have_to_want_expanded[D*32];
for (int i = 0; i < D; i++) {
gf_mul_expand_factor(have_to_want[i], &have_to_want_expanded[i*32]);
}
for (size_t i = 0; i < size; i++) {
rs_recover_matmul_single_expanded<D>(i, have, want, have_to_want_expanded);
}
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_avx2(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
__m256i have_to_want_expanded[D];
for (int i = 0; i < D; i++) {
gf_mul_expand_factor(have_to_want[i], (uint8_t*)&have_to_want_expanded[i]);
}
__m256i low_nibble_mask = broadcast_u8(0x0f);
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
__m256i want_i = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
want_i = _mm256_xor_si256(
want_i,
gf_mul_expanded_avx2(
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
have_to_want_expanded[d],
low_nibble_mask
)
);
}
_mm256_storeu_si256((__m256i*)(want + i), want_i);
}
for (uint64_t i = avx_size; i < size; i++) {
rs_recover_matmul_single_expanded<D>(i, have, want, (const uint8_t*)have_to_want_expanded);
}
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_gfni(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
__m256i have_to_want_avx[D];
for (int i = 0; i < D; i++) {
have_to_want_avx[i] = broadcast_u8(have_to_want[i]);
}
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
__m256i want_i = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
want_i = _mm256_xor_si256(
want_i,
_mm256_gf2p8mul_epi8(
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
have_to_want_avx[d]
)
);
}
_mm256_storeu_si256((__m256i*)(want + i), want_i);
}
for (uint64_t i = avx_size; i < size; i++) {
rs_recover_matmul_single<D>(i, have, want, have_to_want);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_gfni_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_gfni(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_recover_matmul_scalar<D>(size, have, want, mat);
rs_recover_matmul_scalar_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_AVX2:
rs_recover_matmul_avx2<D>(size, have, want, mat);
rs_recover_matmul_avx2_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_GFNI:
rs_recover_matmul_gfni<D>(size, have, want, mat);
rs_recover_matmul_gfni_tmpl<D>(size, have, want, mat);
break;
default:
die("bad cpu_level %d\n", l);
@@ -403,7 +186,6 @@ static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t*
}
static void (*rs_recover_matmul_funcs[16])(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat);
void rs_recover(
struct rs* r,
uint64_t size,
@@ -412,55 +194,12 @@ void rs_recover(
uint8_t want_block,
uint8_t* want
) {
int D = rs_data_blocks(r->parity);
int B = rs_blocks(r->parity);
// Create some space
uint8_t* scratch = (uint8_t*)malloc(D*D + D*D);
uint8_t* mat_1 = scratch;
uint8_t* mat_2 = scratch + D*D;
// Preliminary checks
for (int i = 0; i < D; i++) {
if (have_blocks[i] >= B) {
die("have_blocks[%d]=%d >= %d\n", i, have_blocks[i], B);
rs_recover_core(
r, size, have_blocks, have, want_block, want,
[](int D, uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_funcs[D](size, have, want, mat);
}
if (have_blocks[i] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", i, have_blocks[i], want_block);
}
if (i > 0 && have_blocks[i] <= have_blocks[i-1]) {
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", i, have_blocks[i], i, have_blocks[i-1]);
}
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
uint8_t* data_to_have = mat_1;
for (int i = 0, have_cursor = 0; i < B; i++) {
if (have_cursor >= D || have_blocks[have_cursor] != i) {
continue;
}
memcpy(data_to_have + have_cursor*D, r->matrix + i*D, D);
have_cursor++;
}
// [DxD] matrix going from what we have to the original data blocks
uint8_t* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
}
data_to_have = nullptr;
// [Dx1] matrix going from the data blocks to the block we want
uint8_t* data_to_want = &r->matrix[want_block*D];
// have_to_want = data_to_want * have_to_data
// [Dx1] matrix going from `blocks` to the block we're into
uint8_t* have_to_want = mat_1;
for (int i = 0; i < D; i++) {
have_to_want[i] = 0;
for (int j = 0; j < D; j++) {
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
}
}
// want = have_to_want * have
rs_recover_matmul_funcs[D](size, have, want, have_to_want);
// We're done.
free(scratch);
);
}
__attribute__((constructor))
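
A note on the gf_mul_expand_factor/gf_mul_expanded pair that moves into rs_core.c below: multiplication by a fixed y is GF(2)-linear, so x*y = (x & 0x0f)*y XOR (x & 0xf0)*y, and the 32-byte expanded factor simply tabulates those two nibble products; the AVX2 path then does the whole multiply with two byte shuffles. A scalar sketch of that identity (table externs from gf_tables.c, gf_mul written out as in rs_core.c):

#include <cassert>
#include <cstdint>

extern const uint8_t rs_gf_log_table[256]; // from gf_tables.c
extern const uint8_t rs_gf_exp_table[256];

static uint8_t gf_mul(uint8_t x, uint8_t y) { // as in rs_core.c
    if (x == 0 || y == 0) { return 0; }
    int i = rs_gf_log_table[x] + rs_gf_log_table[y];
    return rs_gf_exp_table[i > 254 ? i - 255 : i];
}

int main() {
    for (int y = 0; y < 256; y++) {
        // what gf_mul_expand_factor builds: products of y with every low and high nibble
        uint8_t expanded_y[32];
        for (int i = 0; i < 16; i++) expanded_y[i] = gf_mul(i, y);
        for (int i = 0; i < 16; i++) expanded_y[16 + i] = gf_mul(i << 4, y);
        for (int x = 0; x < 256; x++) {
            // what gf_mul_expanded computes: two table lookups XORed together
            uint8_t via_nibbles = expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
            assert(via_nibbles == gf_mul(x, y));
        }
    }
    return 0;
}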

cpp/rs/rs_core.c Normal file (451 lines)
View File

@@ -0,0 +1,451 @@
#ifndef __KERNEL__
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
#endif
extern const u8 rs_gf_inv_table[256];
extern const u8 rs_gf_log_table[256];
extern const u8 rs_gf_exp_table[256];
static inline u8 gf_inv(u8 x) {
return rs_gf_inv_table[x];
}
static inline u8 gf_mul(u8 x, u8 y) {
if (x == 0 || y == 0) {
return 0;
}
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
return rs_gf_exp_table[i > 254 ? i - 255 : i];
}
static inline void gf_mul_expand_factor(u8 x, u8* expanded_x) {
int i;
for (i = 0; i < 16; i++) {
expanded_x[i] = gf_mul(i, x);
}
for (i = 0; i < 16; i++) {
expanded_x[16 + i] = gf_mul(i << 4, x);
}
}
static inline u8 gf_mul_expanded(u8 x, const u8* expanded_y) {
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}
__attribute__((target("avx2")))
static inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
return _mm256_xor_si256(
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
);
}
static inline u8 rs_data_blocks_core(u8 parity) {
return parity & 0x0F;
}
static inline u8 rs_parity_blocks_core(u8 parity) {
return parity >> 4;
}
static inline u8 rs_blocks_core(u8 parity) {
return rs_data_blocks_core(parity) + rs_parity_blocks_core(parity);
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// to have the final operation be more natural in rs_recover.
static bool rs_gf_invert_matrix(u8* in_mat, u8* out_mat, const int n) {
int i, j, k;
u8 temp;
// Set out_mat[] to the identity matrix
memset(out_mat, 0, n*n);
for (i = 0; i < n; i++) {
out_mat[i * n + i] = 1;
}
// Inverse
for (i = 0; i < n; i++) {
// Check for 0 in pivot element
if (in_mat[i * n + i] == 0) {
// Find a row with non-zero in current column and swap
for (j = i + 1; j < n; j++) {
if (in_mat[j * n + i]) {
break;
}
}
if (j == n) { // Couldn't find means it's singular
return false;
}
for (k = 0; k < n; k++) { // Swap rows i,j
temp = in_mat[i * n + k];
in_mat[i * n + k] = in_mat[j * n + k];
in_mat[j * n + k] = temp;
temp = out_mat[i * n + k];
out_mat[i * n + k] = out_mat[j * n + k];
out_mat[j * n + k] = temp;
}
}
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
}
for (j = 0; j < n; j++) {
if (j == i) {
continue;
}
temp = in_mat[j * n + i];
for (k = 0; k < n; k++) {
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
}
}
}
return true;
}
struct rs {
u8 parity;
// u8[D*B], in column-major.
u8* matrix;
// u8[D*P][32], in column-major. These are the lookup tables
// to perform multiplication quickly.
u8* expanded_matrix;
};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
static void rs_cauchy_matrix(struct rs* r) {
int D = rs_data_blocks_core(r->parity);
int B = rs_blocks_core(r->parity);
u8* matrix = r->matrix;
memset(matrix, 0, D*B);
// Identity in the d*d upper half
int i;
for (i = 0; i < D; i++) {
matrix[D*i + i] = 1;
}
// Fill in the rest using cauchy
int col, row;
for (col = D; col < B; col++) {
for (row = 0; row < D; row++) {
matrix[col*D + row] = gf_inv(col ^ row);
}
}
// Scale the columns
for (col = D; col < B; col++) {
u8 factor = gf_inv(matrix[col*D]);
for (row = 0; row < D; row++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
// Scale the rows
for (row = 1; row < D; row++) {
u8 factor = gf_inv(matrix[D*D + row]);
for (col = D; col < B; col++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
}
// out must be at least size 4
static void rs_cpuidex(u32 function_id, u32 subfunction_id, u32* out) {
u32 a, b, c, d;
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
u32 _0_0[4];
rs_cpuidex(0, 0, _0_0);
u32 _7_0[4];
if (_0_0[0] >= 7) {
rs_cpuidex(7, 0, _7_0);
} else {
memset(_7_0, 0, sizeof(_7_0));
}
switch (level) {
case RS_CPU_SCALAR:
return true;
case RS_CPU_AVX2:
return _7_0[1] & (1<<5);
case RS_CPU_GFNI:
return _7_0[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
}
}
#define rs_compute_parity_single(D, P, r, i, data, parity) do { \
parity[0][i] = 0; \
int d; \
int p; \
for (d = 0; d < D; d++) { \
parity[0][i] ^= data[d][i]; \
} \
for (p = 1; p < P; p++) { \
const u8* factor = &r->expanded_matrix[D*32*p]; \
parity[p][i] = 0; \
for (d = 0; d < D; d++, factor += 32) { \
parity[p][i] ^= gf_mul_expanded(data[d][i], factor); \
} \
} \
} while (0)
#define rs_compute_parity_scalar(D, P, r, size, data, parity) do { \
/* parity = r->matrix * data */ \
u64 i; \
for (i = 0; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_compute_parity_avx2(D, P, r, size, data, parity) do { \
__m256i low_nibble_mask = broadcast_u8(0x0f); \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u32 i; \
int p; \
int d; \
for (i = 0; i < avx_size; i += 32) { \
{ \
__m256i parity_0 = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
} \
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
} \
for (p = 1; p < P; p++) { \
__m256i parity_p = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]); \
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask)); \
} \
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
} \
} \
for (i = avx_size; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_compute_parity_gfni(D, P, r, size, data, parity) do { \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
int p; \
int d; \
for (i = 0; i < avx_size; i += 32) { \
{ \
__m256i parity_0 = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
} \
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
} \
for (p = 1; p < P; p++) { \
__m256i parity_p = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]); \
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor)); \
} \
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
} \
} \
for (i = avx_size; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_recover_matmul_single(D, i, have, want, have_to_want) do { \
want[i] = 0; \
int j; \
for (j = 0; j < D; j++) { \
want[i] ^= gf_mul(have_to_want[j], have[j][i]); \
} \
} while (0)
#define rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded) do { \
want[i] = 0; \
int j; \
for (j = 0; j < D; j++) { \
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]); \
} \
} while(0)
#define rs_recover_matmul_scalar(D, size, have, want, have_to_want) do { \
u8 have_to_want_expanded[D*32]; \
int d; \
for (d = 0; d < D; d++) { \
gf_mul_expand_factor(have_to_want[d], &have_to_want_expanded[d*32]); \
} \
size_t i; \
for (i = 0; i < size; i++) { \
rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded); \
} \
} while (0) \
#define rs_recover_matmul_avx2(D, size, have, want, have_to_want) do { \
__m256i have_to_want_expanded[D]; \
int d; \
for (d = 0; d < D; d++) { \
gf_mul_expand_factor(have_to_want[d], ((u8*)&have_to_want_expanded[d])); \
} \
__m256i low_nibble_mask = broadcast_u8(0x0f); \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
for (i = 0; i < avx_size; i += 32) { \
__m256i want_i = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
want_i = _mm256_xor_si256( \
want_i, \
gf_mul_expanded_avx2( \
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
have_to_want_expanded[d], \
low_nibble_mask \
) \
); \
} \
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
} \
for (i = avx_size; i < size; i++) { \
rs_recover_matmul_single_expanded(D, i, have, want, ((const u8*)have_to_want_expanded)); \
} \
} while(0)
#define rs_recover_matmul_gfni(D, size, have, want, have_to_want) do { \
__m256i have_to_want_avx[D]; \
int d; \
for (d = 0; d < D; d++) { \
have_to_want_avx[d] = broadcast_u8(have_to_want[d]); \
} \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
for (i = 0; i < avx_size; i += 32) { \
__m256i want_i = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
want_i = _mm256_xor_si256( \
want_i, \
_mm256_gf2p8mul_epi8( \
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
have_to_want_avx[d] \
) \
); \
} \
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
} \
for (i = avx_size; i < size; i++) { \
rs_recover_matmul_single(D, i, have, want, have_to_want); \
} \
} while (0)
static struct rs* rs_new_core(u8 parity) {
int B = rs_blocks_core(parity);
int D = rs_data_blocks_core(parity);
int P = rs_parity_blocks_core(parity);
struct rs* r = (struct rs*)rs_malloc(sizeof(struct rs) + B*D + D*P*32);
if (r == NULL) { return NULL; }
r->parity = parity;
r->matrix = (u8*)(r + 1);
r->expanded_matrix = r->matrix + B*D;
rs_cauchy_matrix(r);
int p;
int d;
for (p = 0; p < P; p++) {
for (d = 0; d < D; d++) {
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
}
}
return r;
}
static void rs_delete_core(struct rs* r) {
rs_free(r);
}
static void rs_recover_core(
struct rs* r,
u64 size,
const u8* have_blocks,
const u8** have,
u8 want_block,
u8* want,
void (*recover_func)(int D, u64 size, const u8** have, u8* want, const u8* mat)
) {
int D = rs_data_blocks_core(r->parity);
int B = rs_blocks_core(r->parity);
// Create some space
u8* scratch = (u8*)rs_malloc(D*D + D*D);
u8* mat_1 = scratch;
u8* mat_2 = scratch + D*D;
// Preliminary checks
int i, j, d, b;
for (d = 0; d < D; d++) {
if (have_blocks[d] >= B) {
die("have_blocks[%d]=%d >= %d\n", d, have_blocks[d], B);
}
if (have_blocks[d] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", d, have_blocks[d], want_block);
}
if (d > 0 && have_blocks[d] <= have_blocks[d-1]) {
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", d, have_blocks[d], d, have_blocks[d-1]);
}
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
u8* data_to_have = mat_1;
int have_cursor;
for (b = 0, have_cursor = 0; b < B; b++) {
if (have_cursor >= D || have_blocks[have_cursor] != b) {
continue;
}
memcpy(data_to_have + have_cursor*D, r->matrix + b*D, D);
have_cursor++;
}
// [DxD] matrix going from what we have to the original data blocks
u8* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
}
data_to_have = NULL;
// [Dx1] matrix going from the data blocks to the block we want
u8* data_to_want = &r->matrix[want_block*D];
// have_to_want = data_to_want * have_to_data
// [Dx1] matrix going from `blocks` to the block we're into
u8* have_to_want = mat_1;
for (i = 0; i < D; i++) {
have_to_want[i] = 0;
for (j = 0; j < D; j++) {
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
}
}
// want = have_to_want * have
recover_func(D, size, have, want, have_to_want);
// We're done.
rs_free(scratch);
}
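
Putting the pieces together, a hypothetical end-to-end use of the userspace API (the rs_get/rs_compute_parity/rs_recover signatures are inferred from the call sites in rs.cpp above, so treat them as assumptions; block indices 0..D-1 are data and D..B-1 are parity, and the parity byte packs D into the low nibble and P into the high one):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>
#include "rs.h" // assumed to declare rs_get, rs_compute_parity, rs_recover

int main() {
    const int D = 4, P = 4;              // RS(4,4)
    const uint64_t size = 4096;
    struct rs* r = rs_get((P << 4) | D); // parity byte 0x44

    // D data blocks followed by P parity blocks
    std::vector<std::vector<uint8_t>> blocks(D + P, std::vector<uint8_t>(size));
    for (int d = 0; d < D; d++)
        for (uint64_t i = 0; i < size; i++) blocks[d][i] = (uint8_t)(131*d + i);

    const uint8_t* data[D];
    uint8_t* parity[P];
    for (int d = 0; d < D; d++) data[d] = blocks[d].data();
    for (int p = 0; p < P; p++) parity[p] = blocks[D + p].data();
    rs_compute_parity(r, size, data, parity);

    // the first parity block is the plain XOR of the data blocks
    for (uint64_t i = 0; i < size; i++) {
        uint8_t x = 0;
        for (int d = 0; d < D; d++) x ^= blocks[d][i];
        assert(blocks[D][i] == x);
    }

    // pretend data block 1 was lost and rebuild it from blocks {0, 2, 3, 4}
    uint8_t have_blocks[D] = {0, 2, 3, 4}; // sorted, excludes the wanted block
    const uint8_t* have[D] = {blocks[0].data(), blocks[2].data(),
                              blocks[3].data(), blocks[4].data()};
    std::vector<uint8_t> recovered(size);
    rs_recover(r, size, have_blocks, have, /*want_block=*/1, recovered.data());
    assert(memcmp(recovered.data(), blocks[1].data(), size) == 0);
    return 0;
}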

View File

@@ -156,6 +156,8 @@
//
// TODO fill in results
static constexpr uint64_t EGGSFS_PAGE_SIZE = 4096;
static auto wrappedSnapshot(rocksdb::DB* db) {
auto deleter = [db](const rocksdb::Snapshot* snapshot) {
db->ReleaseSnapshot(snapshot);
@@ -726,7 +728,11 @@ struct ShardDBImpl {
beginKey().setOffset(req.byteOffset);
{
std::unique_ptr<rocksdb::Iterator> it(_db->NewIterator(options, _spansCf));
for (it->SeekForPrev(beginKey.toSlice()); it->Valid(); it->Next()) {
for (
it->SeekForPrev(beginKey.toSlice());
it->Valid() && (req.limit == 0 || resp.spans.els.size() < req.limit);
it->Next()
) {
auto key = ExternalValue<SpanKey>::FromSlice(it->key());
if (key().fileId() != req.fileId) {
break;
@@ -1258,6 +1264,11 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "inline span has bad storage class %s", req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
uint32_t expectedCrc = crc32c(0, req.body.data(), req.body.size());
expectedCrc = crc32c_zero_extend(expectedCrc, req.size - req.body.size());
@@ -1293,10 +1304,14 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "bad storage class %s for blocks span", (int)req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0 || req.cellSize%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s or cellSize=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, req.cellSize, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
if (!_checkSpanBody(req)) {
return EggsError::BAD_SPAN_BODY;
}
// start filling in entry
entry.fileId = req.fileId;
entry.byteOffset = req.byteOffset;
@@ -1888,6 +1903,7 @@ struct ShardDBImpl {
return EggsError::MISMATCHING_TARGET;
}
if (edgeBody().creationTime() != creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edgeBody().creationTime(), creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edgeBody().targetIdWithLocked().extra()) { // locked
@@ -1992,6 +2008,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edge().locked()) {
@@ -2046,6 +2063,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (!edge().locked()) {
@@ -2732,7 +2750,13 @@ struct ShardDBImpl {
const auto& cache = _blockServicesCache.at(blockServiceId.u64);
auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf));
return proof.proof == expectedProof;
bool good = proof.proof == expectedProof;
if (!good) {
LOG_DEBUG(_env, "bad block write proof, expected %s, got %s", BincodeFixedBytes<8>(expectedProof), proof);
}
return good;
}
std::array<uint8_t, 8> _blockEraseCertificate(uint32_t blockSize, const BlockBody block, const AES128Key& secretKey) {
@@ -3335,8 +3359,16 @@ DirectoryInfo defaultDirectoryInfo() {
hddBlocks.storageClass = storageClassByName("HDD");
addSegment(BLOCK_POLICY_TAG, blockPolicy);
// Span policy: up to 10MiB: RS(4,4), up to 100MiB (max span size): RS(10,4)
// Span policy:
// * up to 64KiB: RS(1,4). This mirroring span simplifies things in the kernel (so that
// we never have cell sizes that are not multiples of span sizes). We still set things
// up so that we can lose 4 copies and still be fine.
// * up to 10MiB: RS(4,4).
// * up to 100MiB (max span size): RS(10,4).
SpanPolicy spanPolicy;
auto& tinySpans = spanPolicy.entries.els.emplace_back();
tinySpans.maxSize = 1 << 16;
tinySpans.parity = Parity(1, 4);
auto& smallSpans = spanPolicy.entries.els.emplace_back();
smallSpans.maxSize = 10 << 20;
smallSpans.parity = Parity(4, 4);