First version of kernel module

Initial version mostly by Pawel, but with many changes in between.

Big outstanding issues:

* span cache reclamation (unbounded memory otherwise...)
* bad block service detection and workarounds
* corrupted block detection and workarounds

Co-authored-by: Paweł Dziepak <pawel.dziepak@xtxmarkets.com>
Francesco Mazzoli
2022-12-01 15:09:50 +00:00
parent d94b582a4d
commit 6addbdee6a
96 changed files with 16355 additions and 2926 deletions

View File

@@ -16,8 +16,8 @@ if(NOT CMAKE_BUILD_TYPE)
)
endif()
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, got ${CMAKE_BUILD_TYPE}")
if (NOT (${CMAKE_BUILD_TYPE} MATCHES "^(debug|release|alpine|alpine-debug|sanitized|valgrind)$"))
message(FATAL_ERROR "Build type must be one of debug, release, sanitized, alpine, alpine-debug got ${CMAKE_BUILD_TYPE}")
endif()
set(CMAKE_CXX_STANDARD 20)
@@ -33,13 +33,13 @@ add_compile_options("$<$<CONFIG:valgrind>:-march=haswell;-maes;-mgfni>")
add_compile_options("$<$<NOT:$<CONFIG:valgrind>>:-march=skylake;-mgfni>")
# performance/debug stuff
add_compile_options("$<$<NOT:$<CONFIG:debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug>:-O;-DEGGS_DEBUG>")
add_compile_options("$<$<NOT:$<CONFIG:debug,alpine-debug>>:-O3>")
add_compile_options("$<$<CONFIG:debug,alpine-debug>:-O;-DEGGS_DEBUG>")
# We build the release build statically in Alpine
add_compile_options("$<$<CONFIG:alpine>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine>>:-no-pie>")
add_compile_options("$<$<CONFIG:alpine,alpine-debug>:-DEGGS_ALPINE>")
add_link_options("$<$<CONFIG:alpine,alpine-debug>:-static>")
add_link_options("$<$<NOT:$<CONFIG:alpine,alpine-debug>>:-no-pie>")
# sanitizer options
set(SANITIZE_OPTIONS "-fsanitize=undefined,address,integer,function;-fno-sanitize-recover=all;-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/ubsan-ignorelist")

View File

@@ -5,7 +5,7 @@ from pathlib import Path
import subprocess
if len(sys.argv) < 2:
print(f'Usage: {sys.argv[0]} release|alpine|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
print(f'Usage: {sys.argv[0]} release|alpine|alpine-debug|sanitized|debug|valgrind [NINJA_ARG ...]', file=sys.stderr)
sys.exit(2)
if len(sys.argv) == 1:
@@ -19,9 +19,9 @@ repo_dir = cpp_dir.parent
build_dir = cpp_dir / 'build' / build_type
build_dir.mkdir(parents=True, exist_ok=True)
if build_type == 'alpine' and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
if build_type in ('alpine', 'alpine-debug') and 'IN_EGGS_BUILD_CONTAINER' not in os.environ:
subprocess.run(
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', 'alpine'] + sys.argv[2:],
['docker', 'run', '--rm', '-i', '--mount', f'type=bind,src={repo_dir},dst=/eggsfs', 'REDACTED', '/eggsfs/cpp/build.py', build_type] + sys.argv[2:],
check=True,
)
else:

View File

@@ -264,7 +264,6 @@ struct MakeDirectoryStateMachine {
void createDirectoryInode() {
auto& shardReq = env.needsShard(MAKE_DIRECTORY_CREATE_DIR, state.dirId().shard()).setCreateDirectoryInode();
shardReq.id = state.dirId();
shardReq.info = req.info;
shardReq.ownerId = req.ownerId;
}
@@ -738,6 +737,7 @@ struct SoftUnlinkDirectoryStateMachine {
shardReq.name = req.name;
shardReq.targetId = req.targetId;
shardReq.wasMoved = false;
shardReq.creationTime = req.creationTime;
}
void afterRollback(EggsError err, const ShardRespContainer* resp) {

View File

@@ -3,26 +3,6 @@
std::ostream& operator<<(std::ostream& out, const BincodeBytesRef& x) {
return goLangBytesFmt(out, x.data(), x.size());
/*
out << "b\"";
uint8_t len = x.size();
const uint8_t* data = (const uint8_t*)x.data();
for (int i = 0; i < len; i++) {
uint8_t ch = data[i];
if (isprint(ch)) {
out << ch;
} else if (ch == 0) {
out << "\\0";
} else {
const char cfill = out.fill();
out << std::hex << std::setfill('0');
out << "\\x" << std::setw(2) << (int)ch;
out << std::setfill(cfill) << std::dec;
}
}
out << "\"";
return out;
*/
}
std::ostream& operator<<(std::ostream& out, const BincodeBytes& x) {

View File

@@ -4,6 +4,7 @@
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
#include <iomanip>
// Throwing in static initialization is nasty, and there is no useful stacktrace
// Also use direct syscalls to write the error as iostream might not be initialized
@@ -48,4 +49,27 @@ std::ostream& goLangBytesFmt(std::ostream& out, const char* str, size_t len) {
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes) {
return goLangBytesFmt(out, bytes.str, bytes.len);
}
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* data, size_t len) {
out << "\"";
for (int i = 0; i < len; i++) {
uint8_t ch = data[i];
if (isprint(ch)) {
out << ch;
} else if (ch == 0) {
out << "\\0";
} else {
const char cfill = out.fill();
out << std::hex << std::setfill('0');
out << "\\x" << std::setw(2) << (int)ch;
out << std::setfill(cfill) << std::dec;
}
}
out << "\"";
return out;
}
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes) {
return goLangQuotedStringFmt(out, bytes.str, bytes.len);
}

View File

@@ -52,4 +52,15 @@ struct GoLangBytesFmt {
GoLangBytesFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangBytesFmt& bytes);
std::ostream& goLangQuotedStringFmt(std::ostream& out, const char* str, size_t len);
struct GoLangQuotedStringFmt {
const char* str;
size_t len;
GoLangQuotedStringFmt(const char* str_, size_t len_) : str(str_), len(len_) {}
};
std::ostream& operator<<(std::ostream& out, const GoLangQuotedStringFmt& bytes);
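
For reference, a minimal usage sketch of the new quoted-string formatter (the sample program and string are made up; it assumes the GoLangQuotedStringFmt declarations above are visible through their header):

#include <iostream>
#include <string>
// assumes the GoLangQuotedStringFmt declarations from the header above are in scope
int main() {
    std::string s("ab\x01z", 4);
    // printable bytes pass through, NUL prints as \0, everything else as \xNN,
    // so this prints: "ab\x01z"
    std::cout << GoLangQuotedStringFmt(s.data(), s.size()) << std::endl;
    return 0;
}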

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
add_library(crc32c crc32c.h crc32c.cpp iscsi.hpp)
add_library(crc32c crc32c.h crc32c.c iscsi.h)
add_executable(crc32c-tables tables.cpp iscsi.hpp)
add_executable(crc32c-tables tables.cpp iscsi.h)
add_executable(crc32c-tests tests.cpp)
target_link_libraries(crc32c-tests PRIVATE crc32c)

View File

@@ -1,11 +1,29 @@
#ifndef __KERNEL__
#include "crc32c.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#define kernel_static
typedef uint8_t u8;
typedef uint32_t u32;
#else
#define kernel_static static
#endif
#ifdef __clang__
__attribute__((no_sanitize("integer")))
static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_blocks) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32_fusion_kernel(u32 acc_a, const char* buf, size_t n_blocks) {
size_t stride = n_blocks * 24 + 8;
// Four chunks:
// Chunk A: 0 through stride
@@ -18,8 +36,8 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
__m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
__m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
__m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
uint32_t acc_b = 0;
uint32_t acc_c = 0;
u32 acc_b = 0;
u32 acc_c = 0;
// Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
// k1 == magic(4*128+32-1)
// k2 == magic(4*128-32-1)
@@ -100,16 +118,16 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
stack_a = ~stack_a;
stack_b = ~stack_b;
stack_c = ~stack_c;
uint32_t magic_a = ((uint32_t)0x80000000) >> (bits_a & 31); bits_a >>= 5;
uint32_t magic_b = ((uint32_t)0x80000000) >> (bits_b & 31); bits_b >>= 5;
uint32_t magic_c = ((uint32_t)0x80000000) >> (bits_c & 31); bits_c >>= 5;
u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
bits_a -= bits_b;
bits_b -= bits_c;
for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
for (;;) {
uint32_t low = stack_a & 1;
u32 low = stack_a & 1;
if (!(stack_a >>= 1)) break;
__m128i x = _mm_cvtsi32_si128(magic_a);
uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
@@ -130,13 +148,18 @@ static uint32_t crc32_fusion_kernel(uint32_t acc_a, const char* buf, size_t n_bl
x1 = _mm_xor_si128(x1, x5);
uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
// Apply missing <<32 and fold down to 32-bits.
uint32_t crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
return crc;
}
#ifdef __clang__
__attribute__((no_sanitize("integer")))
uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c(u32 crc, const char* buf, size_t length) {
crc = ~crc; // preset to -1 (distinguish leading zeros)
if (length >= 31) {
size_t n_blocks = (length - 16) / 136;
@@ -149,7 +172,7 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
const char* kernel_start = kernel_end - kernel_length;
length -= kernel_start - buf;
for (; buf != kernel_start; ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
if (n_blocks) {
length -= kernel_length;
@@ -161,17 +184,34 @@ uint32_t crc32c(uint32_t crc, const char* buf, size_t length) {
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
}
for (; length; --length, ++buf) {
crc = _mm_crc32_u8(crc, *(const uint8_t*)buf);
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
return ~crc; // post-invert -- distinguish scaled multiples
}
#include "iscsi.hpp"
#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c_simple(u32 crc, const char* buf, size_t length) {
crc = ~crc; // preset to -1 (distinguish leading zeros)
for (; length >= 8; length -= 8, buf += 8) {
crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
}
for (; length; --length, ++buf) {
crc = _mm_crc32_u8(crc, *(const u8*)buf);
}
return ~crc; // post-invert -- distinguish scaled multiples
}
#include "iscsi.h"
// In the comments below, multiplication is meant to be modulo ISCSI_POLY.
// Stores x^2^0, ..., x^2^31. Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_POWER_TABLE[31] = {
static u32 CRC_POWER_TABLE[31] = {
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a,
@@ -182,7 +222,7 @@ static uint32_t CRC_POWER_TABLE[31] = {
};
// Stores x^-(2^0), ..., x^-(2^31). Can be generated with `mult_mod_p`, see tables.cpp.
static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
static u32 CRC_INVERSE_POWER_TABLE[31] = {
0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96,
0xcedac2cc, 0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916,
@@ -193,7 +233,7 @@ static uint32_t CRC_INVERSE_POWER_TABLE[31] = {
};
// Return x^(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p(size_t n, uint32_t k) {
static u32 x2n_mod_p(size_t n, u32 k) {
// CRC_POWER_TABLE has all powers of two; we can multiply combinations
// of these to achieve any power.
@@ -210,7 +250,7 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
// We walk along the p_is above and keep adding to the result.
//
// Note that 2^2^(31 + k) = 2^2^k TODO clarify
uint32_t p = 1u << 31;
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
// p(x) = p(x) * 2^(k % 31)
@@ -221,8 +261,8 @@ static uint32_t x2n_mod_p(size_t n, uint32_t k) {
}
// Return x^-(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
uint32_t p = 1u << 31;
static u32 x2n_mod_p_inv(size_t n, u32 k) {
u32 p = 1u << 31;
for (; n != 0; n >>= 1, k++) {
if (n & 1) {
p = crc32c_mult_mod_p(CRC_INVERSE_POWER_TABLE[k & 0x1F], p);
@@ -231,7 +271,7 @@ static uint32_t x2n_mod_p_inv(size_t n, uint32_t k) {
return p;
}
uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
kernel_static u32 crc32c_zero_extend(u32 crc, ssize_t zeros) {
if (zeros > 0) {
return ~crc32c_mult_mod_p(x2n_mod_p(zeros, 3), ~crc);
} else {
@@ -239,17 +279,17 @@ uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros) {
}
}
uint32_t crc32c_append(uint32_t crc_a, uint32_t crc_b, size_t len_b) {
kernel_static u32 crc32c_append(u32 crc_a, u32 crc_b, size_t len_b) {
// We need to extend crc_a with len_b*8 zeros (len_b is the number
// of bytes) (without the inversion).
// This amounts to performing `crc_a * x^(len_b*8)`.
return crc32c_mult_mod_p(x2n_mod_p(len_b, 3), crc_a) ^ crc_b;
}
uint32_t crc32c_xor(uint32_t crc_a, uint32_t crc_b, size_t len) {
kernel_static u32 crc32c_xor(u32 crc_a, u32 crc_b, size_t len) {
// We need to extend crc_a with the crc of len*8 bits.
// We could do this in 32 steps rather than 32+32 with a dedicated
// table, but probably doesn't matter.
uint32_t crc_0 = ~crc32c_mult_mod_p(~(uint32_t)0, x2n_mod_p(len, 3));
u32 crc_0 = ~crc32c_mult_mod_p(~(u32)0, x2n_mod_p(len, 3));
return crc_a ^ crc_b ^ crc_0;
}
}
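
As a sanity check of the composition helpers above, a small userspace sketch (it assumes the non-kernel build of this file is linked in, and that crc32c.h declares crc32c, crc32c_append and crc32c_zero_extend):

#include <cassert>
#include <cstdint>
#include <string>
#include "crc32c.h" // assumed to declare crc32c, crc32c_append, crc32c_zero_extend

int main() {
    std::string a = "hello, ", b = "world";
    uint32_t crc_a = crc32c(0, a.data(), a.size());
    uint32_t crc_b = crc32c(0, b.data(), b.size());
    // crc(A||B) can be computed from crc(A), crc(B) and |B| alone
    std::string ab = a + b;
    assert(crc32c_append(crc_a, crc_b, b.size()) == crc32c(0, ab.data(), ab.size()));
    // appending zeros goes through the power table instead of touching the bytes...
    std::string az = a + std::string(16, '\0');
    assert(crc32c_zero_extend(crc_a, 16) == crc32c(0, az.data(), az.size()));
    // ...and can be undone with a negative count (inverse power table)
    assert(crc32c_zero_extend(crc32c_zero_extend(crc_a, 16), -16) == crc_a);
    return 0;
}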

cpp/crc32c/iscsi.h Normal file (24 lines)
View File

@@ -0,0 +1,24 @@
#define ISCSI_POLY 0x82F63B78u
// Return a(x) multiplied by b(x) modulo ISCSI_POLY. For speed, this requires
// that a(x) not be zero.
static u32 crc32c_mult_mod_p(u32 a, u32 b) {
// m goes from x^0 to x^31
u32 m = 1u << 31;
u32 p = 0;
for (;;) {
// If a(x) contains x^n, add b(x)*x^n
if (a & m) {
p ^= b;
// Exit when there are no higher bits in a(x)
if ((a & (m - 1)) == 0) {
break;
}
}
// Go from x^n to x^(n+1)
m >>= 1;
// Go from b(x)*x^n to b(x)*x^(n+1)
b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1;
}
return p;
}
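
A quick check of the multiply helper above (userspace sketch; iscsi.h expects the includer to provide the u32 typedef, exactly as crc32c.c does). In this bit-reflected representation x^0 is 0x80000000, so it acts as the multiplicative identity, and repeated squaring steps from x^(2^i) to x^(2^(i+1)), which is how CRC_POWER_TABLE can be regenerated:

#include <cassert>
#include <cstdint>
typedef uint32_t u32; // iscsi.h relies on the includer providing u32
#include "iscsi.h"

int main() {
    const u32 one = 0x80000000u;   // x^0 in the reflected representation
    assert(crc32c_mult_mod_p(one, 0x12345678u) == 0x12345678u);
    // squaring: x^(2^(i+1)) == (x^(2^i))^2, matching the first CRC_POWER_TABLE entries
    u32 p = 0x40000000u;           // x^1 == CRC_POWER_TABLE[0]
    p = crc32c_mult_mod_p(p, p);   // x^2
    assert(p == 0x20000000u);      //     == CRC_POWER_TABLE[1]
    p = crc32c_mult_mod_p(p, p);   // x^4
    assert(p == 0x08000000u);      //     == CRC_POWER_TABLE[2]
    return 0;
}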

View File

@@ -30,6 +30,7 @@ int main() {
for (int i = 0; i < 1000; i++) {
auto s1 = randString(1 + wyhash64(&rand)%100);
uint32_t crc1 = crc32c(0, (const char*)s1.data(), s1.size());
ASSERT(crc1 == crc32c_append(0, crc1, s1.size()));
auto s2 = randString(1 + wyhash64(&rand)%100);
uint32_t crc2 = crc32c(0, (const char*)s2.data(), s2.size());
std::vector<uint8_t> s = s1;

View File

@@ -1,4 +1,4 @@
add_library(rs rs.h rs.cpp gf.hpp gf_tables.cpp)
add_library(rs rs.h rs.cpp gf_tables.c)
add_executable(rs-tests tests.cpp)
target_link_libraries(rs-tests PRIVATE rs)

View File

@@ -1,110 +0,0 @@
#pragma once
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
extern const uint8_t rs_gf_inv_table[256];
extern const uint8_t rs_gf_log_table[256];
extern const uint8_t rs_gf_exp_table[256];
inline uint8_t gf_inv(uint8_t x) {
return rs_gf_inv_table[x];
}
inline uint8_t gf_mul(uint8_t x, uint8_t y) {
if (x == 0 || y == 0) {
return 0;
}
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
return rs_gf_exp_table[i > 254 ? i - 255 : i];
}
inline void gf_mul_expand_factor(uint8_t x, uint8_t* expanded_x) {
for (int i = 0; i < 16; i++) {
expanded_x[i] = gf_mul(i, x);
}
for (int i = 0; i < 16; i++) {
expanded_x[16 + i] = gf_mul(i << 4, x);
}
}
inline uint8_t gf_mul_expanded(uint8_t x, const uint8_t* expanded_y) {
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}
inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
return _mm256_xor_si256(
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
);
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// to have the final operation be more natural in rs_recover.
__attribute__((noinline))
static bool rs_gf_invert_matrix(uint8_t* in_mat, uint8_t* out_mat, const int n) {
int i, j, k;
uint8_t temp;
// Set out_mat[] to the identity matrix
memset(out_mat, 0, n*n);
for (i = 0; i < n; i++) {
out_mat[i * n + i] = 1;
}
// Inverse
for (i = 0; i < n; i++) {
// Check for 0 in pivot element
if (in_mat[i * n + i] == 0) {
// Find a row with non-zero in current column and swap
for (j = i + 1; j < n; j++) {
if (in_mat[j * n + i]) {
break;
}
}
if (j == n) { // Couldn't find means it's singular
return false;
}
for (k = 0; k < n; k++) { // Swap rows i,j
temp = in_mat[i * n + k];
in_mat[i * n + k] = in_mat[j * n + k];
in_mat[j * n + k] = temp;
temp = out_mat[i * n + k];
out_mat[i * n + k] = out_mat[j * n + k];
out_mat[j * n + k] = temp;
}
}
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
}
for (j = 0; j < n; j++) {
if (j == i) {
continue;
}
temp = in_mat[j * n + i];
for (k = 0; k < n; k++) {
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
}
}
}
return true;
}

View File

@@ -1,7 +1,9 @@
// generated with gf_tables.py
#ifndef __KERNEL__
#include <stdint.h>
#endif
extern const uint8_t rs_gf_exp_table[256] = {
const uint8_t rs_gf_exp_table[256] = {
0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
@@ -20,7 +22,7 @@ extern const uint8_t rs_gf_exp_table[256] = {
0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01,
};
extern const uint8_t rs_gf_log_table[256] = {
const uint8_t rs_gf_log_table[256] = {
0x00, 0xff, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
@@ -39,7 +41,7 @@ extern const uint8_t rs_gf_log_table[256] = {
0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
};
extern const uint8_t rs_gf_inv_table[256] = {
const uint8_t rs_gf_inv_table[256] = {
0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7,
0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2,
0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
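
The three tables above implement GF(2^8) arithmetic via discrete logs (the AES field, which is also what the GFNI instructions multiply in). A tiny consistency check, as a sketch: it links against gf_tables.c and spells out gf_mul the same way rs_core.c does below:

#include <cassert>
#include <cstdint>

// the tables are defined in gf_tables.c
extern const uint8_t rs_gf_inv_table[256];
extern const uint8_t rs_gf_log_table[256];
extern const uint8_t rs_gf_exp_table[256];

// same formula as gf_mul in rs_core.c: x*y = exp[(log x + log y) mod 255]
static uint8_t gf_mul(uint8_t x, uint8_t y) {
    if (x == 0 || y == 0) { return 0; }
    int i = rs_gf_log_table[x] + rs_gf_log_table[y];
    return rs_gf_exp_table[i > 254 ? i - 255 : i];
}

int main() {
    for (int x = 1; x < 256; x++) {
        assert(gf_mul(x, 1) == x);                  // 1 is the multiplicative identity
        assert(gf_mul(x, rs_gf_inv_table[x]) == 1); // inv table really is the inverse
    }
    return 0;
}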

View File

@@ -6,108 +6,10 @@
#include <array>
#include "rs.h"
#include "gf.hpp"
#define die(...) do { fprintf(stderr, __VA_ARGS__); raise(SIGABRT); } while(false)
static void* malloc_or_die(size_t size, const char* what) {
void* ptr = malloc(size);
if (ptr == nullptr) {
die(what);
}
return ptr;
}
struct rs {
uint8_t parity;
// uint8_t[D*B], in column-major.
uint8_t* matrix;
// uint8_t[D*P][32], in column-major. These are the lookup tables
// to perform multiplication quickly.
uint8_t* expanded_matrix;
};
static struct rs* rs_cached[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
static void rs_cauchy_matrix(struct rs* r) {
int D = rs_data_blocks(r->parity);
int B = rs_blocks(r->parity);
uint8_t* matrix = r->matrix;
memset(matrix, 0, D*B);
// Identity in the d*d upper half
for (int i = 0; i < D; i++) {
matrix[D*i + i] = 1;
}
// Fill in the rest using cauchy
for (int col = D; col < B; col++) {
for (int row = 0; row < D; row++) {
matrix[col*D + row] = gf_inv(col ^ row);
}
}
// Scale the columns
for (int col = D; col < B; col++) {
uint8_t factor = gf_inv(matrix[col*D]);
for (int row = 0; row < D; row++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
// Scale the rows
for (int row = 1; row < D; row++) {
uint8_t factor = gf_inv(matrix[D*D + row]);
for (int col = D; col < B; col++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
}
static struct rs* rs_new(uint8_t parity) {
int B = rs_blocks(parity);
int D = rs_data_blocks(parity);
int P = rs_parity_blocks(parity);
struct rs* r = (struct rs*)malloc_or_die(sizeof(struct rs) + B*D + D*P*32, "rs_new\n");
r->parity = parity;
r->matrix = (uint8_t*)(r + 1);
r->expanded_matrix = r->matrix + B*D;
rs_cauchy_matrix(r);
for (int p = 0; p < P; p++) {
for (int d = 0; d < D; d++) {
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
}
}
return r;
}
static void rs_delete(struct rs* r) {
free(r);
}
static std::array<uint32_t, 4> rs_cpuidex(uint32_t function_id, uint32_t subfunction_id) {
uint32_t a, b, c, d;
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
return {a, b, c, d};
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
#define rs_malloc malloc
#define rs_free free
// See `valgrind.h`
static uint64_t rs_valgrind_client_request(uint64_t defaultResult, uint64_t reqID, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5) {
@@ -129,20 +31,48 @@ static bool rs_detect_valgrind() {
return rs_valgrind_client_request(0, 0x1001, 0, 0, 0, 0, 0);
}
bool rs_has_cpu_level(rs_cpu_level level) {
const auto cpuid7 = (rs_cpuidex(0, 0)[0] >= 7) ? rs_cpuidex(7, 0) : std::array<uint32_t, 4>{0, 0, 0, 0};
switch (level) {
case RS_CPU_SCALAR:
return true;
case RS_CPU_AVX2:
return cpuid7[1] & (1<<5);
case RS_CPU_GFNI:
return cpuid7[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
static inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
}
#include "rs_core.c"
uint8_t rs_parity(struct rs* r) {
return r->parity;
}
static struct rs* rs_cached[256] = {
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
bool rs_has_cpu_level(rs_cpu_level level) {
return rs_has_cpu_level_core(level);
}
static uint8_t rs_chosen_cpu_level = RS_CPU_SCALAR;
__attribute__((constructor))
void rs_detect_cpu_level() {
if (rs_has_cpu_level(RS_CPU_GFNI)) {
@@ -172,124 +102,46 @@ struct rs* rs_get(uint8_t parity) {
}
struct rs* r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
if (__builtin_expect(r == nullptr, 0)) {
r = rs_new(parity);
r = rs_new_core(parity);
if (r == nullptr) {
die("could not allocate RS data");
}
struct rs* expected = nullptr;
if (!__atomic_compare_exchange_n(&rs_cached[parity], &expected, r, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
// somebody else got to it first
rs_delete(r);
rs_delete_core(r);
r = __atomic_load_n(&rs_cached[parity], __ATOMIC_RELAXED);
}
}
return r;
}
uint8_t rs_parity(struct rs* r) {
return r->parity;
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_scalar_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_scalar(D, P, r, size, data, parity);
}
// This will emit vbroadcastb
__attribute__((no_sanitize("integer")))
inline __m256i broadcast_u8(uint8_t x) {
return _mm256_set_epi8(
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x,
x, x, x, x, x, x, x, x
);
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_avx2_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_avx2(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_single(struct rs* r, uint64_t i, const uint8_t** data, uint8_t** parity) {
parity[0][i] = 0;
for (int d = 0; d < D; d++) {
parity[0][i] ^= data[d][i];
}
for (int p = 1; p < P; p++) {
const uint8_t* factor = &r->expanded_matrix[D*32*p];
parity[p][i] = 0;
for (int d = 0; d < D; d++, factor += 32) {
parity[p][i] ^= gf_mul_expanded(data[d][i], factor);
}
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_scalar(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
// parity = r->matrix * data
for (uint64_t i = 0; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_avx2(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
__m256i low_nibble_mask = broadcast_u8(0x0f);
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
{
__m256i parity_0 = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
}
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
}
for (int p = 1; p < P; p++) {
__m256i parity_p = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]);
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask));
}
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
}
}
for (uint64_t i = avx_size; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
}
template<int D, int P>
__attribute__((noinline))
void rs_compute_parity_gfni(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
{
__m256i parity_0 = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i)));
}
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0);
}
for (int p = 1; p < P; p++) {
__m256i parity_p = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i));
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]);
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor));
}
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p);
}
}
for (uint64_t i = avx_size; i < size; i++) {
rs_compute_parity_single<D, P>(r, i, data, parity);
}
template<int D, int P> __attribute__((noinline))
static void rs_compute_parity_gfni_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
rs_compute_parity_gfni(D, P, r, size, data, parity);
}
template<int D, int P>
static void rs_compute_parity_tmpl(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_compute_parity_scalar<D, P>(r, size, data, parity);
rs_compute_parity_scalar_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_AVX2:
rs_compute_parity_avx2<D, P>(r, size, data, parity);
rs_compute_parity_avx2_tmpl<D, P>(r, size, data, parity);
break;
case RS_CPU_GFNI:
rs_compute_parity_gfni<D, P>(r, size, data, parity);
rs_compute_parity_gfni_tmpl<D, P>(r, size, data, parity);
break;
default:
die("bad cpu_level %d\n", l);
@@ -301,101 +153,32 @@ void rs_compute_parity(struct rs* r, uint64_t size, const uint8_t** data, uint8_
rs_compute_parity_funcs[r->parity](r, size, data, parity);
}
template<int D>
static void rs_recover_matmul_single(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul(have_to_want[j], have[j][i]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_scalar_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_scalar(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_single_expanded(uint64_t i, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want_expanded) {
want[i] = 0;
for (int j = 0; j < D; j++) {
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_avx2_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_avx2(D, size, have, want, mat);
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_scalar(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
uint8_t have_to_want_expanded[D*32];
for (int i = 0; i < D; i++) {
gf_mul_expand_factor(have_to_want[i], &have_to_want_expanded[i*32]);
}
for (size_t i = 0; i < size; i++) {
rs_recover_matmul_single_expanded<D>(i, have, want, have_to_want_expanded);
}
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_avx2(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
__m256i have_to_want_expanded[D];
for (int i = 0; i < D; i++) {
gf_mul_expand_factor(have_to_want[i], (uint8_t*)&have_to_want_expanded[i]);
}
__m256i low_nibble_mask = broadcast_u8(0x0f);
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
__m256i want_i = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
want_i = _mm256_xor_si256(
want_i,
gf_mul_expanded_avx2(
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
have_to_want_expanded[d],
low_nibble_mask
)
);
}
_mm256_storeu_si256((__m256i*)(want + i), want_i);
}
for (uint64_t i = avx_size; i < size; i++) {
rs_recover_matmul_single_expanded<D>(i, have, want, (const uint8_t*)have_to_want_expanded);
}
}
template<int D>
__attribute__((noinline))
static void rs_recover_matmul_gfni(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* have_to_want) {
__m256i have_to_want_avx[D];
for (int i = 0; i < D; i++) {
have_to_want_avx[i] = broadcast_u8(have_to_want[i]);
}
size_t avx_leftover = size % 32;
size_t avx_size = size-avx_leftover;
for (uint64_t i = 0; i < avx_size; i += 32) {
__m256i want_i = _mm256_setzero_si256();
for (int d = 0; d < D; d++) {
want_i = _mm256_xor_si256(
want_i,
_mm256_gf2p8mul_epi8(
_mm256_loadu_si256((const __m256i*)(have[d] + i)),
have_to_want_avx[d]
)
);
}
_mm256_storeu_si256((__m256i*)(want + i), want_i);
}
for (uint64_t i = avx_size; i < size; i++) {
rs_recover_matmul_single<D>(i, have, want, have_to_want);
}
template<int D> __attribute__((noinline))
static void rs_recover_matmul_gfni_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_gfni(D, size, have, want, mat);
}
template<int D>
static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
switch (rs_cpu_level l = rs_get_cpu_level()) {
case RS_CPU_SCALAR:
rs_recover_matmul_scalar<D>(size, have, want, mat);
rs_recover_matmul_scalar_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_AVX2:
rs_recover_matmul_avx2<D>(size, have, want, mat);
rs_recover_matmul_avx2_tmpl<D>(size, have, want, mat);
break;
case RS_CPU_GFNI:
rs_recover_matmul_gfni<D>(size, have, want, mat);
rs_recover_matmul_gfni_tmpl<D>(size, have, want, mat);
break;
default:
die("bad cpu_level %d\n", l);
@@ -403,7 +186,6 @@ static void rs_recover_matmul_tmpl(uint64_t size, const uint8_t** have, uint8_t*
}
static void (*rs_recover_matmul_funcs[16])(uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat);
void rs_recover(
struct rs* r,
uint64_t size,
@@ -412,55 +194,12 @@ void rs_recover(
uint8_t want_block,
uint8_t* want
) {
int D = rs_data_blocks(r->parity);
int B = rs_blocks(r->parity);
// Create some space
uint8_t* scratch = (uint8_t*)malloc(D*D + D*D);
uint8_t* mat_1 = scratch;
uint8_t* mat_2 = scratch + D*D;
// Preliminary checks
for (int i = 0; i < D; i++) {
if (have_blocks[i] >= B) {
die("have_blocks[%d]=%d >= %d\n", i, have_blocks[i], B);
rs_recover_core(
r, size, have_blocks, have, want_block, want,
[](int D, uint64_t size, const uint8_t** have, uint8_t* want, const uint8_t* mat) {
rs_recover_matmul_funcs[D](size, have, want, mat);
}
if (have_blocks[i] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", i, have_blocks[i], want_block);
}
if (i > 0 && have_blocks[i] <= have_blocks[i-1]) {
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", i, have_blocks[i], i, have_blocks[i-1]);
}
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
uint8_t* data_to_have = mat_1;
for (int i = 0, have_cursor = 0; i < B; i++) {
if (have_cursor >= D || have_blocks[have_cursor] != i) {
continue;
}
memcpy(data_to_have + have_cursor*D, r->matrix + i*D, D);
have_cursor++;
}
// [DxD] matrix going from what we have to the original data blocks
uint8_t* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
}
data_to_have = nullptr;
// [Dx1] matrix going from the data blocks to the block we want
uint8_t* data_to_want = &r->matrix[want_block*D];
// have_to_want = data_to_want * have_to_data
// [Dx1] matrix going from `blocks` to the block we're into
uint8_t* have_to_want = mat_1;
for (int i = 0; i < D; i++) {
have_to_want[i] = 0;
for (int j = 0; j < D; j++) {
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
}
}
// want = have_to_want * have
rs_recover_matmul_funcs[D](size, have, want, have_to_want);
// We're done.
free(scratch);
);
}
__attribute__((constructor))
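
A note on the gf_mul_expand_factor/gf_mul_expanded pair that moves into rs_core.c below: multiplication by a fixed y is GF(2)-linear, so x*y = (x & 0x0f)*y XOR (x & 0xf0)*y, and the 32-byte expanded factor simply tabulates those two nibble products; the AVX2 path then does the whole multiply with two byte shuffles. A scalar sketch of that identity (table externs from gf_tables.c, gf_mul written out as in rs_core.c):

#include <cassert>
#include <cstdint>

extern const uint8_t rs_gf_log_table[256]; // from gf_tables.c
extern const uint8_t rs_gf_exp_table[256];

static uint8_t gf_mul(uint8_t x, uint8_t y) { // as in rs_core.c
    if (x == 0 || y == 0) { return 0; }
    int i = rs_gf_log_table[x] + rs_gf_log_table[y];
    return rs_gf_exp_table[i > 254 ? i - 255 : i];
}

int main() {
    for (int y = 0; y < 256; y++) {
        // what gf_mul_expand_factor builds: products of y with every low and high nibble
        uint8_t expanded_y[32];
        for (int i = 0; i < 16; i++) expanded_y[i] = gf_mul(i, y);
        for (int i = 0; i < 16; i++) expanded_y[16 + i] = gf_mul(i << 4, y);
        for (int x = 0; x < 256; x++) {
            // what gf_mul_expanded computes: two table lookups XORed together
            uint8_t via_nibbles = expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
            assert(via_nibbles == gf_mul(x, y));
        }
    }
    return 0;
}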

cpp/rs/rs_core.c Normal file (451 lines)
View File

@@ -0,0 +1,451 @@
#ifndef __KERNEL__
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
#endif
extern const u8 rs_gf_inv_table[256];
extern const u8 rs_gf_log_table[256];
extern const u8 rs_gf_exp_table[256];
static inline u8 gf_inv(u8 x) {
return rs_gf_inv_table[x];
}
static inline u8 gf_mul(u8 x, u8 y) {
if (x == 0 || y == 0) {
return 0;
}
int i = rs_gf_log_table[x] + rs_gf_log_table[y];
return rs_gf_exp_table[i > 254 ? i - 255 : i];
}
static inline void gf_mul_expand_factor(u8 x, u8* expanded_x) {
int i;
for (i = 0; i < 16; i++) {
expanded_x[i] = gf_mul(i, x);
}
for (i = 0; i < 16; i++) {
expanded_x[16 + i] = gf_mul(i << 4, x);
}
}
static inline u8 gf_mul_expanded(u8 x, const u8* expanded_y) {
return expanded_y[x & 0x0f] ^ expanded_y[16 + ((x & 0xf0) >> 4)];
}
__attribute__((target("avx2")))
static inline __m256i gf_mul_expanded_avx2(__m256i x, __m256i expanded_y, __m256i low_nibble_mask) {
__m256i expanded_y_lo = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x00);
__m256i expanded_y_hi = _mm256_permute2x128_si256(expanded_y, expanded_y, 0x11);
__m256i x_lo = _mm256_and_si256(x, low_nibble_mask);
__m256i x_hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_nibble_mask);
return _mm256_xor_si256(
_mm256_shuffle_epi8(expanded_y_lo, x_lo),
_mm256_shuffle_epi8(expanded_y_hi, x_hi)
);
}
static inline u8 rs_data_blocks_core(u8 parity) {
return parity & 0x0F;
}
static inline u8 rs_parity_blocks_core(u8 parity) {
return parity >> 4;
}
static inline u8 rs_blocks_core(u8 parity) {
return rs_data_blocks_core(parity) + rs_parity_blocks_core(parity);
}
// From <https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/erasure_code/ec_base.c#L110>,
// just Gaussian elimination.
//
// TODO this is in row-major, it'd be nice to have it in column-major
// to have the final operation be more natural in rs_recover.
static bool rs_gf_invert_matrix(u8* in_mat, u8* out_mat, const int n) {
int i, j, k;
u8 temp;
// Set out_mat[] to the identity matrix
memset(out_mat, 0, n*n);
for (i = 0; i < n; i++) {
out_mat[i * n + i] = 1;
}
// Inverse
for (i = 0; i < n; i++) {
// Check for 0 in pivot element
if (in_mat[i * n + i] == 0) {
// Find a row with non-zero in current column and swap
for (j = i + 1; j < n; j++) {
if (in_mat[j * n + i]) {
break;
}
}
if (j == n) { // Couldn't find means it's singular
return false;
}
for (k = 0; k < n; k++) { // Swap rows i,j
temp = in_mat[i * n + k];
in_mat[i * n + k] = in_mat[j * n + k];
in_mat[j * n + k] = temp;
temp = out_mat[i * n + k];
out_mat[i * n + k] = out_mat[j * n + k];
out_mat[j * n + k] = temp;
}
}
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
}
for (j = 0; j < n; j++) {
if (j == i) {
continue;
}
temp = in_mat[j * n + i];
for (k = 0; k < n; k++) {
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
}
}
}
return true;
}
struct rs {
u8 parity;
// u8[D*B], in column-major.
u8* matrix;
// u8[D*P][32], in column-major. These are the lookup tables
// to perform multiplication quickly.
u8* expanded_matrix;
};
// Note that we pervasively assume that the first column of the parity columns
// is 1s, which causes the first parity to be the XORs of the data. So you can't
// really change how the matrix is generated.
static void rs_cauchy_matrix(struct rs* r) {
int D = rs_data_blocks_core(r->parity);
int B = rs_blocks_core(r->parity);
u8* matrix = r->matrix;
memset(matrix, 0, D*B);
// Identity in the d*d upper half
int i;
for (i = 0; i < D; i++) {
matrix[D*i + i] = 1;
}
// Fill in the rest using cauchy
int col, row;
for (col = D; col < B; col++) {
for (row = 0; row < D; row++) {
matrix[col*D + row] = gf_inv(col ^ row);
}
}
// Scale the columns
for (col = D; col < B; col++) {
u8 factor = gf_inv(matrix[col*D]);
for (row = 0; row < D; row++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
// Scale the rows
for (row = 1; row < D; row++) {
u8 factor = gf_inv(matrix[D*D + row]);
for (col = D; col < B; col++) {
matrix[col*D + row] = gf_mul(matrix[col*D + row], factor);
}
}
}
// out must be at least size 4
static void rs_cpuidex(u32 function_id, u32 subfunction_id, u32* out) {
u32 a, b, c, d;
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"0"(function_id),"2"(subfunction_id));
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
static bool rs_has_cpu_level_core(enum rs_cpu_level level) {
u32 _0_0[4];
rs_cpuidex(0, 0, _0_0);
u32 _7_0[4];
if (_0_0[0] >= 7) {
rs_cpuidex(7, 0, _7_0);
} else {
memset(_7_0, 0, sizeof(_7_0));
}
switch (level) {
case RS_CPU_SCALAR:
return true;
case RS_CPU_AVX2:
return _7_0[1] & (1<<5);
case RS_CPU_GFNI:
return _7_0[2] & (1<<8) && !rs_detect_valgrind();
default:
die("bad CPU level %d\n", level);
}
}
#define rs_compute_parity_single(D, P, r, i, data, parity) do { \
parity[0][i] = 0; \
int d; \
int p; \
for (d = 0; d < D; d++) { \
parity[0][i] ^= data[d][i]; \
} \
for (p = 1; p < P; p++) { \
const u8* factor = &r->expanded_matrix[D*32*p]; \
parity[p][i] = 0; \
for (d = 0; d < D; d++, factor += 32) { \
parity[p][i] ^= gf_mul_expanded(data[d][i], factor); \
} \
} \
} while (0)
#define rs_compute_parity_scalar(D, P, r, size, data, parity) do { \
/* parity = r->matrix * data */ \
u64 i; \
for (i = 0; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_compute_parity_avx2(D, P, r, size, data, parity) do { \
__m256i low_nibble_mask = broadcast_u8(0x0f); \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u32 i; \
int p; \
int d; \
for (i = 0; i < avx_size; i += 32) { \
{ \
__m256i parity_0 = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
} \
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
} \
for (p = 1; p < P; p++) { \
__m256i parity_p = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
__m256i factor = _mm256_loadu_si256((const __m256i*)&r->expanded_matrix[D*p*32 + 32*d]); \
parity_p = _mm256_xor_si256(parity_p, gf_mul_expanded_avx2(data_d, factor, low_nibble_mask)); \
} \
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
} \
} \
for (i = avx_size; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_compute_parity_gfni(D, P, r, size, data, parity) do { \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
int p; \
int d; \
for (i = 0; i < avx_size; i += 32) { \
{ \
__m256i parity_0 = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
parity_0 = _mm256_xor_si256(parity_0, _mm256_loadu_si256((const __m256i*)(data[d] + i))); \
} \
_mm256_storeu_si256((__m256i*)(parity[0] + i), parity_0); \
} \
for (p = 1; p < P; p++) { \
__m256i parity_p = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
__m256i data_d = _mm256_loadu_si256((const __m256i*)(data[d] + i)); \
__m256i factor = broadcast_u8(r->matrix[D*D + D*p + d]); \
parity_p = _mm256_xor_si256(parity_p, _mm256_gf2p8mul_epi8(data_d, factor)); \
} \
_mm256_storeu_si256((__m256i*)(parity[p] + i), parity_p); \
} \
} \
for (i = avx_size; i < size; i++) { \
rs_compute_parity_single(D, P, r, i, data, parity); \
} \
} while (0)
#define rs_recover_matmul_single(D, i, have, want, have_to_want) do { \
want[i] = 0; \
int j; \
for (j = 0; j < D; j++) { \
want[i] ^= gf_mul(have_to_want[j], have[j][i]); \
} \
} while (0)
#define rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded) do { \
want[i] = 0; \
int j; \
for (j = 0; j < D; j++) { \
want[i] ^= gf_mul_expanded(have[j][i], &have_to_want_expanded[j*32]); \
} \
} while(0)
#define rs_recover_matmul_scalar(D, size, have, want, have_to_want) do { \
u8 have_to_want_expanded[D*32]; \
int d; \
for (d = 0; d < D; d++) { \
gf_mul_expand_factor(have_to_want[d], &have_to_want_expanded[d*32]); \
} \
size_t i; \
for (i = 0; i < size; i++) { \
rs_recover_matmul_single_expanded(D, i, have, want, have_to_want_expanded); \
} \
} while (0) \
#define rs_recover_matmul_avx2(D, size, have, want, have_to_want) do { \
__m256i have_to_want_expanded[D]; \
int d; \
for (d = 0; d < D; d++) { \
gf_mul_expand_factor(have_to_want[d], ((u8*)&have_to_want_expanded[d])); \
} \
__m256i low_nibble_mask = broadcast_u8(0x0f); \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
for (i = 0; i < avx_size; i += 32) { \
__m256i want_i = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
want_i = _mm256_xor_si256( \
want_i, \
gf_mul_expanded_avx2( \
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
have_to_want_expanded[d], \
low_nibble_mask \
) \
); \
} \
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
} \
for (i = avx_size; i < size; i++) { \
rs_recover_matmul_single_expanded(D, i, have, want, ((const u8*)have_to_want_expanded)); \
} \
} while(0)
#define rs_recover_matmul_gfni(D, size, have, want, have_to_want) do { \
__m256i have_to_want_avx[D]; \
int d; \
for (d = 0; d < D; d++) { \
have_to_want_avx[d] = broadcast_u8(have_to_want[d]); \
} \
size_t avx_leftover = size % 32; \
size_t avx_size = size-avx_leftover; \
u64 i; \
for (i = 0; i < avx_size; i += 32) { \
__m256i want_i = _mm256_setzero_si256(); \
for (d = 0; d < D; d++) { \
want_i = _mm256_xor_si256( \
want_i, \
_mm256_gf2p8mul_epi8( \
_mm256_loadu_si256((const __m256i*)(have[d] + i)), \
have_to_want_avx[d] \
) \
); \
} \
_mm256_storeu_si256((__m256i*)(want + i), want_i); \
} \
for (i = avx_size; i < size; i++) { \
rs_recover_matmul_single(D, i, have, want, have_to_want); \
} \
} while (0)
static struct rs* rs_new_core(u8 parity) {
int B = rs_blocks_core(parity);
int D = rs_data_blocks_core(parity);
int P = rs_parity_blocks_core(parity);
struct rs* r = (struct rs*)rs_malloc(sizeof(struct rs) + B*D + D*P*32);
if (r == NULL) { return NULL; }
r->parity = parity;
r->matrix = (u8*)(r + 1);
r->expanded_matrix = r->matrix + B*D;
rs_cauchy_matrix(r);
int p;
int d;
for (p = 0; p < P; p++) {
for (d = 0; d < D; d++) {
gf_mul_expand_factor(r->matrix[D*D + D*p + d], &r->expanded_matrix[D*32*p + 32*d]);
}
}
return r;
}
static void rs_delete_core(struct rs* r) {
rs_free(r);
}
static void rs_recover_core(
struct rs* r,
u64 size,
const u8* have_blocks,
const u8** have,
u8 want_block,
u8* want,
void (*recover_func)(int D, u64 size, const u8** have, u8* want, const u8* mat)
) {
int D = rs_data_blocks_core(r->parity);
int B = rs_blocks_core(r->parity);
// Create some space
u8* scratch = (u8*)rs_malloc(D*D + D*D);
u8* mat_1 = scratch;
u8* mat_2 = scratch + D*D;
// Preliminary checks
int i, j, d, b;
for (d = 0; d < D; d++) {
if (have_blocks[d] >= B) {
die("have_blocks[%d]=%d >= %d\n", d, have_blocks[d], B);
}
if (have_blocks[d] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", d, have_blocks[d], want_block);
}
if (d > 0 && have_blocks[d] <= have_blocks[d-1]) {
die("have_blocks[%d]=%d <= have_blocks[%d-1]=%d\n", d, have_blocks[d], d, have_blocks[d-1]);
}
}
// below in the dimensionality annotation we paper over transposes
// [DxD] matrix going from the data blocks to the blocks we currently have
u8* data_to_have = mat_1;
int have_cursor;
for (b = 0, have_cursor = 0; b < B; b++) {
if (have_cursor >= D || have_blocks[have_cursor] != b) {
continue;
}
memcpy(data_to_have + have_cursor*D, r->matrix + b*D, D);
have_cursor++;
}
// [DxD] matrix going from what we have to the original data blocks
u8* have_to_data = mat_2;
if (!rs_gf_invert_matrix(data_to_have, have_to_data, D)) {
die("unexpected singular matrix\n");
}
data_to_have = NULL;
// [Dx1] matrix going from the data blocks to the block we want
u8* data_to_want = &r->matrix[want_block*D];
// have_to_want = data_to_want * have_to_data
// [Dx1] matrix going from `blocks` to the block we're into
u8* have_to_want = mat_1;
for (i = 0; i < D; i++) {
have_to_want[i] = 0;
for (j = 0; j < D; j++) {
have_to_want[i] ^= gf_mul(data_to_want[j], have_to_data[j*D + i]);
}
}
// want = have_to_want * have
recover_func(D, size, have, want, have_to_want);
// We're done.
rs_free(scratch);
}
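
Putting the pieces together, a hypothetical end-to-end use of the userspace API (the rs_get/rs_compute_parity/rs_recover signatures are inferred from the call sites in rs.cpp above, so treat them as assumptions; block indices 0..D-1 are data and D..B-1 are parity, and the parity byte packs D into the low nibble and P into the high one):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>
#include "rs.h" // assumed to declare rs_get, rs_compute_parity, rs_recover

int main() {
    const int D = 4, P = 4;              // RS(4,4)
    const uint64_t size = 4096;
    struct rs* r = rs_get((P << 4) | D); // parity byte 0x44

    // D data blocks followed by P parity blocks
    std::vector<std::vector<uint8_t>> blocks(D + P, std::vector<uint8_t>(size));
    for (int d = 0; d < D; d++)
        for (uint64_t i = 0; i < size; i++) blocks[d][i] = (uint8_t)(131*d + i);

    const uint8_t* data[D];
    uint8_t* parity[P];
    for (int d = 0; d < D; d++) data[d] = blocks[d].data();
    for (int p = 0; p < P; p++) parity[p] = blocks[D + p].data();
    rs_compute_parity(r, size, data, parity);

    // the first parity block is the plain XOR of the data blocks
    for (uint64_t i = 0; i < size; i++) {
        uint8_t x = 0;
        for (int d = 0; d < D; d++) x ^= blocks[d][i];
        assert(blocks[D][i] == x);
    }

    // pretend data block 1 was lost and rebuild it from blocks {0, 2, 3, 4}
    uint8_t have_blocks[D] = {0, 2, 3, 4}; // sorted, excludes the wanted block
    const uint8_t* have[D] = {blocks[0].data(), blocks[2].data(),
                              blocks[3].data(), blocks[4].data()};
    std::vector<uint8_t> recovered(size);
    rs_recover(r, size, have_blocks, have, /*want_block=*/1, recovered.data());
    assert(memcmp(recovered.data(), blocks[1].data(), size) == 0);
    return 0;
}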

View File

@@ -156,6 +156,8 @@
//
// TODO fill in results
static constexpr uint64_t EGGSFS_PAGE_SIZE = 4096;
static auto wrappedSnapshot(rocksdb::DB* db) {
auto deleter = [db](const rocksdb::Snapshot* snapshot) {
db->ReleaseSnapshot(snapshot);
@@ -726,7 +728,11 @@ struct ShardDBImpl {
beginKey().setOffset(req.byteOffset);
{
std::unique_ptr<rocksdb::Iterator> it(_db->NewIterator(options, _spansCf));
for (it->SeekForPrev(beginKey.toSlice()); it->Valid(); it->Next()) {
for (
it->SeekForPrev(beginKey.toSlice());
it->Valid() && (req.limit == 0 || resp.spans.els.size() < req.limit);
it->Next()
) {
auto key = ExternalValue<SpanKey>::FromSlice(it->key());
if (key().fileId() != req.fileId) {
break;
@@ -1258,6 +1264,11 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "inline span has bad storage class %s", req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
uint32_t expectedCrc = crc32c(0, req.body.data(), req.body.size());
expectedCrc = crc32c_zero_extend(expectedCrc, req.size - req.body.size());
@@ -1293,10 +1304,14 @@ struct ShardDBImpl {
LOG_DEBUG(_env, "bad storage class %s for blocks span", (int)req.storageClass);
return EggsError::BAD_SPAN_BODY;
}
if (req.byteOffset%EGGSFS_PAGE_SIZE != 0 || req.cellSize%EGGSFS_PAGE_SIZE != 0) {
LOG_DEBUG(_env, "req.byteOffset=%s or cellSize=%s is not a multiple of PAGE_SIZE=%s", req.byteOffset, req.cellSize, EGGSFS_PAGE_SIZE);
return EggsError::BAD_SPAN_BODY;
}
if (!_checkSpanBody(req)) {
return EggsError::BAD_SPAN_BODY;
}
// start filling in entry
entry.fileId = req.fileId;
entry.byteOffset = req.byteOffset;
@@ -1888,6 +1903,7 @@ struct ShardDBImpl {
return EggsError::MISMATCHING_TARGET;
}
if (edgeBody().creationTime() != creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edgeBody().creationTime(), creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edgeBody().targetIdWithLocked().extra()) { // locked
@@ -1992,6 +2008,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (edge().locked()) {
@@ -2046,6 +2063,7 @@ struct ShardDBImpl {
}
ExternalValue<CurrentEdgeBody> edge(edgeValue);
if (edge().creationTime() != entry.creationTime) {
LOG_DEBUG(_env, "expected time %s, got %s", edge().creationTime(), entry.creationTime);
return EggsError::MISMATCHING_CREATION_TIME;
}
if (!edge().locked()) {
@@ -2732,7 +2750,13 @@ struct ShardDBImpl {
const auto& cache = _blockServicesCache.at(blockServiceId.u64);
auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf));
return proof.proof == expectedProof;
bool good = proof.proof == expectedProof;
if (!good) {
LOG_DEBUG(_env, "bad block write proof, expected %s, got %s", BincodeFixedBytes<8>(expectedProof), proof);
}
return good;
}
std::array<uint8_t, 8> _blockEraseCertificate(uint32_t blockSize, const BlockBody block, const AES128Key& secretKey) {
@@ -3335,8 +3359,16 @@ DirectoryInfo defaultDirectoryInfo() {
hddBlocks.storageClass = storageClassByName("HDD");
addSegment(BLOCK_POLICY_TAG, blockPolicy);
// Span policy: up to 10MiB: RS(4,4), up to 100MiB (max span size): RS(10,4)
// Span policy:
// * up to 64KiB: RS(1,4). This mirroring span simplifies things in the kernel (so that
// we never have cell sizes that are not multiples of span sizes). We still set things
// up so that we can lose 4 copies and still be fine.
// * up to 10MiB: RS(4,4).
// * up to 100MiB (max span size): RS(10,4).
SpanPolicy spanPolicy;
auto& tinySpans = spanPolicy.entries.els.emplace_back();
tinySpans.maxSize = 1 << 16;
tinySpans.parity = Parity(1, 4);
auto& smallSpans = spanPolicy.entries.els.emplace_back();
smallSpans.maxSize = 10 << 20;
smallSpans.parity = Parity(4, 4);