Francesco Mazzoli
2023-02-17 10:39:31 +00:00
parent a387c6a0c8
commit 82cd4a3756
6 changed files with 94 additions and 124 deletions

View File

@@ -25,7 +25,22 @@ struct rs {
};
static struct rs* cached[256] = {
nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
};
static uint8_t* rs_cauchy_matrix(uint8_t parity) {
@@ -62,9 +77,6 @@ static uint8_t* rs_cauchy_matrix(uint8_t parity) {
static struct rs* rs_new(uint8_t parity) {
struct rs* r = (struct rs*)malloc_or_die(sizeof(struct rs), "cannot allocate 'struct rs'\n");
if (r == nullptr) {
die("can't allocate 'struct rs'\n");
}
r->parity = parity;
r->matrix = rs_cauchy_matrix(parity);
return r;
@@ -102,11 +114,11 @@ uint64_t rs_block_size(struct rs* r, uint64_t size) {
return (size + (d - 1)) / d;
}
void rs_compute_parity(struct rs* r, uint64_t block_size, const uint8_t** data, uint8_t** parity) {
void rs_compute_parity(struct rs* r, uint64_t size, const uint8_t** data, uint8_t** parity) {
int D = rs_data_blocks(r->parity);
int P = rs_parity_blocks(r->parity);
// parity = r->matrix * data
for (size_t i = 0; i < block_size; i++) {
for (size_t i = 0; i < size; i++) {
for (int j = 0; j < P; j++) {
const uint8_t* col = &r->matrix[D*D + j*D];
parity[j][i] = 0;
@@ -119,7 +131,7 @@ void rs_compute_parity(struct rs* r, uint64_t block_size, const uint8_t** data,
void rs_recover(
struct rs* r,
uint64_t block_size,
uint64_t size,
const uint8_t* have_blocks,
const uint8_t** have,
uint8_t want_block,
@@ -127,7 +139,6 @@ void rs_recover(
) {
int d = rs_data_blocks(r->parity);
int b = rs_blocks(r->parity);
bool have_all_data = true;
// Preliminary checks
for (int i = 0; i < d; i++) {
if (have_blocks[i] >= b) {
@@ -136,7 +147,6 @@ void rs_recover(
if (have_blocks[i] == want_block) {
die("have_blocks[%d]=%d == want_block=%d\n", i, have_blocks[i], want_block);
}
have_all_data = have_all_data && have_blocks[i] < d;
if (i == 0) {
continue;
}
@@ -174,7 +184,7 @@ void rs_recover(
}
}
// want = have_to_want * have
for (size_t i = 0; i < block_size; i++) {
for (size_t i = 0; i < size; i++) {
want[i] = 0;
for (int j = 0; j < d; j++) {
want[i] ^= gf_mul(have_to_want[j], have[j][i]);
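Both test suites below assert the same invariant: the first parity block is the byte-wise XOR of the data blocks, i.e. the first parity row of the Cauchy matrix is all ones. A self-contained sketch of just that special case (the general path goes through gf_mul as above); the constants are arbitrary:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define D 4      // data blocks
#define SIZE 8   // bytes per block

int main(void) {
    uint8_t data[D][SIZE];
    for (int i = 0; i < D; i++)
        for (int j = 0; j < SIZE; j++)
            data[i][j] = (uint8_t)(i*31 + j*7 + 1); // arbitrary contents
    // Parity block 0 is the XOR of the data blocks (all-ones matrix row).
    uint8_t parity0[SIZE] = {0};
    for (int i = 0; i < D; i++)
        for (int j = 0; j < SIZE; j++)
            parity0[j] ^= data[i][j];
    // "Lose" data block 2 and recover it from the other data blocks plus
    // parity block 0 -- the simplest instance of what rs_recover does.
    uint8_t recovered[SIZE];
    memcpy(recovered, parity0, SIZE);
    for (int i = 0; i < D; i++)
        if (i != 2)
            for (int j = 0; j < SIZE; j++)
                recovered[j] ^= data[i][j];
    assert(memcmp(recovered, data[2], SIZE) == 0);
    printf("recovered block 2 via XOR\n");
    return 0;
}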

View File

@@ -19,9 +19,6 @@
// of a uint8_t, but this is a fairly irrelevant quirk of EggsFS,
// although it does nicely enforce that we do not go beyond what's
// reasonable for data storage purposes (rather than for error correction).
//
// TODO allow to pass short data blocks so that the input won't have to
// be copied to compute the parity in the simple case.
#ifndef EGGS_RS
#define EGGS_RS
@@ -56,31 +53,24 @@ struct rs* rs_get(uint8_t parity);
uint8_t rs_parity(struct rs* rs);
// Tells us how big each block will be given a certain size we want to store.
//
// We'll have that `rs_block_size() * D >= size`, and the user
// is responsible for creating the data blocks by just filling the blocks
// with the data, and padding the remainder with zeros.
uint64_t rs_block_size(struct rs* rs, uint64_t size);
// Computes all parity blocks at once. This is what you use when you store
// something for the first time.
void rs_compute_parity(
struct rs* rs,
uint64_t block_size,
const uint8_t** data, // input, uint8_t[D][block_size]
uint8_t** parity // output, uint8_t[P][block_size]
uint64_t size,
const uint8_t** data, // input, uint8_t[D][size]
uint8_t** parity // output, uint8_t[P][size]
);
// Computes an arbitrary block given at least `D` other blocks.
// This is what you use to recover a lost block.
void rs_recover(
struct rs* rs,
uint64_t block_size,
uint64_t size,
const uint8_t* have_blocks, // [0, B)[D], must be sorted.
const uint8_t** blocks, // uint8_t[D][block_size]
const uint8_t** blocks, // uint8_t[D][size]
uint8_t want_block, // [0, B) and not in `have_blocks`
uint8_t* block // uint8_t[block_size]
uint8_t* block // uint8_t[size]
);
#ifdef __cplusplus
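To make the new convention concrete: every block is exactly `size` bytes, `rs_block_size` rounds up (`(size + D - 1) / D`, so 1001 bytes over D = 10 data blocks gives 101-byte blocks with a zero-padded tail), and the caller owns the block layout. A minimal sketch of a recovery call against the declarations above, linking against the C implementation; the helper name and the choice of surviving blocks are illustrative, not part of the API:

#include <stdint.h>
#include <stdlib.h>
#include "rs.h"

// Hypothetical helper: recover data block 0 from data blocks 1..d-1 plus the
// first parity block (block index d). `blocks` holds all block pointers in
// index order, each `size` bytes.
static void recover_first_data_block(
    struct rs* r, int d, uint64_t size, const uint8_t** blocks, uint8_t* out
) {
    uint8_t* have_blocks = malloc(d);
    const uint8_t** have = malloc(d * sizeof(*have));
    for (int i = 0; i < d; i++) {
        have_blocks[i] = (uint8_t)(i + 1); // 1..d: sorted, none is the wanted block
        have[i] = blocks[i + 1];
    }
    rs_recover(r, size, have_blocks, have, /*want_block=*/0, out);
    free(have);
    free(have_blocks);
}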

View File

@@ -1207,25 +1207,27 @@ struct ShardDBImpl {
}
}
} else {
// consistency check for the general case
//
// For parity mode 1+M (mirroring), for any M, the server can validate
// that the CRC of the span equals the CRC of every block.
// For N+M in the general case, it can probably validate that the CRC of
// the span equals the concatenation of the CRCs of the N, and that the
// CRC of the 1st of the M equals the XOR of the CRCs of the N.
// It cannot validate anything about the rest of the M though (scrubbing
// can validate the rest, if it wants to, but it would be too expensive
// for the shard to validate).
// Consistency check for the general case. Given what we do in
// `rs.h`, we know that the span is the concatenation of the
// data blocks, and that the first parity block is the XOR of the
// data blocks. We can't check the rest without the data though.
std::array<uint8_t, 4> blocksCrc32;
memset(&blocksCrc32[0], 0, 4);
std::array<uint8_t, 4> parity0Crc32;
memset(&parity0Crc32[0], 0, 4);
// Only the data blocks form the span, and only their CRCs feed parity block 0.
for (int i = 0; i < req.parity.dataBlocks(); i++) {
const auto& block = req.bodyBlocks.els[i];
blocksCrc32 = crc32cCombine(blocksCrc32, block.crc32.data, req.blockSize);
parity0Crc32[0] ^= block.crc32.data[0];
parity0Crc32[1] ^= block.crc32.data[1];
parity0Crc32[2] ^= block.crc32.data[2];
parity0Crc32[3] ^= block.crc32.data[3];
}
if (req.crc32.data != blocksCrc32) {
return false;
}
// TODO check parity block CRC32
if (req.bodyBlocks.els[req.parity.dataBlocks()].crc32.data != parity0Crc32) {
return false;
}
}
return true;
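The check above leans on CRC32C being computable over a concatenation without re-reading earlier blocks; `crc32cCombine` does this from two finished CRCs plus the length of the second buffer. A self-contained sketch of the underlying property in streaming form, assuming nothing from the codebase (the bitwise loop is for illustration only; real implementations use tables or the SSE4.2 instruction):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// CRC32C (Castagnoli, reflected polynomial 0x82F63B78) in composable form:
// passing a previous result as `crc` continues the stream, so the CRC of a
// span equals the chained CRC of its concatenated data blocks.
static uint32_t crc32c(uint32_t crc, const uint8_t* p, size_t n) {
    crc = ~crc; // crc = 0 yields the standard 0xFFFFFFFF initial state
    for (size_t i = 0; i < n; i++) {
        crc ^= p[i];
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0x82F63B78u & (uint32_t)-(int32_t)(crc & 1));
    }
    return ~crc; // standard final inversion; undone by the next chained call
}

int main(void) {
    uint8_t span[64];
    for (size_t i = 0; i < sizeof span; i++) span[i] = (uint8_t)(i*13 + 5);
    uint32_t whole = crc32c(0, span, sizeof span);
    uint32_t chained = crc32c(crc32c(0, span, 32), span + 32, 32); // two "blocks"
    assert(whole == chained);
    return 0;
}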

View File

@@ -639,54 +639,45 @@ TEST_CASE("crc32c") {
TEST_CASE("RS") {
uint64_t rand = 0;
for (int i = 0; i < 16*16*100; i++) {
uint8_t numData = 2 + splitmix64(rand)%(16-2);
uint8_t numParity = 1 + splitmix64(rand)%(16-1);
std::vector<uint8_t> data(1 + splitmix64(rand)%1000);
for (size_t i = 0; i < data.size(); i++) {
int numData = 2 + splitmix64(rand)%(16-2);
int numParity = 1 + splitmix64(rand)%(16-1);
int blockSize = 1 + splitmix64(rand)%100;
std::vector<uint8_t> data(blockSize*(numData+numParity)); // all blocks
for (size_t i = 0; i < numData*blockSize; i++) { // fill data blocks with random bytes
data[i] = splitmix64(rand) & 0xFF;
}
Parity parity(numData, numParity);
auto rs = rs_get(parity.u8);
size_t blockSize = rs_block_size(rs, data.size());
// prepare data blocks
std::vector<uint8_t> blocks(blockSize*parity.blocks(), 0);
memcpy(&blocks[0], &data[0], data.size());
std::vector<const uint8_t*> dataBlocksPtrs(parity.dataBlocks());
for (int i = 0; i < parity.dataBlocks(); i++) {
dataBlocksPtrs[i] = &blocks[i*blockSize];
auto rs = rs_get(Parity(numData, numParity).u8);
std::vector<uint8_t*> blocksPtrs(numData+numParity);
for (int i = 0; i < numData+numParity; i++) {
blocksPtrs[i] = &data[i*blockSize];
}
// prepare parity blocks
std::vector<uint8_t*> parityBlocksPtrs(parity.parityBlocks());
for (int i = 0; i < parity.parityBlocks(); i++) {
parityBlocksPtrs[i] = &blocks[(parity.dataBlocks()+i)*blockSize];
}
rs_compute_parity(rs, blockSize, &dataBlocksPtrs[0], &parityBlocksPtrs[0]);
rs_compute_parity(rs, blockSize, (const uint8_t**)&blocksPtrs[0], &blocksPtrs[numData]);
// verify that the first parity block is the XOR of all the original data
for (size_t i = 0; i < blockSize; i++) {
uint8_t expectedParity = 0;
for (int j = 0; j < parity.dataBlocks(); j++) {
expectedParity ^= dataBlocksPtrs[j][i];
for (int j = 0; j < numData; j++) {
expectedParity ^= data[j*blockSize + i];
}
REQUIRE(expectedParity == parityBlocksPtrs[0][i]);
REQUIRE(expectedParity == data[numData*blockSize + i]);
}
// restore a random block, using random blocks.
{
std::vector<uint8_t> allBlocks(parity.blocks());
for (int i = 0; i < parity.blocks(); i++) {
std::vector<uint8_t> allBlocks(numData+numParity);
for (int i = 0; i < allBlocks.size(); i++) {
allBlocks[i] = i;
}
for (int i = 0; i < std::min(parity.dataBlocks()+1, parity.blocks()-1); i++) {
std::swap(allBlocks[i], allBlocks[i+1+ splitmix64(rand)%(parity.blocks()-1-i)]);
for (int i = 0; i < std::min(numData+1, numData+numParity-1); i++) {
std::swap(allBlocks[i], allBlocks[i+1+ splitmix64(rand)%(numData+numParity-1-i)]);
}
std::sort(allBlocks.begin(), allBlocks.begin()+parity.dataBlocks());
std::vector<const uint8_t*> havePtrs(parity.dataBlocks());
for (int i = 0; i < parity.dataBlocks(); i++) {
havePtrs[i] = &blocks[allBlocks[i]*blockSize];
std::sort(allBlocks.begin(), allBlocks.begin()+numData);
std::vector<const uint8_t*> havePtrs(numData);
for (int i = 0; i < numData; i++) {
havePtrs[i] = &data[allBlocks[i]*blockSize];
}
uint8_t wantBlock = allBlocks[parity.dataBlocks()];
uint8_t wantBlock = allBlocks[numData];
std::vector<uint8_t> recoveredBlock(blockSize);
rs_recover(rs, blockSize, &allBlocks[0], &havePtrs[0], wantBlock, &recoveredBlock[0]);
std::vector<uint8_t> expectedBlock(blocks.begin() + wantBlock*blockSize, blocks.begin() + (wantBlock+1)*blockSize);
std::vector<uint8_t> expectedBlock(data.begin() + wantBlock*blockSize, data.begin() + (wantBlock+1)*blockSize);
REQUIRE(expectedBlock == recoveredBlock);
}
}
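The block-selection loop in this test is a partial shuffle of the identity permutation: randomize the first numData+1 slots, take the first numData (sorted) as the surviving blocks and the next one as the block to recover. (Its swap partner is always drawn strictly to the right, which skews the distribution a little but still produces distinct indices.) The same selection as a plain partial Fisher-Yates, for illustration:

#include <stdint.h>
#include <stdlib.h>

// Pick d+1 distinct block indices out of b: partially shuffle the identity
// permutation, keep the first d sorted (rs_recover requires sorted input),
// and use the (d+1)-th as the block to recover. Assumes 0 < d < b <= 256.
static void pick_blocks(int d, int b, uint8_t* have /* d entries */, uint8_t* want) {
    uint8_t all[256];
    for (int i = 0; i < b; i++) all[i] = (uint8_t)i;
    for (int i = 0; i <= d && i < b - 1; i++) {
        int j = i + rand() % (b - i); // unlike the loop above, a slot may keep its value
        uint8_t tmp = all[i]; all[i] = all[j]; all[j] = tmp;
    }
    for (int i = 1; i < d; i++) // insertion sort of the first d indices
        for (int j = i; j > 0 && all[j-1] > all[j]; j--) {
            uint8_t tmp = all[j]; all[j] = all[j-1]; all[j-1] = tmp;
        }
    for (int i = 0; i < d; i++) have[i] = all[i];
    *want = all[d];
}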

View File

@@ -73,30 +73,6 @@ func (r *Rs) Parity() Parity {
return Parity(C.rs_parity(r.r))
}
func (r *Rs) BlockSize(size int) int {
if size < 0 {
panic(fmt.Errorf("negative size %v", size))
}
return int(C.rs_block_size(r.r, C.ulong(size)))
}
func (r *Rs) SplitData(data []byte) [][]byte {
if len(data) == 0 {
panic("empty data")
}
dataBlocks := make([][]byte, r.Parity().DataBlocks())
blockSize := r.BlockSize(len(data))
for i := range dataBlocks {
dataBlocks[i] = make([]byte, blockSize)
// some blocks might be entirely empty, if the data is small and the number
// of data blocks is large
if i*blockSize <= len(data) {
copy(dataBlocks[i], data[i*blockSize:])
}
}
return dataBlocks
}
func (r *Rs) ComputeParity(data [][]byte) [][]byte {
parity := make([][]byte, r.Parity().ParityBlocks())
blockSize := len(data[0])
@@ -115,7 +91,7 @@ func (r *Rs) checkBlockSize(data [][]byte, parity [][]byte) int {
blockSize := len(data[0])
for i := range data {
if len(data[i]) != blockSize {
panic(fmt.Errorf("bad block size, expected %v, got %v", blockSize, len(data[i])))
panic(fmt.Errorf("differing block size, expected %v, got %v", blockSize, len(data[i])))
}
}
if parity != nil {
@@ -124,14 +100,10 @@ func (r *Rs) checkBlockSize(data [][]byte, parity [][]byte) int {
}
for i := range parity {
if len(parity[i]) != blockSize {
panic(fmt.Errorf("bad block size, expected %v, got %v", blockSize, len(parity[i])))
panic(fmt.Errorf("differing block size, expected %v, got %v", blockSize, len(parity[i])))
}
}
}
expectedBlockSize := r.BlockSize(blockSize * len(data))
if expectedBlockSize != blockSize {
panic(fmt.Errorf("bad block size, expected %v, got %v", expectedBlockSize, blockSize))
}
return blockSize
}
@@ -168,9 +140,22 @@ func (r *Rs) RecoverInto(
) {
blockSize := r.checkBlockSize(blocks, nil)
if len(block) != blockSize {
panic(fmt.Errorf("bad block size, expected %v, got %v", blockSize, len(block)))
panic(fmt.Errorf("differing block size, expected %v, got %v", blockSize, len(block)))
}
for i, haveBlock := range haveBlocks {
if int(haveBlock) >= r.Parity().DataBlocks()+r.Parity().ParityBlocks() {
panic(fmt.Errorf("have_blocks[%d]=%d >= %d", i, haveBlock, r.Parity().DataBlocks()+r.Parity().ParityBlocks()))
}
if haveBlock == wantBlock {
panic(fmt.Errorf("have_blocks[%d]=%d == want_block=%d", i, haveBlock, wantBlock))
}
if i == 0 {
continue
}
if haveBlock <= haveBlocks[i-1] {
panic(fmt.Errorf("have_blocks[%d]=%d <= have_blocks[%d-1]=%d", i, haveBlock, i, haveBlocks[i-1]))
}
}
// TODO check for sortedness etc.
blocksPtrs := (**C.uchar)(C.malloc(C.size_t(uintptr(r.Parity().DataBlocks()) * unsafe.Sizeof((*C.uchar)(nil)))))
for i := range blocks {
C.set_ptr(blocksPtrs, C.ulong(i), (*C.uchar)(&blocks[i][0]))
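Go code cannot index a `**C.uchar` directly, so the pointer array is filled through a small C helper. Presumably `set_ptr` is defined in the cgo preamble along these lines (the signature is inferred from the call sites above, so treat it as an assumption):

static void set_ptr(unsigned char** arr, unsigned long i, unsigned char* p) {
    arr[i] = p;
}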

View File

@@ -19,31 +19,23 @@ func TestComputeParity(t *testing.T) {
for i := 0; i < 16*16*100; i++ {
numData := int(2 + rand.Uint32()%(16-2))
numParity := int(1 + rand.Uint32()%(16-1))
data := make([]byte, 1+int(rand.Uint32()%1000))
rand.Read(data)
blockSize := 1 + int(rand.Uint32()%100)
data := make([]byte, blockSize*(numData+numParity))
rand.Read(data[:blockSize*numData])
blocks := make([][]byte, numData+numParity)
for i := range blocks {
blocks[i] = data[i*blockSize : (i+1)*blockSize]
}
rs := GetRs(MkParity(uint8(numData), uint8(numParity)))
dataBlocks := rs.SplitData(data)
parityBlocks := rs.ComputeParity(dataBlocks)
allBlocks := append(append([][]byte{}, dataBlocks...), parityBlocks...)
assert.Equal(t, rs.Parity().DataBlocks(), len(dataBlocks))
assert.Equal(t, rs.Parity().ParityBlocks(), len(parityBlocks))
// Verify that the concatenation is the original data
concatData := []byte{}
for _, dataBlock := range dataBlocks {
concatData = append(concatData, dataBlock...)
}
assert.Equal(t, data, concatData[:len(data)])
for _, b := range concatData[len(data):] {
assert.Equal(t, uint8(0), b)
}
rs.ComputeParityInto(blocks[:numData], blocks[numData:])
// Verify that the first parity block is the XOR of all the data blocks
expectedParity0 := make([]byte, len(dataBlocks[0]))
for _, dataBlock := range dataBlocks {
for i, b := range dataBlock {
expectedParity0[i] ^= b
expectedParity0 := make([]byte, blockSize)
for i := 0; i < numData; i++ {
for j, b := range blocks[i] {
expectedParity0[j] ^= b
}
}
assert.Equal(t, expectedParity0, parityBlocks[0])
assert.Equal(t, expectedParity0, blocks[numData])
// Restore a random block
{
haveBlocks := make([]uint8, numData+numParity)
@@ -62,10 +54,10 @@ func TestComputeParity(t *testing.T) {
sort.Slice(haveBlocks, func(i, j int) bool { return haveBlocks[i] < haveBlocks[j] })
have := make([][]byte, numData)
for i := range have {
have[i] = allBlocks[haveBlocks[i]]
have[i] = blocks[haveBlocks[i]]
}
recoveredBlock := rs.Recover(haveBlocks, have, wantBlock)
assert.Equal(t, allBlocks[wantBlock], recoveredBlock)
assert.Equal(t, blocks[wantBlock], recoveredBlock)
}
}
}