Cleanup CRC32C code

Specifically:

* Extend tables so that they won't wrap around wrongly for big sizes
  (we would never hit this since our blocks are at most a few MBs big).
* Use CRC instructions to compute remainders.

committed by Francesco Mazzoli
parent 3eb40cfee0
commit 136d55bff0
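The second bullet refers to letting the SSE4.2 crc32 instruction do the polynomial reduction: in the reflected Castagnoli domain, _mm_crc32_u32(0, v) computes v * x^32 mod P, so a 64-bit carry-less product can be reduced with one crc32 plus one XOR. A minimal standalone sketch of the idea (not repo code; assumes an x86-64 compiler with -msse4.2):

// Sketch only: reduce a 64-bit reflected polynomial a modulo the
// Castagnoli polynomial P. In this representation the low 32 bits hold
// the degree-32..63 terms, so folding them with crc32 (a multiply by
// x^32 mod P) lines them up with the high 32 bits.
#include <nmmintrin.h> // _mm_crc32_u32
#include <stdint.h>

static uint32_t mod_p(uint64_t a) {
    return _mm_crc32_u32(0, (uint32_t)a) ^ (uint32_t)(a >> 32);
}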
@@ -4,9 +4,7 @@
 include_directories(${ternfs_SOURCE_DIR}/core)
-add_library(crc32c crc32c.h crc32c.c iscsi.h)
-add_executable(crc32c-tables tables.cpp iscsi.h)
+add_library(crc32c crc32c.h crc32c.c crc32c_pclmul.c)
 add_executable(crc32c-tests tests.cpp)
 target_link_libraries(crc32c-tests PRIVATE crc32c core)
@@ -1,300 +1,15 @@
// Copyright 2022 Peter Cawley <corsix@corsix.org>
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef __KERNEL__

#include "crc32c.h"

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

#define kernel_static
#include <sys/types.h>

typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;

#else

#define kernel_static static

#endif

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32_fusion_kernel(u32 acc_a, const char* buf, size_t n_blocks) {
    size_t stride = n_blocks * 24 + 8;
    // Four chunks:
    // Chunk A: 0 through stride
    // Chunk B: stride through stride*2
    // Chunk C: stride*2 through stride*3-8
    // Chunk D: stride*3-8 through n_blocks*136+16
    // First block of 64 from D is easy.
    const char* buf2 = buf + n_blocks * 72 + 16;
    __m128i x1 = _mm_loadu_si128((__m128i*)buf2);
    __m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
    __m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
    __m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
    u32 acc_b = 0;
    u32 acc_c = 0;
    // Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
    // k1 == magic(4*128+32-1)
    // k2 == magic(4*128-32-1)
    __m128i k1k2 = _mm_setr_epi32(/*k1*/ 0x740EEF02, 0, /*k2*/ 0x9E4ADDF8, 0);
    const char* end = buf + (n_blocks * 136 + 16) - 64;
    while (buf2 < end) {
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
        __m128i x5 = _mm_clmulepi64_si128(x1, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride));
        x1 = _mm_clmulepi64_si128(x1, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2));
        __m128i x6 = _mm_clmulepi64_si128(x2, k1k2, 0x00);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
        x2 = _mm_clmulepi64_si128(x2, k1k2, 0x11);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 8));
        __m128i x7 = _mm_clmulepi64_si128(x3, k1k2, 0x00);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 8));
        x3 = _mm_clmulepi64_si128(x3, k1k2, 0x11);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
        __m128i x8 = _mm_clmulepi64_si128(x4, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 16));
        x4 = _mm_clmulepi64_si128(x4, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 16));
        x5 = _mm_xor_si128(x5, _mm_loadu_si128((__m128i*)(buf2 + 64)));
        x1 = _mm_xor_si128(x1, x5);
        x6 = _mm_xor_si128(x6, _mm_loadu_si128((__m128i*)(buf2 + 80)));
        x2 = _mm_xor_si128(x2, x6);
        x7 = _mm_xor_si128(x7, _mm_loadu_si128((__m128i*)(buf2 + 96)));
        x3 = _mm_xor_si128(x3, x7);
        x8 = _mm_xor_si128(x8, _mm_loadu_si128((__m128i*)(buf2 + 112)));
        x4 = _mm_xor_si128(x4, x8);
        buf2 += 64;
        buf += 24;
    }
    // Next 24 bytes from A/B/C, and 8 more from A/B, then merge A/B/C.
    // Meanwhile, fold together D's four parallel streams.
    // k3 == magic(128+32-1)
    // k4 == magic(128-32-1)
    __m128i k3k4 = _mm_setr_epi32(/*k3*/ 0xF20C0DFE, 0, /*k4*/ 0x493C7D27, 0);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
    __m128i x5 = _mm_clmulepi64_si128(x1, k3k4, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride));
    x1 = _mm_clmulepi64_si128(x1, k3k4, 0x11);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2));
    __m128i x6 = _mm_clmulepi64_si128(x3, k3k4, 0x00);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
    x3 = _mm_clmulepi64_si128(x3, k3k4, 0x11);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 8));
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 8));
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 16));
    x5 = _mm_xor_si128(x5, x2);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 16));
    x1 = _mm_xor_si128(x1, x5);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 24));
    // k5 == magic(2*128+32-1)
    // k6 == magic(2*128-32-1)
    __m128i k5k6 = _mm_setr_epi32(/*k5*/ 0x3DA6D0CB, 0, /*k6*/ 0xBA4FC28E, 0);
    x6 = _mm_xor_si128(x6, x4);
    x3 = _mm_xor_si128(x3, x6);
    x5 = _mm_clmulepi64_si128(x1, k5k6, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 24));
    x1 = _mm_clmulepi64_si128(x1, k5k6, 0x11);

    // Compute the magic numbers which depend upon n_blocks
    // (required for merging A/B/C/D)
    uint64_t bits_c = n_blocks*64 - 33;
    uint64_t bits_b = bits_c + stride - 8;
    uint64_t bits_a = bits_b + stride;
    uint64_t stack_a = ~(uint64_t)8;
    uint64_t stack_b = stack_a;
    uint64_t stack_c = stack_a;
    while (bits_a > 191) {
        stack_a = (stack_a << 1) + (bits_a & 1); bits_a = (bits_a >> 1) - 16;
        stack_b = (stack_b << 1) + (bits_b & 1); bits_b = (bits_b >> 1) - 16;
        stack_c = (stack_c << 1) + (bits_c & 1); bits_c = (bits_c >> 1) - 16;
    }
    stack_a = ~stack_a;
    stack_b = ~stack_b;
    stack_c = ~stack_c;
    u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
    u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
    u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
    bits_a -= bits_b;
    bits_b -= bits_c;
    for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
    for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
    for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
    for (;;) {
        u32 low = stack_a & 1;
        if (!(stack_a >>= 1)) break;
        __m128i x = _mm_cvtsi32_si128(magic_a);
        uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_a = _mm_crc32_u64(0, y << low);
        x = _mm_cvtsi32_si128(magic_c);
        y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_c = _mm_crc32_u64(0, y << (stack_c & 1));
        stack_c >>= 1;
        x = _mm_cvtsi32_si128(magic_b);
        y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_b = _mm_crc32_u64(0, y << (stack_b & 1));
        stack_b >>= 1;
    }
    __m128i vec_c = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_c), _mm_cvtsi32_si128(magic_c), 0x00);
    __m128i vec_a = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_a), _mm_cvtsi32_si128(magic_a), 0x00);
    __m128i vec_b = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_b), _mm_cvtsi32_si128(magic_b), 0x00);
    x5 = _mm_xor_si128(x5, x3);
    x1 = _mm_xor_si128(x1, x5);
    uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
    // Apply missing <<32 and fold down to 32-bits.
    u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
    crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
    return crc;
}

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c(u32 crc, const char* buf, size_t length) {
    crc = ~crc; // preset to -1 (distinguish leading zeros)
    if (length >= 31) {
        size_t n_blocks = (length - 16) / 136;
        size_t kernel_length = n_blocks * 136 + 16;
        if (kernel_length + (-((uintptr_t)buf + n_blocks * 8) & 15) > length) {
            n_blocks -= 1;
            kernel_length -= 136;
        }
        const char* kernel_end = (const char*)((uintptr_t)(buf + kernel_length + 15) & ~(uintptr_t)15);
        const char* kernel_start = kernel_end - kernel_length;
        length -= kernel_start - buf;
        for (; buf != kernel_start; ++buf) {
            crc = _mm_crc32_u8(crc, *(const u8*)buf);
        }
        if (n_blocks) {
            length -= kernel_length;
            crc = crc32_fusion_kernel(crc, buf, n_blocks);
            buf = kernel_end;
        }
    }
    for (; length >= 8; length -= 8, buf += 8) {
        crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
    }
    for (; length; --length, ++buf) {
        crc = _mm_crc32_u8(crc, *(const u8*)buf);
    }
    return ~crc; // post-invert -- distinguish scaled multiples
}

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
kernel_static u32 crc32c_simple(u32 crc, const char* buf, size_t length) {
    crc = ~crc; // preset to -1 (distinguish leading zeros)
    for (; length >= 8; length -= 8, buf += 8) {
        crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
    }
    for (; length; --length, ++buf) {
        crc = _mm_crc32_u8(crc, *(const u8*)buf);
    }
    return ~crc; // post-invert -- distinguish scaled multiples
}

#include "iscsi.h"

// In the comments below, multiplication is meant to be modulo ISCSI_POLY.

// Stores x^2^0, ..., x^2^31. Can be generated with `mult_mod_p`, see tables.cpp.
static u32 CRC_POWER_TABLE[31] = {
    0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a,
    0x35d73a62, 0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6,
    0xf946610b, 0x3c204f8f, 0x538586e3, 0x59726915, 0x734d5309,
    0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc, 0x05b74f3f,
    0xa51e1f42,
};

// Stores x^-(2^0), ..., x^-(2^31). Can be generated with `mult_mod_p`, see tables.cpp.
static u32 CRC_INVERSE_POWER_TABLE[31] = {
    0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
    0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96,
    0xcedac2cc, 0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916,
    0x10ba2894, 0x6077197b, 0x98448e4e, 0x8baf845d, 0xe93e07fc,
    0xf58027d7, 0x5e2b422d, 0x9db2851c, 0x9270ed25, 0x5984e7b3,
    0x7af026f1, 0xe0f4116b, 0xace8a6b0, 0x9e09f006, 0x6a60ea71,
    0x4fd04875,
};

// Return x^(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static u32 x2n_mod_p(size_t n, u32 k) {
    // CRC_POWER_TABLE has all powers of two; we can multiply combinations
    // of these to achieve any power.
    //
    // Decompose n into powers of two, say `2^(p_0) + ... + 2^(p_m)`,
    // for some number of powers `m`.
    //
    // Then we have
    //
    //     x^(n * 2^k)
    //   = x^((2^(p_0) + ... + 2^(p_m)) * 2^k)
    //   = x^(2^(p_0 + k) + ... + 2^(p_m + k))
    //   = x^(2^(p_0 + k)) * ... * x^(2^(p_m + k))
    //
    // We walk along the p_is above and keep adding to the result.
    //
    // Note that 2^2^(31 + k) = 2^2^k TODO clarify
    u32 p = 1u << 31;
    for (; n != 0; n >>= 1, k++) {
        if (n & 1) {
            // p(x) = p(x) * 2^(k % 31)
            p = crc32c_mult_mod_p(CRC_POWER_TABLE[k & 0x1F], p);
        }
    }
    return p;
}

// Return x^-(n * 2^k), or in other words, the factor to use to extend a CRC with zeros.
static u32 x2n_mod_p_inv(size_t n, u32 k) {
    u32 p = 1u << 31;
    for (; n != 0; n >>= 1, k++) {
        if (n & 1) {
            p = crc32c_mult_mod_p(CRC_INVERSE_POWER_TABLE[k & 0x1F], p);
        }
    }
    return p;
}

kernel_static u32 crc32c_zero_extend(u32 crc, ssize_t zeros) {
    if (zeros > 0) {
        return ~crc32c_mult_mod_p(x2n_mod_p(zeros, 3), ~crc);
    } else {
        return ~crc32c_mult_mod_p(x2n_mod_p_inv(-zeros, 3), ~crc);
    }
}

kernel_static u32 crc32c_append(u32 crc_a, u32 crc_b, size_t len_b) {
    // We need to extend crc_a with len_b*8 zeros (len_b is the number
    // of bytes) (without the inversion).
    // This amounts to performing `crc_a * x^(len_b*8)`.
    return crc32c_mult_mod_p(x2n_mod_p(len_b, 3), crc_a) ^ crc_b;
}

kernel_static u32 crc32c_xor(u32 crc_a, u32 crc_b, size_t len) {
    // We need to extend crc_a with the crc of len*8 bits.
    // We could do this in 32 steps rather than 32+32 with a dedicated
    // table, but it probably doesn't matter.
    u32 crc_0 = ~crc32c_mult_mod_p(~(u32)0, x2n_mod_p(len, 3));
    return crc_a ^ crc_b ^ crc_0;
}
#define CRC32C_NAME(a) a
#include "crc32c_body.c"
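For the first bullet of the commit message, the relevant change is the move from the 31-entry tables above, indexed with k & 0x1F, to the 64-entry tables in crc32c_body.c below. A hedged illustration of where the old indexing goes wrong (hypothetical driver loop, not code from the repo):

// With x2n_mod_p(n, 3) the table index k starts at 3 and grows with each
// bit of n, so k & 0x1F stops matching k once n has bits at position 29
// and above, i.e. zero-extensions of roughly half a gigabyte and up; the
// masked index appears able to reach 31, one past the end of a 31-entry
// table. The blocks here are a few MBs at most, hence "we would never
// hit this".
size_t n = (size_t)1 << 29; // 512 MiB of zeros
for (uint32_t k = 3; n; n >>= 1, k++) {
    if (n & 1) {
        // old: CRC_POWER_TABLE[k & 0x1F] -- here k == 32, index wraps to 0
        // new: CRC_POWER_TABLE[k]        -- 64 entries, no wrap
    }
}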
@@ -9,6 +9,9 @@
 // we just invert the crc at the beginning and at the end.
 //
 // See <https://en.wikipedia.org/wiki/Computation_of_cyclic_redundancy_checks#CRC_variants>.
+//
+// The pclmul versions use pclmul instructions, and are therefore generally faster. They're
+// otherwise identical to the non-pclmul versions.
 #ifndef TERN_CRC32C
 #define TERN_CRC32C

@@ -40,6 +43,11 @@ uint32_t crc32c_append(uint32_t crc1, uint32_t crc2, size_t len2);
 // zeroes.
 uint32_t crc32c_zero_extend(uint32_t crc, ssize_t zeros);

+uint32_t crc32c_pclmul(uint32_t crc, const char* buf, size_t len);
+uint32_t crc32c_xor_pclmul(uint32_t crc1, uint32_t crc2, size_t len);
+uint32_t crc32c_append_pclmul(uint32_t crc1, uint32_t crc2, size_t len2);
+uint32_t crc32c_zero_extend_pclmul(uint32_t crc, ssize_t zeros);
+
 #ifdef __cplusplus
 }
 #endif
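The new declarations give callers an explicit choice between the two code paths. A hypothetical usage sketch (both variants assume an x86-64 CPU; the plain ones need SSE4.2, the _pclmul ones additionally PCLMULQDQ):

#include <assert.h>
#include "crc32c.h"

// Hypothetical caller: prefer the pclmul variant, cross-check in debug.
uint32_t checksum(const char* buf, size_t len) {
    uint32_t c = crc32c_pclmul(0, buf, len); // 4k fusion kernel
    assert(c == crc32c(0, buf, len));        // crc32-instruction-only path agrees
    return c;
}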
cpp/crc32c/crc32c_body.c (new file, 313 lines)
@@ -0,0 +1,313 @@
// Copyright 2025 XTX Markets Technologies Limited
// crc32c_4k_fusion Copyright 2022 Peter Cawley <corsix@corsix.org>
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// See <https://www.corsix.org/content/fast-crc32c-4k> for the 4k fusion kernel
// and <https://mazzo.li/posts/crc-tips.html> for the other transformations.

#if CRC32C_USE_PCLMUL

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32c_4k_fusion(u32 acc_a, const char* buf, size_t n_blocks) {
    size_t stride = n_blocks * 24 + 8;
    // Four chunks:
    // Chunk A: 0 through stride
    // Chunk B: stride through stride*2
    // Chunk C: stride*2 through stride*3-8
    // Chunk D: stride*3-8 through n_blocks*136+16
    // First block of 64 from D is easy.
    const char* buf2 = buf + n_blocks * 72 + 16;
    __m128i x1 = _mm_loadu_si128((__m128i*)buf2);
    __m128i x2 = _mm_loadu_si128((__m128i*)(buf2 + 16));
    __m128i x3 = _mm_loadu_si128((__m128i*)(buf2 + 32));
    __m128i x4 = _mm_loadu_si128((__m128i*)(buf2 + 48));
    u32 acc_b = 0;
    u32 acc_c = 0;
    // Parallel fold remaining blocks of 64 from D, and 24 from each of A/B/C.
    // k1 == magic(4*128+32-1)
    // k2 == magic(4*128-32-1)
    __m128i k1k2 = _mm_setr_epi32(/*k1*/ 0x740EEF02, 0, /*k2*/ 0x9E4ADDF8, 0);
    const char* end = buf + (n_blocks * 136 + 16) - 64;
    while (buf2 < end) {
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
        __m128i x5 = _mm_clmulepi64_si128(x1, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride));
        x1 = _mm_clmulepi64_si128(x1, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2));
        __m128i x6 = _mm_clmulepi64_si128(x2, k1k2, 0x00);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
        x2 = _mm_clmulepi64_si128(x2, k1k2, 0x11);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 8));
        __m128i x7 = _mm_clmulepi64_si128(x3, k1k2, 0x00);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 8));
        x3 = _mm_clmulepi64_si128(x3, k1k2, 0x11);
        acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
        __m128i x8 = _mm_clmulepi64_si128(x4, k1k2, 0x00);
        acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 16));
        x4 = _mm_clmulepi64_si128(x4, k1k2, 0x11);
        acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 16));
        x5 = _mm_xor_si128(x5, _mm_loadu_si128((__m128i*)(buf2 + 64)));
        x1 = _mm_xor_si128(x1, x5);
        x6 = _mm_xor_si128(x6, _mm_loadu_si128((__m128i*)(buf2 + 80)));
        x2 = _mm_xor_si128(x2, x6);
        x7 = _mm_xor_si128(x7, _mm_loadu_si128((__m128i*)(buf2 + 96)));
        x3 = _mm_xor_si128(x3, x7);
        x8 = _mm_xor_si128(x8, _mm_loadu_si128((__m128i*)(buf2 + 112)));
        x4 = _mm_xor_si128(x4, x8);
        buf2 += 64;
        buf += 24;
    }
    // Next 24 bytes from A/B/C, and 8 more from A/B, then merge A/B/C.
    // Meanwhile, fold together D's four parallel streams.
    // k3 == magic(128+32-1)
    // k4 == magic(128-32-1)
    __m128i k3k4 = _mm_setr_epi32(/*k3*/ 0xF20C0DFE, 0, /*k4*/ 0x493C7D27, 0);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)buf);
    __m128i x5 = _mm_clmulepi64_si128(x1, k3k4, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride));
    x1 = _mm_clmulepi64_si128(x1, k3k4, 0x11);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2));
    __m128i x6 = _mm_clmulepi64_si128(x3, k3k4, 0x00);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 8));
    x3 = _mm_clmulepi64_si128(x3, k3k4, 0x11);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 8));
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 8));
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 16));
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 16));
    x5 = _mm_xor_si128(x5, x2);
    acc_c = _mm_crc32_u64(acc_c, *(uint64_t*)(buf + stride*2 + 16));
    x1 = _mm_xor_si128(x1, x5);
    acc_a = _mm_crc32_u64(acc_a, *(uint64_t*)(buf + 24));
    // k5 == magic(2*128+32-1)
    // k6 == magic(2*128-32-1)
    __m128i k5k6 = _mm_setr_epi32(/*k5*/ 0x3DA6D0CB, 0, /*k6*/ 0xBA4FC28E, 0);
    x6 = _mm_xor_si128(x6, x4);
    x3 = _mm_xor_si128(x3, x6);
    x5 = _mm_clmulepi64_si128(x1, k5k6, 0x00);
    acc_b = _mm_crc32_u64(acc_b, *(uint64_t*)(buf + stride + 24));
    x1 = _mm_clmulepi64_si128(x1, k5k6, 0x11);

    // Compute the magic numbers which depend upon n_blocks
    // (required for merging A/B/C/D)
    uint64_t bits_c = n_blocks*64 - 33;
    uint64_t bits_b = bits_c + stride - 8;
    uint64_t bits_a = bits_b + stride;
    uint64_t stack_a = ~(uint64_t)8;
    uint64_t stack_b = stack_a;
    uint64_t stack_c = stack_a;
    while (bits_a > 191) {
        stack_a = (stack_a << 1) + (bits_a & 1); bits_a = (bits_a >> 1) - 16;
        stack_b = (stack_b << 1) + (bits_b & 1); bits_b = (bits_b >> 1) - 16;
        stack_c = (stack_c << 1) + (bits_c & 1); bits_c = (bits_c >> 1) - 16;
    }
    stack_a = ~stack_a;
    stack_b = ~stack_b;
    stack_c = ~stack_c;
    u32 magic_a = ((u32)0x80000000) >> (bits_a & 31); bits_a >>= 5;
    u32 magic_b = ((u32)0x80000000) >> (bits_b & 31); bits_b >>= 5;
    u32 magic_c = ((u32)0x80000000) >> (bits_c & 31); bits_c >>= 5;
    bits_a -= bits_b;
    bits_b -= bits_c;
    for (; bits_c; --bits_c) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0), magic_c = _mm_crc32_u32(magic_c, 0);
    for (; bits_b; --bits_b) magic_a = _mm_crc32_u32(magic_a, 0), magic_b = _mm_crc32_u32(magic_b, 0);
    for (; bits_a; --bits_a) magic_a = _mm_crc32_u32(magic_a, 0);
    for (;;) {
        u32 low = stack_a & 1;
        if (!(stack_a >>= 1)) break;
        __m128i x = _mm_cvtsi32_si128(magic_a);
        uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_a = _mm_crc32_u64(0, y << low);
        x = _mm_cvtsi32_si128(magic_c);
        y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_c = _mm_crc32_u64(0, y << (stack_c & 1));
        stack_c >>= 1;
        x = _mm_cvtsi32_si128(magic_b);
        y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0));
        magic_b = _mm_crc32_u64(0, y << (stack_b & 1));
        stack_b >>= 1;
    }
    __m128i vec_c = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_c), _mm_cvtsi32_si128(magic_c), 0x00);
    __m128i vec_a = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_a), _mm_cvtsi32_si128(magic_a), 0x00);
    __m128i vec_b = _mm_clmulepi64_si128(_mm_cvtsi32_si128(acc_b), _mm_cvtsi32_si128(magic_b), 0x00);
    x5 = _mm_xor_si128(x5, x3);
    x1 = _mm_xor_si128(x1, x5);
    uint64_t abc = _mm_cvtsi128_si64(_mm_xor_si128(_mm_xor_si128(vec_c, vec_a), vec_b));
    // Apply missing <<32 and fold down to 32-bits.
    u32 crc = _mm_crc32_u64(0, _mm_extract_epi64(x1, 0));
    crc = _mm_crc32_u64(crc, abc ^ _mm_extract_epi64(x1, 1));
    return crc;
}

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
u32 CRC32C_NAME(crc32c)(u32 crc, const char* buf, size_t length) {
    crc = ~crc; // preset to -1 (distinguish leading zeros)
    if (length >= 31) {
        size_t n_blocks = (length - 16) / 136;
        size_t kernel_length = n_blocks * 136 + 16;
        if (kernel_length + (-((uintptr_t)buf + n_blocks * 8) & 15) > length) {
            n_blocks -= 1;
            kernel_length -= 136;
        }
        const char* kernel_end = (const char*)((uintptr_t)(buf + kernel_length + 15) & ~(uintptr_t)15);
        const char* kernel_start = kernel_end - kernel_length;
        length -= kernel_start - buf;
        for (; buf != kernel_start; ++buf) {
            crc = _mm_crc32_u8(crc, *(const u8*)buf);
        }
        if (n_blocks) {
            length -= kernel_length;
            crc = crc32c_4k_fusion(crc, buf, n_blocks);
            buf = kernel_end;
        }
    }
    for (; length >= 8; length -= 8, buf += 8) {
        crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
    }
    for (; length; --length, ++buf) {
        crc = _mm_crc32_u8(crc, *(const u8*)buf);
    }
    return ~crc; // post-invert -- distinguish scaled multiples
}

#else

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32")))
#endif
u32 CRC32C_NAME(crc32c)(u32 crc, const char* buf, size_t length) {
    crc = ~crc; // preset to -1 (distinguish leading zeros)
    for (; length >= 8; length -= 8, buf += 8) {
        crc = _mm_crc32_u64(crc, *(const uint64_t*)buf);
    }
    for (; length; --length, ++buf) {
        crc = _mm_crc32_u8(crc, *(const u8*)buf);
    }
    return ~crc; // post-invert -- distinguish scaled multiples
}

#endif
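To make the block carving above concrete, here is how the sizes work out for a common case; illustrative arithmetic only, derived from the code just shown:

// Worked example for a 16-byte-aligned 4 KiB buffer fed to the pclmul
// crc32c() above:
//   n_blocks      = (4096 - 16) / 136 = 30
//   kernel_length = 30 * 136 + 16    = 4096   (the whole buffer)
//   stride        = 30 * 24 + 8      = 728
// crc32c_4k_fusion then reads chunk A = [0, 728), B = [728, 1456) and
// C = [1456, 2176) via _mm_crc32_u64 (three independent streams to hide
// the crc32 instruction's latency), and chunk D = [2176, 4096) via
// pclmulqdq folds, all interleaved in a single pass.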
#define CASTAGNOLI_POLY 0x82F63B78u

#if CRC32C_USE_PCLMUL

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("pclmul")))
#endif
static u64 crc32c_mul(u32 a, u32 b) {
    uint64_t c = _mm_cvtsi128_si64(
        _mm_clmulepi64_si128(_mm_set_epi32(0, 0, 0, a), _mm_set_epi32(0, 0, 0, b), 0)
    );
    return c << 1; // unused bit
}

#else

#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
u64 crc32c_mul(u32 a, u32 b32) {
    u64 b = (u64)b32 << 32;
    u64 c = 0;
    int i;
    for (i = 0; i < 32; i++, a <<= 1, b >>= 1) {
        c ^= (a & (1u<<31)) ? b : 0;
    }
    return c;
}

#endif

// `a mod CASTAGNOLI_POLY`
#ifdef __clang__
__attribute__((no_sanitize("integer")))
#endif
#ifdef __KERNEL__
__attribute__((target("crc32,pclmul")))
#endif
static u32 crc32c_mod_p(u64 a) {
    return _mm_crc32_u32(0, a) ^ (a >> 32);
}

static u32 crc32c_mul_mod_p(u32 a, u32 b) {
    return crc32c_mod_p(crc32c_mul(a, b));
}
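crc32c_mul produces the 64-bit carry-less product (the << 1 restores the bit a 32x32 product leaves unused) and crc32c_mod_p folds it back down to 32 bits, exactly the "use CRC instructions to compute remainders" point of the commit message. A tiny hypothetical self-test of the group law, assuming these static definitions and crc32c_x_pow_n (defined just below) are in scope in the same translation unit:

// x^(8*5) * x^(8*3) == x^(8*8): multiplying by crc32c_x_pow_n(n) is
// exactly "extend by n zero bytes", so exponents must add up.
static void check_group_law(void) {
    u32 lhs = crc32c_mul_mod_p(crc32c_x_pow_n(5), crc32c_x_pow_n(3));
    u32 rhs = crc32c_x_pow_n(8);
    if (lhs != rhs) __builtin_trap();
}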
// x^2^3, x^2^4, ..., x^2^63
static u32 CRC_POWER_TABLE[64] = {
    0x00800000, 0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564, 0xbf455269, 0xe2ea32dc,
    0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc,
    0xd289cabe, 0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7, 0x88e56f72,
    0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6,
    0xf946610b, 0x3c204f8f, 0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
};

static u32 crc32c_x_pow_n(size_t n) {
    u32 x_pow_n = 1u << 31;
    int k;
    for (k = 0; n; k++, n >>= 1) {
        if (n&1) {
            x_pow_n = crc32c_mul_mod_p(x_pow_n, CRC_POWER_TABLE[k]);
        }
    }
    return x_pow_n;
}

// x^-(2^3), x^-(2^4), ..., x^-(2^63)
static u32 CRC_INVERSE_POWER_TABLE[64] = {
    0xfde39562, 0xbef0965e, 0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96, 0xcedac2cc,
    0x70118575, 0x0e004a40, 0xa7864c8b, 0xbc7be916, 0x10ba2894, 0x6077197b, 0x98448e4e, 0x8baf845d,
    0xe93e07fc, 0xf58027d7, 0x5e2b422d, 0x9db2851c, 0x9270ed25, 0x5984e7b3, 0x7af026f1, 0xe0f4116b,
    0xace8a6b0, 0x9e09f006, 0x6a60ea71, 0x4fd04875, 0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562,
    0xbef0965e, 0xd610d67e, 0xe67cce65, 0xa268b79e, 0x134fb088, 0x32998d96, 0xcedac2cc, 0x70118575,
    0x0e004a40, 0xa7864c8b, 0xbc7be916, 0x10ba2894, 0x6077197b, 0x98448e4e, 0x8baf845d, 0xe93e07fc,
    0xf58027d7, 0x5e2b422d, 0x9db2851c, 0x9270ed25, 0x5984e7b3, 0x7af026f1, 0xe0f4116b, 0xace8a6b0,
    0x9e09f006, 0x6a60ea71, 0x4fd04875, 0x05ec76f1, 0x0bd8ede2, 0x2f63b788, 0xfde39562, 0xbef0965e,
};

static u32 crc32c_x_pow_neg_n(size_t n) {
    u32 x_pow_n = 1u << 31;
    int k;
    for (k = 0; n; k++, n >>= 1) {
        if (n&1) {
            x_pow_n = crc32c_mul_mod_p(x_pow_n, CRC_INVERSE_POWER_TABLE[k]);
        }
    }
    return x_pow_n;
}

u32 CRC32C_NAME(crc32c_zero_extend)(u32 crc, ssize_t zeros) {
    if (zeros > 0) {
        return ~crc32c_mul_mod_p(~crc, crc32c_x_pow_n(zeros));
    } else {
        return ~crc32c_mul_mod_p(~crc, crc32c_x_pow_neg_n(-zeros));
    }
}

u32 CRC32C_NAME(crc32c_append)(u32 crc_a, u32 crc_b, size_t len_b) {
    return crc32c_mul_mod_p(crc_a, crc32c_x_pow_n(len_b)) ^ crc_b;
}

u32 CRC32C_NAME(crc32c_xor)(u32 crc_a, u32 crc_b, size_t len) {
    return crc_a ^ crc_b ^ ~crc32c_mul_mod_p(~(u32)0, crc32c_x_pow_n(len));
}
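The three helpers above are instances of the same linearity argument: appending len_b bytes of B after A gives remainder(A||B) = remainder(A) * x^(8*len_b) + remainder(B), and the pre/post inversion terms work out so that the same formula applies to the inverted CRCs (this is what tests.cpp's "crc1 == crc32c_append(0, crc1, len)" assertion demonstrates). A usage sketch with hypothetical names, using the symbols the plain (non-_pclmul) build exports, mirroring how ShardDBImpl combines per-cell CRCs further down:

// Hypothetical helper: CRC of A||B||0...0 from the CRCs of A and B alone.
u32 concat_with_padding(u32 crc_a, u32 crc_b, size_t len_b, ssize_t zeros) {
    u32 crc = crc32c_append(crc_a, crc_b, len_b); // crc_a * x^(8*len_b) ^ crc_b
    return crc32c_zero_extend(crc, zeros);        // negative zeros un-pads
}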
cpp/crc32c/crc32c_pclmul.c (new file, 16 lines)
@@ -0,0 +1,16 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>

typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;

#define CRC32C_USE_PCLMUL 1
#define CRC32C_NAME(a) a##_pclmul
#include "crc32c_body.c"
@@ -1,28 +0,0 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#define ISCSI_POLY 0x82F63B78u

// Return a(x) multiplied by b(x) modulo ISCSI_POLY. For speed, this requires
// that a(x) not be zero.
static u32 crc32c_mult_mod_p(u32 a, u32 b) {
    // m goes from x^0 to x^31
    u32 m = 1u << 31;
    u32 p = 0;
    for (;;) {
        // If a(x) contains x^n, add b(x)*x^n
        if (a & m) {
            p ^= b;
            // Exit when there are no higher bits in a(x)
            if ((a & (m - 1)) == 0) {
                break;
            }
        }
        // Go from x^n to x^(n+1)
        m >>= 1;
        // Go from b(x)*x^n to b(x)*x^(n+1)
        b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1;
    }
    return p;
}
@@ -1,32 +0,0 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#pragma once

#include <stdint.h>

constexpr uint32_t ISCSI_POLY = 0x82F63B78u;

// Return a(x) multiplied by b(x) modulo ISCSI_POLY. For speed, this requires
// that a(x) not be zero.
static uint32_t crc32c_mult_mod_p(uint32_t a, uint32_t b) {
    // m goes from x^0 to x^31
    uint32_t m = 1u << 31;
    uint32_t p = 0;
    for (;;) {
        // If a(x) contains x^n, add b(x)*x^n
        if (a & m) {
            p ^= b;
            // Exit when there are no higher bits in a(x)
            if ((a & (m - 1)) == 0) {
                break;
            }
        }
        // Go from x^n to x^(n+1)
        m >>= 1;
        // Go from b(x)*x^n to b(x)*x^(n+1)
        b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1;
    }
    return p;
}
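Both deleted headers carry the same bitwise multiply. One detail worth spelling out (an observation, not repo text): the code uses the reflected convention throughout, with bit 31 holding the x^0 coefficient, which is why "1" is 1u << 31 and why multiplying b(x) by x is a right shift with the polynomial folded back in:

// b stored reflected: bit 31 = coefficient of x^0, bit 0 = x^31.
// Multiplying by x pushes every coefficient one degree up, i.e. every
// bit one position down; a set bit 0 (the x^31 term) overflows into
// x^32, which is reduced by XORing in the (reflected) polynomial.
b = (b & 1) ? ((b >> 1) ^ ISCSI_POLY) : b >> 1; // b(x) *= x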
@@ -1,45 +0,0 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <stdio.h>
#include <stdlib.h>

#include "iscsi.hpp"

int main() {
    uint32_t crc_power_table[31];
    printf("static uint32_t CRC_POWER_TABLE[31] = {");
    uint32_t p = 1u << 30; // start with x^1
    for (int i = 0; i < 31; i++) {
        if (i % 5 == 0) {
            printf("\n ");
        }
        crc_power_table[i] = p;
        printf("0x%08x, ", p);
        p = crc32c_mult_mod_p(p, p);
    }
    printf("\n};\n\n");

    // find x^-1
    for (uint64_t x = 1; x < (1ull<<32); x++) {
        if (crc32c_mult_mod_p(x, crc_power_table[0]) == (1u<<31)) {
            p = (uint32_t)x;
            break;
        }
    }

    printf("static uint32_t CRC_INVERSE_POWER_TABLE[31] = {");
    for (int i = 0; i < 31; i++) {
        if (i % 5 == 0) {
            printf("\n ");
        }
        if (crc32c_mult_mod_p(p, crc_power_table[i]) != (1u<<31)) {
            fprintf(stderr, "not an inverse!\n");
            exit(1);
        }
        printf("0x%08x, ", p);
        p = crc32c_mult_mod_p(p, p);
    }
    printf("\n};\n\n");
}
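The deleted generator built the 31-entry tables by repeated squaring, seeded with x for the forward table and a brute-forced x^-1 for the inverse one; with the 64-entry tables now hard-coded in crc32c_body.c it is no longer needed. The core recurrence, as a sketch:

// Repeated squaring: if p == x^(2^i) mod P then p*p == x^(2^(i+1)) mod P,
// and the same recurrence generates the inverse powers from x^-1.
static void fill_table(uint32_t table[31], uint32_t seed) {
    uint32_t p = seed;
    for (int i = 0; i < 31; i++) {
        table[i] = p;
        p = crc32c_mult_mod_p(p, p); // x^(2^i) -> x^(2^(i+1))
    }
}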
@@ -17,10 +17,17 @@
     } \
 } while (false)

+#define BOTH(f, ...) ({ \
+    auto x1 = f(__VA_ARGS__); \
+    auto x2 = f##_pclmul(__VA_ARGS__); \
+    ASSERT(x1 == x2); \
+    x1; \
+})
+
 int main() {
     uint32_t expectedCrc = 0x6c0ec068u;
     const char* str = "bazzer\n";
-    ASSERT(expectedCrc == crc32c(0, str, strlen(str)));
+    ASSERT(expectedCrc == BOTH(crc32c, 0, str, strlen(str)));

     RandomGenerator rand(0);

@@ -33,29 +40,29 @@ int main() {
     // Test append
     for (int i = 0; i < 1000; i++) {
         auto s1 = randString(1 + rand.generate64()%100);
-        uint32_t crc1 = crc32c(0, s1.data(), s1.size());
-        ASSERT(crc1 == crc32c_append(0, crc1, s1.size()));
+        uint32_t crc1 = BOTH(crc32c, 0, s1.data(), s1.size());
+        ASSERT(crc1 == BOTH(crc32c_append, 0, crc1, s1.size()));
         auto s2 = randString(1 + rand.generate64()%100);
-        uint32_t crc2 = crc32c(0, s2.data(), s2.size());
+        uint32_t crc2 = BOTH(crc32c, 0, s2.data(), s2.size());
         std::vector<char> s = s1;
         s.insert(s.end(), s2.begin(), s2.end());
-        uint32_t crc = crc32c(0, s.data(), s.size());
-        ASSERT(crc == crc32c_append(crc1, crc2, s2.size()));
+        uint32_t crc = BOTH(crc32c, 0, s.data(), s.size());
+        ASSERT(crc == BOTH(crc32c_append, crc1, crc2, s2.size()));
     }

     // Test XOR
     for (int i = 0; i < 1000; i++) {
         size_t l = 1 + rand.generate64()%100;
         auto s1 = randString(l);
-        uint32_t crc1 = crc32c(0, s1.data(), s1.size());
+        uint32_t crc1 = BOTH(crc32c, 0, s1.data(), s1.size());
         auto s2 = randString(l);
-        uint32_t crc2 = crc32c(0, s2.data(), s2.size());
+        uint32_t crc2 = BOTH(crc32c, 0, s2.data(), s2.size());
         std::vector<char> s = s1;
         for (int i = 0; i < l; i++) {
             s[i] ^= s2[i];
         }
-        uint32_t crc = crc32c(0, s.data(), s.size());
-        ASSERT(crc == crc32c_xor(crc1, crc2, l));
+        uint32_t crc = BOTH(crc32c, 0, s.data(), s.size());
+        ASSERT(crc == BOTH(crc32c_xor, crc1, crc2, l));
     }

     // Test zero extend
@@ -65,10 +72,10 @@ int main() {
         auto s = randString(l);
         std::vector<char> szeros(l + lzeros, 0);
         memcpy(&szeros[0], s.data(), l);
-        uint32_t crc = crc32c(0, s.data(), l);
+        uint32_t crc = BOTH(crc32c, 0, s.data(), l);
         ASSERT(
-            crc32c_zero_extend(crc, lzeros) ==
-            crc32c(0, szeros.data(), szeros.size())
+            BOTH(crc32c_zero_extend, crc, lzeros) ==
+            BOTH(crc32c, 0, szeros.data(), szeros.size())
         );
     }

@@ -79,10 +86,10 @@ int main() {
         auto s = randString(l);
         std::vector<char> szeros(l + lzeros, 0);
         memcpy(&szeros[0], s.data(), l);
-        uint32_t crc = crc32c(0, szeros.data(), szeros.size());
+        uint32_t crc = BOTH(crc32c, 0, szeros.data(), szeros.size());
         ASSERT(
-            crc32c_zero_extend(crc, -lzeros) ==
-            crc32c(0, s.data(), s.size())
+            BOTH(crc32c_zero_extend, crc, -lzeros) ==
+            BOTH(crc32c, 0, s.data(), s.size())
         );
     }

@@ -90,10 +97,12 @@ int main() {
         size_t l = 1 + rand.generate64()%100;
         auto s = randString(l);
         ASSERT(
-            crc32c(0, s.data(), s.size()) ==
-            crc32c(crc32c(0, s.data(), s.size()), "", 0)
+            BOTH(crc32c, 0, s.data(), s.size()) ==
+            BOTH(crc32c, crc32c(0, s.data(), s.size()), "", 0)
         );
     }

     printf("All tests pass.\n");

     return 0;
 }
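BOTH relies on the GCC/Clang statement-expression extension, so every call site exercises the scalar and pclmul implementations, asserts they agree, and still yields a value. Roughly, one call expands to:

// BOTH(crc32c, 0, s.data(), s.size()) expands, approximately, to:
({
    auto x1 = crc32c(0, s.data(), s.size());
    auto x2 = crc32c_pclmul(0, s.data(), s.size());
    ASSERT(x1 == x2);
    x1;
})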
@@ -1330,7 +1330,7 @@ struct ShardDBImpl {
         // mirroring blocks should all be the same
         for (int s = 0; s < req.stripes; s++) {
             uint32_t stripeCrc = req.crcs.els[s*req.parity.blocks()].u32;
-            spanCrc = crc32c_append(spanCrc, stripeCrc, req.cellSize);
+            spanCrc = crc32c_append_pclmul(spanCrc, stripeCrc, req.cellSize);
             for (int p = 0; p < req.parity.parityBlocks(); p++) {
                 if (req.crcs.els[s*req.parity.blocks() + 1+p].u32 != stripeCrc) {
                     LOG_DEBUG(_env, "mismatched CRC for mirrored block, expected %s, got %s", Crc(stripeCrc), req.crcs.els[s*req.parity.blocks() + 1+p]);
@@ -1347,8 +1347,8 @@ struct ShardDBImpl {
             uint32_t parity0Crc;
             for (int d = 0; d < req.parity.dataBlocks(); d++) {
                 uint32_t cellCrc = req.crcs.els[s*req.parity.blocks() + d].u32;
-                spanCrc = crc32c_append(spanCrc, cellCrc, req.cellSize);
-                parity0Crc = d == 0 ? cellCrc : crc32c_xor(parity0Crc, cellCrc, req.cellSize);
+                spanCrc = crc32c_append_pclmul(spanCrc, cellCrc, req.cellSize);
+                parity0Crc = d == 0 ? cellCrc : crc32c_xor_pclmul(parity0Crc, cellCrc, req.cellSize);
             }
             if (parity0Crc != req.crcs.els[s*req.parity.blocks() + req.parity.dataBlocks()].u32) {
                 LOG_DEBUG(_env, "bad parity 0 CRC, expected %s, got %s", Crc(parity0Crc), req.crcs.els[s*req.parity.blocks() + req.parity.dataBlocks()]);
@@ -1356,7 +1356,7 @@ struct ShardDBImpl {
             }
         }
     }
-    spanCrc = crc32c_zero_extend(spanCrc, (ssize_t)req.size - (ssize_t)(req.cellSize * req.stripes * req.parity.dataBlocks()));
+    spanCrc = crc32c_zero_extend_pclmul(spanCrc, (ssize_t)req.size - (ssize_t)(req.cellSize * req.stripes * req.parity.dataBlocks()));
     if (spanCrc != req.crc) {
         LOG_DEBUG(_env, "bad span CRC, expected %s, got %s", Crc(spanCrc), req.crc);
         return false;
@@ -1412,8 +1412,8 @@ struct ShardDBImpl {
         return TernError::BAD_SPAN_BODY;
     }

-    uint32_t expectedCrc = crc32c(0, req.body.data(), req.body.size());
-    expectedCrc = crc32c_zero_extend(expectedCrc, req.size - req.body.size());
+    uint32_t expectedCrc = crc32c_pclmul(0, req.body.data(), req.body.size());
+    expectedCrc = crc32c_zero_extend_pclmul(expectedCrc, req.size - req.body.size());
     if (expectedCrc != req.crc.u32) {
        LOG_DEBUG(_env, "inline span expected CRC %s, got %s", Crc(expectedCrc), req.crc);
        return TernError::BAD_SPAN_BODY;
@@ -1479,7 +1479,7 @@ struct ShardDBImpl {
     for (int s = 0; s < req.stripes; s++) {
         uint32_t stripeCrc = 0;
         for (int d = 0; d < req.parity.dataBlocks(); d++) {
-            stripeCrc = crc32c_append(stripeCrc, req.crcs.els[s*req.parity.blocks() + d].u32, req.cellSize);
+            stripeCrc = crc32c_append_pclmul(stripeCrc, req.crcs.els[s*req.parity.blocks() + d].u32, req.cellSize);
         }
         entry.bodyStripes.els.emplace_back(stripeCrc);
     }
@@ -1608,7 +1608,7 @@ struct ShardDBImpl {
         block.blockServiceId = pickedBlockServices[i];
         uint32_t blockCrc = 0;
         for (int s = 0; s < req.stripes; s++) {
-            blockCrc = crc32c_append(blockCrc, req.crcs.els[s*req.parity.blocks() + i].u32, req.cellSize);
+            blockCrc = crc32c_append_pclmul(blockCrc, req.crcs.els[s*req.parity.blocks() + i].u32, req.cellSize);
         }
         block.crc = blockCrc;
     }
@@ -16,23 +16,23 @@ func Sum(crc uint32, buf []byte) uint32 {
     if len(buf) == 0 { // otherwise the buf[0] won't work below
         return crc
     }
-    return (uint32)(C.crc32c(C.uint(crc), (*C.char)(unsafe.Pointer(&buf[0])), C.ulong(len(buf))))
+    return (uint32)(C.crc32c_pclmul(C.uint(crc), (*C.char)(unsafe.Pointer(&buf[0])), C.ulong(len(buf))))
 }

 func Xor(crc1 uint32, crc2 uint32, len int) uint32 {
     if len < 0 {
         panic(fmt.Errorf("negative len %v", len))
     }
-    return (uint32)(C.crc32c_xor(C.uint(crc1), C.uint(crc2), C.ulong(len)))
+    return (uint32)(C.crc32c_xor_pclmul(C.uint(crc1), C.uint(crc2), C.ulong(len)))
 }

 func Append(crc1 uint32, crc2 uint32, len2 int) uint32 {
     if len2 < 0 {
         panic(fmt.Errorf("negative len %v", len2))
     }
-    return (uint32)(C.crc32c_append(C.uint(crc1), C.uint(crc2), C.ulong(len2)))
+    return (uint32)(C.crc32c_append_pclmul(C.uint(crc1), C.uint(crc2), C.ulong(len2)))
 }

 func ZeroExtend(crc uint32, zeros int) uint32 {
-    return (uint32)(C.crc32c_zero_extend(C.uint(crc), C.long(zeros)))
+    return (uint32)(C.crc32c_zero_extend_pclmul(C.uint(crc), C.long(zeros)))
 }
@@ -27,6 +27,7 @@ ternfs-client-objs += \
     file.o \
     rs.o \
     crc.o \
+    crc_fpu.o \
     span.o \
     bincode.o \
     revision.o \

@@ -41,7 +42,7 @@ export CF = -Wbitwise -Wcontext -Wcast_truncate -Wsparse-all -Wno-shadow -Wnopoi
 revision.c:
     printf "#include \"sysfs.h\"\nconst char* ternfs_revision = \"$(shell git describe --always --dirty)\";\n" > revision.c

-EXTRA_FILES := ../cpp/crc32c/crc32c.c ../cpp/rs/gf_tables.c ../cpp/crc32c/iscsi.h ../cpp/rs/rs_core.c
+EXTRA_FILES := ../cpp/crc32c/crc32c_body.c ../cpp/rs/gf_tables.c ../cpp/rs/rs_core.c

 extra-files: $(EXTRA_FILES)
     $(foreach file,$(EXTRA_FILES),cp $(file) .;)
kmod/crc.c (25 lines)
@@ -9,26 +9,9 @@
 #include "intrshims.h"

 #include "log.h"
-#include "crc32c.c"

-u32 ternfs_crc32c(u32 crc, const char* buf, size_t len) {
-    return crc32c(crc, buf, len);
-}
+#define CRC32C_USE_PCLMUL 0
+#define CRC32C_NAME(a) ternfs_##a
+#include "crc32c_body.c"

-u32 ternfs_crc32c_simple(u32 crc, const char* buf, size_t len) {
-    return crc32c_simple(crc, buf, len);
-}
-
-u32 ternfs_crc32c_xor(u32 crc1, u32 crc2, size_t len) {
-    return crc32c_xor(crc1, crc2, len);
-}
-
-u32 ternfs_crc32c_append(u32 crc1, u32 crc2, size_t len2) {
-    return crc32c_append(crc1, crc2, len2);
-}
-
-u32 ternfs_crc32c_zero_extend(u32 crc, ssize_t zeros) {
-    return crc32c_zero_extend(crc, zeros);
-}

 #endif
kmod/crc.h (17 lines)
@@ -2,23 +2,22 @@
 //
 // SPDX-License-Identifier: GPL-2.0-or-later

+// Functions postfixed with _fpu must be wrapped in
+// kernel_fpu_begin/kernel_fpu_end. They're also generally
+// faster.
 #ifndef _TERNFS_CRC_H
 #define _TERNFS_CRC_H

 #include <linux/kernel.h>

-// You _must_ wrap this with kernel_fpu_begin/kernel_fpu_end!
-// It's not required for the other functions.
 u32 ternfs_crc32c(u32 crc, const char* buf, size_t len);

-// The only difference with `ternfs_crc32c` is that kernel_fpu_begin/kernel_fpu_end
-// are not required for this one.
-u32 ternfs_crc32c_simple(u32 crc, const char* buf, size_t len);
-
 u32 ternfs_crc32c_xor(u32 crc1, u32 crc2, size_t len);

 u32 ternfs_crc32c_append(u32 crc1, u32 crc2, size_t len2);

 u32 ternfs_crc32c_zero_extend(u32 crc, ssize_t zeros);

+u32 ternfs_crc32c_fpu(u32 crc, const char* buf, size_t len);
+u32 ternfs_crc32c_xor_fpu(u32 crc1, u32 crc2, size_t len);
+u32 ternfs_crc32c_append_fpu(u32 crc1, u32 crc2, size_t len2);
+u32 ternfs_crc32c_zero_extend_fpu(u32 crc, ssize_t zeros);
+
 #endif
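The renaming makes the contract explicit at every call site: only the _fpu variants touch XMM state, so only they need the FPU guard. The pattern, as used in the write_blocks and read-path hunks below:

// Kernel-side usage sketch: SIMD registers may only be used between
// kernel_fpu_begin()/kernel_fpu_end().
kernel_fpu_begin();
crc = ternfs_crc32c_fpu(crc, buf, len);
kernel_fpu_end();

// The plain variants only use the crc32 instruction (general-purpose
// registers), so they are safe anywhere, e.g. in ternfs_shard_add_inline_span:
crc = ternfs_crc32c(0, data, len);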
kmod/crc_fpu.c (new file, 17 lines)
@@ -0,0 +1,17 @@
// Copyright 2025 XTX Markets Technologies Limited
//
// SPDX-License-Identifier: GPL-2.0-or-later

#include "crc.h"

#ifndef __CHECKER__ // sparse doesn't like this code at all.

#include "intrshims.h"

#include "log.h"

#define CRC32C_USE_PCLMUL 1
#define CRC32C_NAME(a) ternfs_##a##_fpu
#include "crc32c_body.c"

#endif
@@ -649,7 +649,7 @@ static int write_blocks(struct ternfs_transient_span* span) {
         err = ternfs_compute_parity(span->parity, PAGE_SIZE, (const char**)&pages_bufs[0], &pages_bufs[D]);
         // compute CRCs
         for (i = 0; i < B; i++) {
-            span->cell_crcs[s*B + i] = ternfs_crc32c(span->cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
+            span->cell_crcs[s*B + i] = ternfs_crc32c_fpu(span->cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
         }
         for (i = 0; i < B; i++) {
             kunmap_atomic(pages_bufs[i]);
@@ -662,10 +662,10 @@ static int write_blocks(struct ternfs_transient_span* span) {
         int s;
         for (s = 0; s < S; s++) {
             for (i = 0; i < D; i++) {
-                span->span_crc = ternfs_crc32c_append(span->span_crc, span->cell_crcs[s*B + i], cell_size);
+                span->span_crc = ternfs_crc32c_append_fpu(span->span_crc, span->cell_crcs[s*B + i], cell_size);
             }
         }
-        span->span_crc = ternfs_crc32c_zero_extend(span->span_crc, (int)span->written - (int)(span->block_size*D));
+        span->span_crc = ternfs_crc32c_zero_extend_fpu(span->span_crc, (int)span->written - (int)(span->block_size*D));
         kernel_fpu_end();
     }
     // Start the first attempt
@@ -21,6 +21,7 @@ typedef short __v8hi __attribute__((vector_size(16)));
 typedef int __v4si __attribute__((vector_size(16)));
 typedef long long __v2di __attribute__((vector_size(16)));

+#define _mm_set_epi32(i0, i1, i2, i3) ((__m128i)(__v4si){i3,i2,i1,i0})
 #define _mm_setr_epi32(i0, i1, i2, i3) ((__m128i)(__v4si){i0,i1,i2,i3})
 #define _mm_clmulepi64_si128(X, Y, I) ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (char)(I)))
 #define _mm_xor_si128(a, b) ((a) ^ (b))
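The _mm_set_epi32 shim is needed because crc32c_body.c's crc32c_mul builds its clmul operands with it; the kernel build cannot include immintrin.h, so the intrinsics are spelled out as vector-extension macros. Note the argument order relative to the existing _mm_setr_epi32 shim:

// _mm_set_epi32 takes lanes high-to-low, _mm_setr_epi32 low-to-high;
// these two produce the same vector.
__m128i a = _mm_set_epi32(3, 2, 1, 0);
__m128i b = _mm_setr_epi32(0, 1, 2, 3);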
@@ -676,7 +676,7 @@ int ternfs_shard_create_file(struct ternfs_fs_info* info, u8 shid, int itype, co

 int ternfs_shard_add_inline_span(struct ternfs_fs_info* info, u64 file, u64 cookie, u64 offset, u32 size, const char* data, u8 len) {
     BUG_ON(size < len); // this never makes sense
-    u32 crc = ternfs_crc32c_simple(0, data, len);
+    u32 crc = ternfs_crc32c(0, data, len);
     crc = ternfs_crc32c_zero_extend(crc, size - len);

     struct sk_buff* skb;
@@ -476,7 +476,7 @@ retry:
     list_for_each_entry(page, &st->blocks_pages[i], lru) {
         char* page_buf = kmap_atomic(page);
         kernel_fpu_begin();
-        u32 crc = ternfs_crc32c(0, page_buf, PAGE_SIZE);
+        u32 crc = ternfs_crc32c_fpu(0, page_buf, PAGE_SIZE);
        kernel_fpu_end();
        kunmap_atomic(page_buf);
        if (crc != (u32)page->private) {