#include #include #include #include #include #include #include #include #include "Bincode.hpp" #include "Crypto.hpp" #include "Env.hpp" #include "ShardDB.hpp" #include "ShardDBData.hpp" #include "Assert.hpp" #include "Exception.hpp" #include "Msgs.hpp" #include "MsgsGen.hpp" #include "Time.hpp" #include "RocksDBUtils.hpp" #include "ShardDBData.hpp" #include "AssertiveLock.hpp" #include "crc32c.hpp" #include "splitmix64.hpp" // TODO maybe revisit all those ALWAYS_ASSERTs // ## High level design // // Right now, we're not implementing distributed consensus, but we want to prepare // for this as best we can, since we need it for HA. // // This class implements the state storage, assuming that log entries will be persisted // separatedly (if at all). The state storage is also used to prepare the log entries -- // e.g. there's a tiny bit of state that we read/write to without regards for the // distributed consensus. // // Specifically, we allocate fresh ids, and we read info about block services, when // preparing log entries. // // One question is how we write transactions concurrently, given that we rely on // applying first one log entry and then the next in that order. We could be smart // and pool independent log entries, and then write them concurrently. My guess is // that it's probably not worth doing this, at least since the beginning, and that // it's better to just serialize the transactions manually with a mutex here in C++. // // ## RocksDB schema // // Overview of the column families we use. Assumptions from the document: // // * 1e12 files // * 10EiB storage // * 100000 concurrent clients // * 20000 storage nodes // // * Files: // - Lots of point queries, we actually never do iteration of any kind here. // - InodeId (8 bytes): FileBody (16 bytes) // - Total size: 1e12 files / 256 shards * 24 bytes = ~100GiB // // * Spans: // - Lots of range queries, mostly with SeekForPrev // - Both files and transient files point to this. This is part of why it makes // sense to have a different column family: otherwise we couldn't easily // traverse the transient files with them without stepping over spans. // - { id: InodeId, byte_offset: uint64_t } (16 bytes): SpanBody // * SpanBody is 22 bytes of header, plus 0-255 inline body, or ~(10+4)*12 = 168 bytes for blocks // * Say an average of 200 bytes for key + body, being a bit conservative // * Let's also say that the max block size is 100MiB // * Total size: 10EiB / 100MiB / 256 shards * 200 bytes ~ 100GiB // * However the above is quite optimistic, assuming no fragmentation -- the average file will probably // be something like 10MiB. So the total dataset for spans will probably be around 1TiB in // reality. // // * TransientFiles: // - Should use bloom filter here, lots of point queries // - Unordered iteration for garbage collection // - InodeId (8 bytes): TransientFileBody (25-281 bytes) // - The size for this will be neglegible -- say 1% of Files. // // * Directories: // - We store the directory body // - Unordered traversal for garbage collection // - InodeId (8 bytes): DirectoryBody (18-273 bytes) // - Unclear how many directories we have, but it's probably going to have a few order of magnitude less // than files, e.g. 1e8 directories / 256 shards * 200 bytes = ~80MiB, tiny dataset. // // * Edges: // - Lots of range queries // - Might want to exploit the fact that for current edges we always seek by prefix (without the name) // - Two kind of records: // * { dir: InodeId, current: false, nameHash: nameHash, name: bytes, creationTime: uint64_t }: SnapshotEdgeBody // * { dir: InodeId, current: true, nameHash: nameHash, name: bytes }: CurrentEdgeBody // - The sizes below are for snapshot, for current the creation time moves to the body // - They key size is 25-300 bytes, say 80 bytes on average with a 50 bytes filename // - The value size is 10 bytes or so // - Make it 100bytes total for key and value // - The number of files puts a lower bound on the number of edges, and we don't expect the number to be much // higher, since we won't have many snaphsot edges, and we have no hardlinks. Also there aren't that many // directories. // - So total size is something like 1e12 / 256 * 250 bytes = ~400GiB // - Similar size as spans -- I think it makes a ton of sense to have them in a different column family // from directories, given the different order of magnitude when it comes to size. // // * Blocks: // - Goes from block service to file ids // - No values, only { blockServiceId: uint64_t, fileId: uint64_t }, "set like" // - Assuming RS(10,4), roughly 1e12/256 * 8 bytes = 31GiB // - Not a big deal. // // We'll also store some small metadata in the default column family: // // - SHARD_INFO_KEY ShardInfoBody // - LAST_APPLIED_LOG_ENTRY_KEY uint64_t // - NEXT_FILE_ID_KEY InodeId // - NEXT_SYMLINK_ID_KEY InodeId // - NEXT_BLOCK_ID_KEY InodeId // - CURRENT_BLOCK_SERVICES_KEY CurrentBlockServicesBody // - BLOCK_SERVICE_KEY {blockServiceId: uint64_t} BlockServiceBody // // Note that the block services might be out of date, especially those not referred to by // CURRENT_BLOCK_SERVICES_KEY. // // Total data for the shard will be 1-2TiB. The big ones are `Spans` and `Edges`, which are also // the ones requiring range queries. // // Does this sit well with RocksDB? Their own benchmarks use 900M keys, 20 byte key, 400 byte value, // so 900e6 * (20+400)byte = ~400GiB. They're not very far off, so it seems reasonable. // // ## RocksDB test (state) // // We want to test the two heavy cases: spans and edges. // // For spans, we have 9 bytes keys, and 200 bytes values. 10EiB / 10MiB / 256 = ~5 billion keys. Total data, // around 1TiB. // // I test with // // $ WAL_DIR=benchmarkdb DB_DIR=benchmarkdb KEY_SIZE=9 VALUE_SIZE=200 NUM_KEYS=5000000000 CACHE_SIZE=6442450944 ./benchmark.sh bulkload // // This runs like so: // // /usr/bin/time -f '%e %U %S' -o /tmp/benchmark_bulkload_fillrandom.log.time ./db_bench --benchmarks=fillrandom --use_existing_db=0 --disable_auto_compactions=1 --sync=0 --max_background_jobs=16 --max_write_buffer_number=8 --allow_concurrent_memtable_write=false --level0_file_num_compaction_trigger=10485760 --level0_slowdown_writes_trigger=10485760 --level0_stop_writes_trigger=10485760 --db=benchmarkdb --wal_dir=benchmarkdb --num=5000000000 --key_size=9 --value_size=200 --block_size=8192 --cache_size=6442450944 --cache_numshardbits=6 --compression_max_dict_bytes=0 --compression_ratio=0.5 --compression_type=zstd --bytes_per_sync=8388608 --benchmark_write_rate_limit=0 --write_buffer_size=134217728 --target_file_size_base=134217728 --max_bytes_for_level_base=1073741824 --verify_checksum=1 --delete_obsolete_files_period_micros=62914560 --max_bytes_for_level_multiplier=8 --statistics=0 --stats_per_interval=1 --stats_interval_seconds=60 --report_interval_seconds=5 --histogram=1 --memtablerep=skip_list --bloom_bits=10 --open_files=-1 --subcompactions=1 --compaction_style=0 --num_levels=8 --min_level_to_compress=-1 --level_compaction_dynamic_level_bytes=true --pin_l0_filter_and_index_blocks_in_cache=1 --threads=1 --memtablerep=vector --allow_concurrent_memtable_write=false --disable_wal=1 --seed=1669566020 --report_file=/tmp/benchmark_bulkload_fillrandom.log.r.csv 2>&1 | tee -a /tmp/benchmark_bulkload_fillrandom.log // // This runs in 3:51:09.16, which is 4 times the time it takes to load ~500GiB. The // vast majority of the time is taken by compaction. // // Next I did // // WAL_DIR=benchmarkdb DB_DIR=benchmarkdb KEY_SIZE=9 VALUE_SIZE=200 NUM_KEYS=5000000000 CACHE_SIZE=6442450944 DURATION=5400 ./benchmark.sh readrandom // // TODO fill in results static uint64_t computeHash(HashMode mode, const BincodeBytesRef& bytes) { switch (mode) { case HashMode::XXH3_63: // TODO remove this chopping, pyfuse3 doesn't currently like // full 64-bit hashes. return XXH3_64bits(bytes.data(), bytes.size()) & ~(1ull<<63); default: throw EGGS_EXCEPTION("bad hash mode %s", (int)mode); } } static bool validName(const BincodeBytesRef& name) { if (name.size() == 0) { return false; } if (name == BincodeBytesRef(".") || name == BincodeBytesRef("..")) { return false; } for (int i = 0; i < name.size(); i++) { if (name.data()[i] == (uint8_t)'/' || name.data()[i] == 0) { return false; } } return true; } constexpr uint64_t DEADLINE_INTERVAL = (60ull /*mins*/ * 60 /*secs*/ * 1'000'000'000 /*ns*/); // 1 hr void ShardLogEntry::pack(BincodeBuf& buf) const { time.pack(buf); buf.packScalar((uint16_t)body.kind()); body.pack(buf); } void ShardLogEntry::unpack(BincodeBuf& buf) { time.unpack(buf); ShardLogEntryKind kind = (ShardLogEntryKind)buf.unpackScalar(); body.unpack(buf, kind); } struct BlockServiceCache { AES128Key secretKey; std::array ip; uint16_t port; uint8_t storageClass; }; struct ShardDBImpl { Env _env; ShardId _shid; std::array _secretKey; AES128Key _expandedSecretKey; // TODO it would be good to store basically all of the metadata in memory, // so that we'd just read from it, but this requires a bit of care when writing // since we rollback on error. // These two block services things change infrequently, and are used to add spans // -- keep them in memory. std::vector _currentBlockServices; // the block services we currently want to use to add new spans std::unordered_map _blockServicesCache; rocksdb::DB* _db; rocksdb::ColumnFamilyHandle* _defaultCf; rocksdb::ColumnFamilyHandle* _transientCf; rocksdb::ColumnFamilyHandle* _filesCf; rocksdb::ColumnFamilyHandle* _spansCf; rocksdb::ColumnFamilyHandle* _directoriesCf; rocksdb::ColumnFamilyHandle* _edgesCf; rocksdb::ColumnFamilyHandle* _blockServicesToFilesCf; AssertiveLock _applyLogEntryLock; // ---------------------------------------------------------------- // initialization ShardDBImpl() = delete; ShardDBImpl(Logger& logger, ShardId shid, const std::string& path) : _env(logger, "shard_db"), _shid(shid) { LOG_INFO(_env, "will run shard %s in db dir %s", shid, path); // TODO actually figure out the best strategy for each family, including the default // one. rocksdb::Options options; options.create_if_missing = true; options.create_missing_column_families = true; std::vector familiesDescriptors{ {rocksdb::kDefaultColumnFamilyName, {}}, {"files", {}}, {"spans", {}}, {"transientFiles", {}}, {"directories", {}}, {"edges", {}}, {"blockServicesToFiles", {}}, }; std::vector familiesHandles; auto dbPath = path + "/db"; LOG_INFO(_env, "initializing shard %s RocksDB in %s", _shid, dbPath); ROCKS_DB_CHECKED_MSG( rocksdb::DB::Open(options, dbPath, familiesDescriptors, &familiesHandles, &_db), "could not open RocksDB %s", dbPath ); ALWAYS_ASSERT(familiesDescriptors.size() == familiesHandles.size()); _defaultCf = familiesHandles[0]; _filesCf = familiesHandles[1]; _spansCf = familiesHandles[2]; _transientCf = familiesHandles[3]; _directoriesCf = familiesHandles[4]; _edgesCf = familiesHandles[5]; _blockServicesToFilesCf = familiesHandles[6]; _initDb(); } void close() { LOG_INFO(_env, "destroying column families and closing database"); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_defaultCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_filesCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_spansCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_transientCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_directoriesCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_edgesCf)); ROCKS_DB_CHECKED(_db->DestroyColumnFamilyHandle(_blockServicesToFilesCf)); ROCKS_DB_CHECKED(_db->Close()); } ~ShardDBImpl() { delete _db; } void _initDb() { { bool shardInfoExists; { std::string value; auto status = _db->Get({}, shardMetadataKey(&SHARD_INFO_KEY), &value); if (status.IsNotFound()) { shardInfoExists = false; } else { ROCKS_DB_CHECKED(status); shardInfoExists = true; auto shardInfo = ExternalValue::FromSlice(value); if (shardInfo().shardId() != _shid) { throw EGGS_EXCEPTION("expected shard id %s, but found %s in DB", _shid, shardInfo().shardId()); } _secretKey = shardInfo().secretKey(); } } if (!shardInfoExists) { LOG_INFO(_env, "creating shard info, since it does not exist"); generateSecretKey(_secretKey); StaticValue shardInfo; shardInfo().setShardId(_shid); shardInfo().setSecretKey(_secretKey); ROCKS_DB_CHECKED(_db->Put({}, shardMetadataKey(&SHARD_INFO_KEY), shardInfo.toSlice())); } expandKey(_secretKey, _expandedSecretKey); } const auto keyExists = [this](rocksdb::ColumnFamilyHandle* cf, const rocksdb::Slice& key) -> bool { std::string value; auto status = _db->Get({}, cf, key, &value); if (status.IsNotFound()) { return false; } else { ROCKS_DB_CHECKED(status); return true; } }; if (_shid == ROOT_DIR_INODE_ID.shard()) { auto k = InodeIdKey::Static(ROOT_DIR_INODE_ID); if (!keyExists(_directoriesCf, k.toSlice())) { LOG_INFO(_env, "creating root directory, since it does not exist"); auto info = defaultDirectoryInfo(); StaticValue dirBody; dirBody().setOwnerId(NULL_INODE_ID); dirBody().setMtime({}); dirBody().setHashMode(HashMode::XXH3_63); dirBody().setInfoInherited(false); dirBody().setInfo(info.ref()); auto k = InodeIdKey::Static(ROOT_DIR_INODE_ID); ROCKS_DB_CHECKED(_db->Put({}, _directoriesCf, k.toSlice(), dirBody.toSlice())); } } if (!keyExists(_defaultCf, shardMetadataKey(&NEXT_FILE_ID_KEY))) { LOG_INFO(_env, "initializing next file id"); InodeId nextFileId(InodeType::FILE, ShardId(_shid), 0); auto v = InodeIdValue::Static(nextFileId); ROCKS_DB_CHECKED(_db->Put({}, _defaultCf, shardMetadataKey(&NEXT_FILE_ID_KEY), v.toSlice())); } if (!keyExists(_defaultCf, shardMetadataKey(&NEXT_SYMLINK_ID_KEY))) { LOG_INFO(_env, "initializing next symlink id"); InodeId nextLinkId(InodeType::SYMLINK, ShardId(_shid), 0); auto v = InodeIdValue::Static(nextLinkId); ROCKS_DB_CHECKED(_db->Put({}, _defaultCf, shardMetadataKey(&NEXT_SYMLINK_ID_KEY), v.toSlice())); } if (!keyExists(_defaultCf, shardMetadataKey(&NEXT_BLOCK_ID_KEY))) { LOG_INFO(_env, "initializing next block id"); StaticValue v; v().setU64(_shid.u8); ROCKS_DB_CHECKED(_db->Put({}, _defaultCf, shardMetadataKey(&NEXT_BLOCK_ID_KEY), v.toSlice())); } if (!keyExists(_defaultCf, shardMetadataKey(&LAST_APPLIED_LOG_ENTRY_KEY))) { LOG_INFO(_env, "initializing last applied log entry"); auto v = U64Value::Static(0); ROCKS_DB_CHECKED(_db->Put({}, _defaultCf, shardMetadataKey(&LAST_APPLIED_LOG_ENTRY_KEY), v.toSlice())); } if (!keyExists(_defaultCf, shardMetadataKey(&CURRENT_BLOCK_SERVICES_KEY))) { LOG_INFO(_env, "initializing current block services (as empty)"); OwnedValue v(0); v().setLength(0); ROCKS_DB_CHECKED(_db->Put({}, _defaultCf, shardMetadataKey(&CURRENT_BLOCK_SERVICES_KEY), v.toSlice())); } else { LOG_INFO(_env, "initializing block services cache (from db)"); std::string buf; ROCKS_DB_CHECKED(_db->Get({}, _defaultCf, shardMetadataKey(&CURRENT_BLOCK_SERVICES_KEY), &buf)); ExternalValue v(buf); _currentBlockServices.resize(v().length()); for (int i = 0; i < v().length(); i++) { _currentBlockServices[i] = v().at(i); } { rocksdb::ReadOptions options; static_assert(sizeof(ShardMetadataKey) == sizeof(uint8_t)); auto upperBound = (ShardMetadataKey)((uint8_t)BLOCK_SERVICE_KEY + 1); auto upperBoundSlice = shardMetadataKey(&upperBound); options.iterate_upper_bound = &upperBoundSlice; WrappedIterator it(_db->NewIterator(options, _defaultCf)); StaticValue beginKey; beginKey().setKey(BLOCK_SERVICE_KEY); beginKey().setBlockServiceId(0); for (it->Seek(beginKey.toSlice()); it->Valid(); it->Next()) { auto k = ExternalValue::FromSlice(it->key()); ALWAYS_ASSERT(k().key() == BLOCK_SERVICE_KEY); auto v = ExternalValue::FromSlice(it->value()); auto& cache = _blockServicesCache[k().blockServiceId()]; cache.ip = v().ip(); cache.port = v().port(); expandKey(v().secretKey(), cache.secretKey); cache.storageClass = v().storageClass(); } } } } // ---------------------------------------------------------------- // read-only path EggsError _statFile(const StatFileReq& req, StatFileResp& resp) { std::string fileValue; ExternalValue file; EggsError err = _getFile({}, req.id, fileValue, file); if (err != NO_ERROR) { return err; } resp.mtime = file().mtime(); resp.size = file().fileSize(); return NO_ERROR; } EggsError _statTransientFile(const StatTransientFileReq& req, StatTransientFileResp& resp) { std::string fileValue; { auto k = InodeIdKey::Static(req.id); auto status = _db->Get({}, _transientCf, k.toSlice(), &fileValue); if (status.IsNotFound()) { return EggsError::FILE_NOT_FOUND; } ROCKS_DB_CHECKED(status); } auto body = ExternalValue::FromSlice(fileValue); resp.mtime = body().mtime(); resp.size = body().fileSize(); resp.note = body().note(); return NO_ERROR; } EggsError _statDirectory(const StatDirectoryReq& req, StatDirectoryResp& resp) { std::string dirValue; ExternalValue dir; // allowSnapshot=true, the caller can very easily detect if it's snapshot or not EggsError err = _getDirectory({}, req.id, true, dirValue, dir); if (err != NO_ERROR) { return err; } resp.mtime = dir().mtime(); resp.owner = dir().ownerId(); resp.info = dir().info(); return NO_ERROR; } EggsError _readDir(const ReadDirReq& req, ReadDirResp& resp) { // snapshot probably not strictly needed -- it's for the possible lookup if we // got no results. even if returned a false positive/negative there it probably // wouldn't matter. but it's more pleasant. WrappedSnapshot snapshot(_db); rocksdb::ReadOptions options; options.snapshot = snapshot.snapshot; // we don't want snapshot directories, so check for that early { std::string dirValue; ExternalValue dir; EggsError err = _getDirectory(options, req.dirId, false, dirValue, dir); if (err != NO_ERROR) { return err; } } { auto it = WrappedIterator(_db->NewIterator(options, _edgesCf)); StaticValue beginKey; beginKey().setDirIdWithCurrent(req.dirId, true); // current = true beginKey().setNameHash(req.startHash); beginKey().setName({}); int budget = UDP_MTU - ShardResponseHeader::STATIC_SIZE - ReadDirResp::STATIC_SIZE; for (it->Seek(beginKey.toSlice()); it->Valid(); it->Next()) { auto key = ExternalValue::FromSlice(it->key()); if (key().dirId() != req.dirId || !key().current()) { resp.nextHash = 0; // we've ran out of edges break; } auto edge = ExternalValue::FromSlice(it->value()); CurrentEdge& respEdge = resp.results.els.emplace_back(); respEdge.targetId = edge().targetIdWithLocked().id(); respEdge.nameHash = key().nameHash(); respEdge.name = key().name(); respEdge.creationTime = edge().creationTime(); budget -= respEdge.packedSize(); if (budget < 0) { resp.nextHash = key().nameHash(); // remove the current element, and also, do not have straddling hashes while (!resp.results.els.empty() && resp.results.els.back().nameHash == key().nameHash()) { resp.results.els.pop_back(); } break; } } ROCKS_DB_CHECKED(it->status()); } return NO_ERROR; } EggsError _fullReadDir(const FullReadDirReq& req, FullReadDirResp& resp) { // snapshot probably not strictly needed -- it's for the possible lookup if we // got no results. even if returned a false positive/negative there it probably // wouldn't matter. but it's more pleasant. WrappedSnapshot snapshot(_db); rocksdb::ReadOptions options; options.snapshot = snapshot.snapshot; { auto it = WrappedIterator(_db->NewIterator(options, _edgesCf)); StaticValue beginKey; beginKey().setDirIdWithCurrent(req.dirId, req.cursor.current); beginKey().setNameHash(req.cursor.startHash); beginKey().setName(req.cursor.startName.ref()); if (!req.cursor.current) { beginKey().setCreationTime(req.cursor.startTime); } else { ALWAYS_ASSERT(req.cursor.startTime == 0); // TODO proper error at validation time } int budget = UDP_MTU - ShardResponseHeader::STATIC_SIZE - FullReadDirResp::STATIC_SIZE; for (it->Seek(beginKey.toSlice()); it->Valid(); it->Next()) { auto key = ExternalValue::FromSlice(it->key()); if (key().dirId() != req.dirId) { break; } auto& respEdge = resp.results.els.emplace_back(); if (key().current()) { auto edge = ExternalValue::FromSlice(it->value()); respEdge.current = key().current(); respEdge.targetId = edge().targetIdWithLocked(); respEdge.nameHash = key().nameHash(); respEdge.name = key().name(); respEdge.creationTime = edge().creationTime(); } else { auto edge = ExternalValue::FromSlice(it->value()); respEdge.current = key().current(); respEdge.targetId = edge().targetIdWithOwned(); respEdge.nameHash = key().nameHash(); respEdge.name = key().name(); respEdge.creationTime = key().creationTime(); } budget -= respEdge.packedSize(); if (budget < 0) { int prevCursorSize = FullReadDirCursor::STATIC_SIZE; while (budget < 0) { auto& last = resp.results.els.back(); budget += last.packedSize(); resp.next.current = last.current; resp.next.startHash = last.nameHash; resp.next.startName = last.name; resp.next.startTime = last.current ? 0 : last.creationTime; resp.results.els.pop_back(); budget += prevCursorSize; budget -= resp.next.packedSize(); prevCursorSize = resp.next.packedSize(); } break; } } ROCKS_DB_CHECKED(it->status()); } if (resp.results.els.empty()) { std::string dirValue; ExternalValue dir; // here we want snapshots, which is also why we can do this check later EggsError err = _getDirectory(options, req.dirId, true, dirValue, dir); if (err != NO_ERROR) { return err; } } return NO_ERROR; } EggsError _lookup(const LookupReq& req, LookupResp& resp) { WrappedSnapshot snapshot(_db); rocksdb::ReadOptions options; options.snapshot = snapshot.snapshot; uint64_t nameHash; { EggsError err = _getDirectoryAndHash(options, req.dirId, false, req.name.ref(), nameHash); if (err != NO_ERROR) { return err; } } { StaticValue reqKey; reqKey().setDirIdWithCurrent(req.dirId, true); // current=true reqKey().setNameHash(nameHash); reqKey().setName(req.name.ref()); std::string edgeValue; auto status = _db->Get(options, _edgesCf, reqKey.toSlice(), &edgeValue); if (status.IsNotFound()) { return EggsError::NAME_NOT_FOUND; } ROCKS_DB_CHECKED(status); ExternalValue edge(edgeValue); resp.creationTime = edge().creationTime(); resp.targetId = edge().targetIdWithLocked().id(); } return NO_ERROR; } EggsError _visitTransientFiles(const VisitTransientFilesReq& req, VisitTransientFilesResp& resp) { resp.nextId = NULL_INODE_ID; { WrappedIterator it(_db->NewIterator({}, _transientCf)); auto beginKey = InodeIdKey::Static(req.beginId); int budget = UDP_MTU - ShardResponseHeader::STATIC_SIZE - VisitTransientFilesResp::STATIC_SIZE; for (it->Seek(beginKey.toSlice()); it->Valid(); it->Next()) { auto id = ExternalValue::FromSlice(it->key()); auto file = ExternalValue::FromSlice(it->value()); auto& respFile = resp.files.els.emplace_back(); respFile.id = id().id(); respFile.cookie.data = _calcCookie(respFile.id); respFile.deadlineTime = file().deadline(); budget -= respFile.packedSize(); if (budget <= 0) { resp.nextId = resp.files.els.back().id; resp.files.els.pop_back(); break; } } ROCKS_DB_CHECKED(it->status()); } return NO_ERROR; } template EggsError _visitInodes(rocksdb::ColumnFamilyHandle* cf, const Req& req, Resp& resp) { resp.nextId = NULL_INODE_ID; int budget = UDP_MTU - ShardResponseHeader::STATIC_SIZE - Resp::STATIC_SIZE; int maxIds = (budget/8) + 1; // include next inode { WrappedIterator it(_db->NewIterator({}, cf)); auto beginKey = InodeIdKey::Static(req.beginId); for ( it->Seek(beginKey.toSlice()); it->Valid() && resp.ids.els.size() < maxIds; it->Next() ) { auto id = ExternalValue::FromSlice(it->key()); resp.ids.els.emplace_back(id().id()); } } if (resp.ids.els.size() == maxIds) { resp.nextId = resp.ids.els.back(); resp.ids.els.pop_back(); } return NO_ERROR; } EggsError _visitDirectories(const VisitDirectoriesReq& req, VisitDirectoriesResp& resp) { return _visitInodes(_directoriesCf, req, resp); } EggsError _fileSpans(const FileSpansReq& req, FileSpansResp& resp) { // snapshot probably not strictly needed -- it's for the possible lookup if we // got no results. even if returned a false positive/negative there it probably // wouldn't matter. but it's more pleasant. WrappedSnapshot snapshot(_db); rocksdb::ReadOptions options; options.snapshot = snapshot.snapshot; int budget = UDP_MTU - ShardResponseHeader::STATIC_SIZE - FileSpansResp::STATIC_SIZE; // if -1, we ran out of budget. const auto addBlockService = [this, &resp, &budget](uint64_t blockServiceId) -> int { // See if we've placed it already for (int i = 0; i < resp.blockServices.els.size(); i++) { if (resp.blockServices.els.at(i).id == blockServiceId) { return i; } } // If not, we need to make space for it budget -= BlockService::STATIC_SIZE; if (budget < 0) { return -1; } auto& blockService = resp.blockServices.els.emplace_back(); const auto& cache = _blockServicesCache.at(blockServiceId); blockService.id = blockServiceId; blockService.ip = cache.ip; blockService.port = cache.port; // TODO propagade block service flags here return resp.blockServices.els.size()-1; }; StaticValue beginKey; beginKey().setFileId(req.fileId); beginKey().setOffset(req.byteOffset); { WrappedIterator it(_db->NewIterator(options, _spansCf)); for (it->SeekForPrev(beginKey.toSlice()); it->Valid(); it->Next()) { auto key = ExternalValue::FromSlice(it->key()); if (key().fileId() != req.fileId) { break; } auto value = ExternalValue::FromSlice(it->value()); auto& respSpan = resp.spans.els.emplace_back(); respSpan.byteOffset = key().offset(); respSpan.parity = value().parity(); respSpan.storageClass = value().storageClass(); respSpan.crc32 = value().crc32(); respSpan.size = value().spanSize(); respSpan.blockSize = value().blockSize(); if (value().storageClass() == INLINE_STORAGE) { respSpan.bodyBytes = value().inlineBody(); } else { respSpan.bodyBytes.clear(); for (int i = 0; i < value().parity().blocks(); i++) { auto block = value().block(i); int blockServiceIx = addBlockService(block.blockServiceId); if (blockServiceIx < 0) { break; // no need to break in outer loop -- we will break out anyway because budget < 0 } ALWAYS_ASSERT(blockServiceIx < 256); auto& respBlock = respSpan.bodyBlocks.els.emplace_back(); respBlock.blockId = block.blockId; respBlock.blockServiceIx = blockServiceIx; respBlock.crc32 = block.crc32; } } budget -= respSpan.packedSize(); if (budget < 0) { resp.nextOffset = respSpan.byteOffset; resp.spans.els.pop_back(); break; } } } // Check if file does not exist when we have no spans if (resp.spans.els.size() == 0) { std::string fileValue; ExternalValue file; EggsError err = _getFile(options, req.fileId, fileValue, file); if (err != NO_ERROR) { return err; } } return NO_ERROR; } EggsError _blockServiceFiles(const BlockServiceFilesReq& req, BlockServiceFilesResp& resp) { int maxFiles = (UDP_MTU - ShardResponseHeader::STATIC_SIZE - BlockServiceFilesResp::STATIC_SIZE) / 8; resp.fileIds.els.reserve(maxFiles); StaticValue beginKey; beginKey().setBlockServiceId(req.blockServiceId); beginKey().setFileId(req.startFrom); StaticValue endKey; beginKey().setBlockServiceId(req.blockServiceId+1); beginKey().setFileId(NULL_INODE_ID); auto endKeySlice = endKey.toSlice(); rocksdb::ReadOptions options; options.iterate_upper_bound = &endKeySlice; WrappedIterator it(_db->NewIterator(options)); int i; for ( it->Seek(beginKey.toSlice()), i = 0; it->Valid() && i < maxFiles; it->Next(), i++ ) { auto key = ExternalValue::FromSlice(it->key()); resp.fileIds.els.emplace_back(key().fileId()); } ROCKS_DB_CHECKED(it->status()); return NO_ERROR; } EggsError _visitFiles(const VisitFilesReq& req, VisitFilesResp& resp) { return _visitInodes(_filesCf, req, resp); } EggsError _snapshotLookup(const SnapshotLookupReq& req, SnapshotLookupResp& resp) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } uint64_t nameHash; { // allowSnapshot=true since we want this to also work for snaphsot dirs EggsError err = _getDirectoryAndHash({}, req.dirId, true, req.name.ref(), nameHash); if (err != NO_ERROR) { return err; } } int maxEdges = 1 + (UDP_MTU - ShardResponseHeader::STATIC_SIZE - SnapshotLookupResp::STATIC_SIZE) / SnapshotLookupEdge::STATIC_SIZE; WrappedIterator it(_db->NewIterator({}, _edgesCf)); StaticValue firstK; firstK().setDirIdWithCurrent(req.dirId, false); firstK().setNameHash(nameHash); firstK().setName(req.name.ref()); firstK().setCreationTime(req.startFrom); int i; for ( i = 0, it->Seek(firstK.toSlice()); i < maxEdges && it->Valid(); i++, it->Next() ) { auto k = ExternalValue::FromSlice(it->key()); if ( k().nameHash() != nameHash || k().current() || // only snapshot edges k().dirId() != req.dirId || k().name() != req.name.ref() ) { break; } auto v = ExternalValue::FromSlice(it->value()); auto& edge = resp.edges.els.emplace_back(); edge.targetId = v().targetIdWithOwned(); edge.creationTime = k().creationTime(); } ROCKS_DB_CHECKED(it->status()); if (resp.edges.els.size() == maxEdges) { resp.nextTime = resp.edges.els.back().creationTime; resp.edges.els.pop_back(); } return NO_ERROR; } EggsError read(const ShardReqContainer& req, ShardRespContainer& resp) { LOG_DEBUG(_env, "processing read-only request of kind %s", req.kind()); EggsError err = NO_ERROR; resp.clear(); switch (req.kind()) { case ShardMessageKind::STAT_FILE: err = _statFile(req.getStatFile(), resp.setStatFile()); break; case ShardMessageKind::READ_DIR: err = _readDir(req.getReadDir(), resp.setReadDir()); break; case ShardMessageKind::STAT_DIRECTORY: err = _statDirectory(req.getStatDirectory(), resp.setStatDirectory()); break; case ShardMessageKind::STAT_TRANSIENT_FILE: err = _statTransientFile(req.getStatTransientFile(), resp.setStatTransientFile()); break; case ShardMessageKind::LOOKUP: err = _lookup(req.getLookup(), resp.setLookup()); break; case ShardMessageKind::VISIT_TRANSIENT_FILES: err = _visitTransientFiles(req.getVisitTransientFiles(), resp.setVisitTransientFiles()); break; case ShardMessageKind::FULL_READ_DIR: err = _fullReadDir(req.getFullReadDir(), resp.setFullReadDir()); break; case ShardMessageKind::VISIT_DIRECTORIES: err = _visitDirectories(req.getVisitDirectories(), resp.setVisitDirectories()); break; case ShardMessageKind::FILE_SPANS: err = _fileSpans(req.getFileSpans(), resp.setFileSpans()); break; case ShardMessageKind::BLOCK_SERVICE_FILES: err = _blockServiceFiles(req.getBlockServiceFiles(), resp.setBlockServiceFiles()); break; case ShardMessageKind::VISIT_FILES: err = _visitFiles(req.getVisitFiles(), resp.setVisitFiles()); break; case ShardMessageKind::SNAPSHOT_LOOKUP: err = _snapshotLookup(req.getSnapshotLookup(), resp.setSnapshotLookup()); break; default: throw EGGS_EXCEPTION("bad read-only shard message kind %s", req.kind()); } ALWAYS_ASSERT(req.kind() == resp.kind()); return err; } // ---------------------------------------------------------------- // log preparation EggsError _prepareConstructFile(EggsTime time, const ConstructFileReq& req, ConstructFileEntry& entry) { if (req.type != (uint8_t)InodeType::FILE && req.type != (uint8_t)InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } entry.type = req.type; entry.note = req.note; entry.deadlineTime = time + DEADLINE_INTERVAL; return NO_ERROR; } EggsError _checkTransientFileCookie(InodeId id, std::array cookie) { if (id.type() != InodeType::FILE && id.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } std::array expectedCookie; if (cookie != _calcCookie(id)) { return EggsError::BAD_COOKIE; } return NO_ERROR; } EggsError _prepareLinkFile(EggsTime time, const LinkFileReq& req, LinkFileEntry& entry) { // some early, preliminary checks if (req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.ownerId.shard() != _shid || req.fileId.shard() != _shid) { return EggsError::BAD_SHARD; } EggsError err = _checkTransientFileCookie(req.fileId, req.cookie.data); if (err != NO_ERROR) { return err; } entry.fileId = req.fileId; entry.name = req.name; entry.ownerId = req.ownerId; return NO_ERROR; } EggsError _prepareSameDirectoryRename(EggsTime time, const SameDirectoryRenameReq& req, SameDirectoryRenameEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.oldName == req.newName) { return EggsError::SAME_SOURCE_AND_DESTINATION; } if (!validName(req.newName.ref())) { return EggsError::BAD_NAME; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.dirId = req.dirId; entry.oldCreationTime = req.oldCreationTime; entry.oldName = req.oldName; entry.newName = req.newName; entry.targetId = req.targetId; return NO_ERROR; } EggsError _prepareSoftUnlinkFile(EggsTime time, const SoftUnlinkFileReq& req, SoftUnlinkFileEntry& entry) { if (req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.fileId.type() != InodeType::FILE && req.fileId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.ownerId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.ownerId = req.ownerId; entry.fileId = req.fileId; entry.name = req.name; entry.creationTime = req.creationTime; return NO_ERROR; } EggsError _prepareCreateDirectoryInode(EggsTime time, const CreateDirectoryInodeReq& req, CreateDirectoryInodeEntry& entry) { if (req.id.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.id.type() != InodeType::DIRECTORY || req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.info.inherited != (req.info.body.size() == 0)) { return EggsError::BAD_DIRECTORY_INFO; } entry.id = req.id; entry.ownerId = req.ownerId; entry.info.inherited = req.info.inherited; entry.info.body = req.info.body; return NO_ERROR; } EggsError _prepareCreateLockedCurrentEdge(EggsTime time, const CreateLockedCurrentEdgeReq& req, CreateLockedCurrentEdgeEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } if (!validName(req.name.ref())) { return EggsError::BAD_NAME; } ALWAYS_ASSERT(req.targetId != NULL_INODE_ID); // proper error entry.dirId = req.dirId; entry.targetId = req.targetId; entry.name = req.name; return NO_ERROR; } EggsError _prepareUnlockCurrentEdge(EggsTime time, const UnlockCurrentEdgeReq& req, UnlockCurrentEdgeEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.dirId = req.dirId; entry.targetId = req.targetId; entry.name = req.name; entry.wasMoved = req.wasMoved; entry.creationTime = req.creationTime; return NO_ERROR; } EggsError _prepareLockCurrentEdge(EggsTime time, const LockCurrentEdgeReq& req, LockCurrentEdgeEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.dirId = req.dirId; entry.name = req.name; entry.targetId = req.targetId; entry.creationTime = req.creationTime; return NO_ERROR; } EggsError _prepareRemoveDirectoryOwner(EggsTime time, const RemoveDirectoryOwnerReq& req, RemoveDirectoryOwnerEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.info.size() == 0) { return EggsError::BAD_DIRECTORY_INFO; } ALWAYS_ASSERT(req.dirId != ROOT_DIR_INODE_ID); // TODO proper error entry.dirId = req.dirId; entry.info = req.info; return NO_ERROR; } EggsError _prepareRemoveInode(EggsTime time, const RemoveInodeReq& req, RemoveInodeEntry& entry) { if (req.id.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.id == ROOT_DIR_INODE_ID) { return EggsError::CANNOT_REMOVE_ROOT_DIRECTORY; } entry.id = req.id; return NO_ERROR; } EggsError _prepareSetDirectoryOwner(EggsTime time, const SetDirectoryOwnerReq& req, SetDirectoryOwnerEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } entry.dirId = req.dirId; entry.ownerId = req.ownerId; return NO_ERROR; } EggsError _prepareSetDirectoryInfo(EggsTime time, const SetDirectoryInfoReq& req, SetDirectoryInfoEntry& entry) { if (req.id.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.id.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.info.inherited && req.id == ROOT_DIR_INODE_ID) { return EggsError::BAD_DIRECTORY_INFO; } if (req.info.inherited != (req.info.body.size() == 0)) { return EggsError::BAD_DIRECTORY_INFO; } entry.dirId = req.id; entry.info.inherited = req.info.inherited; entry.info.body = req.info.body; return NO_ERROR; } EggsError _prepareRemoveNonOwnedEdge(EggsTime time, const RemoveNonOwnedEdgeReq& req, RemoveNonOwnedEdgeEntry& entry) { if (req.dirId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.dirId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.dirId = req.dirId; entry.creationTime = req.creationTime; entry.name = req.name; return NO_ERROR; } EggsError _prepareSameShardHardFileUnlink(EggsTime time, const SameShardHardFileUnlinkReq& req, SameShardHardFileUnlinkEntry& entry) { if (req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.targetId.type() != InodeType::FILE && req.targetId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.ownerId.shard() != _shid || req.targetId.shard() != _shid) { return EggsError::BAD_SHARD; } entry.ownerId = req.ownerId; entry.targetId = req.targetId; entry.name = req.name; entry.creationTime = req.creationTime; return NO_ERROR; } EggsError _prepareRemoveSpanInitiate(EggsTime time, const RemoveSpanInitiateReq& req, RemoveSpanInitiateEntry& entry) { if (req.fileId.type() != InodeType::FILE && req.fileId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.fileId.shard() != _shid) { return EggsError::BAD_SHARD; } { EggsError err = _checkTransientFileCookie(req.fileId, req.cookie.data); if (err != NO_ERROR) { return err; } } entry.fileId = req.fileId; return NO_ERROR; } bool _checkSpanBody(const AddSpanInitiateReq& req) { // Note that the span size might be bigger or smaller than // the data -- check comment on top of `AddSpanInitiateReq` in `msgs.go` // for details. // The two special cases if (req.storageClass == EMPTY_STORAGE) { if (req.bodyBytes.size() != 0 || req.bodyBlocks.els.size() != 0 || req.parity != Parity(0) || req.size != 0 || req.blockSize != 0) { return false; } if (req.crc32.data != crc32c("", 0)) { return false; } return true; } if (req.storageClass == INLINE_STORAGE) { if (req.bodyBytes.size() == 0 || req.bodyBlocks.els.size() != 0 || req.parity != Parity(0) || req.size != req.bodyBytes.size() || req.blockSize != 0) { return false; } auto expectedCrc32 = crc32c(req.bodyBytes.data(), req.bodyBytes.size()); if (req.crc32.data != expectedCrc32) { return false; } return true; } if (req.parity.dataBlocks() == 1) { // mirroring blocks should all be the same for (const auto& block: req.bodyBlocks.els) { if (block.crc32 != req.crc32) { return false; } } } else { // consistency check for the general case // // For parity mode 1+M (mirroring), for any M, the server can validate // that the CRC of the span equals the CRC of every block. // For N+M in the general case, it can probably validate that the CRC of // the span equals the concatenation of the CRCs of the N, and that the // CRC of the 1st of the M equals the XOR of the CRCs of the N. // It cannot validate anything about the rest of the M though (scrubbing // can validate the rest, if it wants to, but it would be too expensive // for the shard to validate). std::array blocksCrc32; memset(&blocksCrc32[0], 0, 4); for (const auto& block: req.bodyBlocks.els) { blocksCrc32 = crc32cCombine(blocksCrc32, block.crc32.data, req.blockSize); } if (req.crc32.data != blocksCrc32) { return false; } // TODO check parity block CRC32 } return true; } bool _blockServiceMatchesBlacklist(const std::vector& blacklists, uint64_t blockServiceId, const BlockServiceCache& cache) { for (const auto& blacklist: blacklists) { if (blacklist.id == blockServiceId) { return true; } if (blacklist.ip == cache.ip && blacklist.port == cache.port) { return true; } } return false; } EggsError _prepareAddSpanInitiate(EggsTime time, const AddSpanInitiateReq& req, AddSpanInitiateEntry& entry) { if (req.fileId.type() != InodeType::FILE && req.fileId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.fileId.shard() != _shid) { return EggsError::BAD_SHARD; } { EggsError err = _checkTransientFileCookie(req.fileId, req.cookie.data); if (err != NO_ERROR) { return err; } } if (!_checkSpanBody(req)) { return EggsError::BAD_SPAN_BODY; } // start filling in entry entry.fileId = req.fileId; entry.byteOffset = req.byteOffset; entry.storageClass = req.storageClass; entry.parity = req.parity; entry.crc32 = req.crc32; entry.size = req.size; entry.blockSize = req.blockSize; entry.bodyBytes = req.bodyBytes; // Now fill in the block services. Generally we want to try to keep them the same // throughout the file, if possible, so that the likelihood of data loss is minimized. // // TODO factor failure domains in the decision if (entry.storageClass != INLINE_STORAGE && entry.storageClass != EMPTY_STORAGE) { std::vector candidateBlockServices; candidateBlockServices.reserve(_currentBlockServices.size()); LOG_DEBUG(_env, "Starting out with %s current block services", _currentBlockServices.size()); for (uint64_t id: _currentBlockServices) { const auto& cache = _blockServicesCache.at(id); if (cache.storageClass != entry.storageClass) { LOG_DEBUG(_env, "Skipping %s because of different storage class (%s != %s)", id, (int)cache.storageClass, (int)entry.storageClass); continue; } if (_blockServiceMatchesBlacklist(req.blacklist.els, id, cache)) { LOG_DEBUG(_env, "Skipping %s because it matches blacklist", id); continue; } candidateBlockServices.emplace_back(id); } LOG_DEBUG(_env, "Starting out with %s block service candidates", candidateBlockServices.size()); std::vector pickedBlockServices; pickedBlockServices.reserve(req.parity.blocks()); // Try to get the first span to copy its block services -- this should be the // very common case past the first span. { StaticValue k; k().setFileId(req.fileId); k().setOffset(0); std::string v; auto status = _db->Get({}, _spansCf, k.toSlice(), &v); if (status.IsNotFound()) { // no-op -- we'll just generate them all at random } else { ROCKS_DB_CHECKED(status); ExternalValue span(v); for ( int i = 0; i < span().parity().blocks() && pickedBlockServices.size() < req.parity.blocks() && candidateBlockServices.size() > 0; i++ ) { BlockBody spanBlock = span().block(i); auto isCandidate = std::find(candidateBlockServices.begin(), candidateBlockServices.end(), spanBlock.blockServiceId); if (isCandidate == candidateBlockServices.end()) { continue; } LOG_DEBUG(_env, "(1) Picking block service candidate %s", spanBlock.blockServiceId); pickedBlockServices.emplace_back(spanBlock.blockServiceId); std::iter_swap(isCandidate, candidateBlockServices.end()-1); candidateBlockServices.pop_back(); } } } // Fill in whatever remains. We don't need to be deterministic here (we would have to // if we were in log application), but we might as well. { uint64_t rand = time.ns; while (pickedBlockServices.size() < req.parity.blocks() && candidateBlockServices.size() > 0) { uint64_t ix = splitmix64(rand) % candidateBlockServices.size(); LOG_DEBUG(_env, "(2) Picking block service candidate %s", candidateBlockServices[ix]); pickedBlockServices.emplace_back(candidateBlockServices[ix]); std::iter_swap(candidateBlockServices.begin()+ix, candidateBlockServices.end()-1); candidateBlockServices.pop_back(); } } // If we still couldn't find enough block services, we're toast. if (pickedBlockServices.size() < req.parity.blocks()) { return EggsError::COULD_NOT_PICK_BLOCK_SERVICES; } // Now generate the blocks entry.bodyBlocks.els.resize(req.parity.blocks()); for (int i = 0; i < req.parity.blocks(); i++) { const auto& reqBlock = req.bodyBlocks.els[i]; auto& block = entry.bodyBlocks.els[i]; block.blockServiceId = pickedBlockServices[i]; block.crc32 = reqBlock.crc32; } } return NO_ERROR; } EggsError _prepareAddSpanCertify(EggsTime time, const AddSpanCertifyReq& req, AddSpanCertifyEntry& entry) { if (req.fileId.type() != InodeType::FILE && req.fileId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.fileId.shard() != _shid) { return EggsError::BAD_SHARD; } { EggsError err = _checkTransientFileCookie(req.fileId, req.cookie.data); if (err != NO_ERROR) { return err; } } entry.fileId = req.fileId; entry.byteOffset = req.byteOffset; entry.proofs = req.proofs; return NO_ERROR; } EggsError _prepareMakeFileTransient(EggsTime time, const MakeFileTransientReq& req, MakeFileTransientEntry& entry) { if (req.id.type() != InodeType::FILE && req.id.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.id.shard() != _shid) { return EggsError::BAD_SHARD; } entry.id = req.id; entry.note = req.note; return NO_ERROR; } EggsError _prepareRemoveSpanCertify(EggsTime time, const RemoveSpanCertifyReq& req, RemoveSpanCertifyEntry& entry) { if (req.fileId.type() != InodeType::FILE && req.fileId.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } if (req.fileId.shard() != _shid) { return EggsError::BAD_SHARD; } { EggsError err = _checkTransientFileCookie(req.fileId, req.cookie.data); if (err != NO_ERROR) { return err; } } entry.fileId = req.fileId; entry.byteOffset = req.byteOffset; entry.proofs = req.proofs; return NO_ERROR; } EggsError _prepareRemoveOwnedSnapshotFileEdge(EggsTime time, const RemoveOwnedSnapshotFileEdgeReq& req, RemoveOwnedSnapshotFileEdgeEntry& entry) { if (req.ownerId.type() != InodeType::DIRECTORY) { return EggsError::TYPE_IS_NOT_DIRECTORY; } if (req.ownerId.shard() != _shid) { return EggsError::BAD_SHARD; } if (req.targetId.type () != InodeType::FILE && req.targetId.type () != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } entry.ownerId = req.ownerId; entry.targetId = req.targetId; entry.creationTime = req.creationTime; entry.name = req.name; return NO_ERROR; } EggsError prepareLogEntry(const ShardReqContainer& req, ShardLogEntry& logEntry) { LOG_DEBUG(_env, "processing write request of kind %s", req.kind()); logEntry.clear(); EggsError err = NO_ERROR; EggsTime time = eggsNow(); logEntry.time = time; auto& logEntryBody = logEntry.body; switch (req.kind()) { case ShardMessageKind::CONSTRUCT_FILE: err = _prepareConstructFile(time, req.getConstructFile(), logEntryBody.setConstructFile()); break; case ShardMessageKind::LINK_FILE: err = _prepareLinkFile(time, req.getLinkFile(), logEntryBody.setLinkFile()); break; case ShardMessageKind::SAME_DIRECTORY_RENAME: err = _prepareSameDirectoryRename(time, req.getSameDirectoryRename(), logEntryBody.setSameDirectoryRename()); break; case ShardMessageKind::SOFT_UNLINK_FILE: err = _prepareSoftUnlinkFile(time, req.getSoftUnlinkFile(), logEntryBody.setSoftUnlinkFile()); break; case ShardMessageKind::CREATE_DIRECTORY_INODE: err = _prepareCreateDirectoryInode(time, req.getCreateDirectoryInode(), logEntryBody.setCreateDirectoryInode()); break; case ShardMessageKind::CREATE_LOCKED_CURRENT_EDGE: err = _prepareCreateLockedCurrentEdge(time, req.getCreateLockedCurrentEdge(), logEntryBody.setCreateLockedCurrentEdge()); break; case ShardMessageKind::UNLOCK_CURRENT_EDGE: err = _prepareUnlockCurrentEdge(time, req.getUnlockCurrentEdge(), logEntryBody.setUnlockCurrentEdge()); break; case ShardMessageKind::LOCK_CURRENT_EDGE: err = _prepareLockCurrentEdge(time, req.getLockCurrentEdge(), logEntryBody.setLockCurrentEdge()); break; case ShardMessageKind::REMOVE_DIRECTORY_OWNER: err = _prepareRemoveDirectoryOwner(time, req.getRemoveDirectoryOwner(), logEntryBody.setRemoveDirectoryOwner()); break; case ShardMessageKind::REMOVE_INODE: err = _prepareRemoveInode(time, req.getRemoveInode(), logEntryBody.setRemoveInode()); break; case ShardMessageKind::SET_DIRECTORY_OWNER: err = _prepareSetDirectoryOwner(time, req.getSetDirectoryOwner(), logEntryBody.setSetDirectoryOwner()); break; case ShardMessageKind::SET_DIRECTORY_INFO: err = _prepareSetDirectoryInfo(time, req.getSetDirectoryInfo(), logEntryBody.setSetDirectoryInfo()); break; case ShardMessageKind::REMOVE_NON_OWNED_EDGE: err = _prepareRemoveNonOwnedEdge(time, req.getRemoveNonOwnedEdge(), logEntryBody.setRemoveNonOwnedEdge()); break; case ShardMessageKind::SAME_SHARD_HARD_FILE_UNLINK: err = _prepareSameShardHardFileUnlink(time, req.getSameShardHardFileUnlink(), logEntryBody.setSameShardHardFileUnlink()); break; case ShardMessageKind::REMOVE_SPAN_INITIATE: err = _prepareRemoveSpanInitiate(time, req.getRemoveSpanInitiate(), logEntryBody.setRemoveSpanInitiate()); break; case ShardMessageKind::ADD_SPAN_INITIATE: err = _prepareAddSpanInitiate(time, req.getAddSpanInitiate(), logEntryBody.setAddSpanInitiate()); break; case ShardMessageKind::ADD_SPAN_CERTIFY: err = _prepareAddSpanCertify(time, req.getAddSpanCertify(), logEntryBody.setAddSpanCertify()); break; case ShardMessageKind::MAKE_FILE_TRANSIENT: err = _prepareMakeFileTransient(time, req.getMakeFileTransient(), logEntryBody.setMakeFileTransient()); break; case ShardMessageKind::REMOVE_SPAN_CERTIFY: err = _prepareRemoveSpanCertify(time, req.getRemoveSpanCertify(), logEntryBody.setRemoveSpanCertify()); break; case ShardMessageKind::REMOVE_OWNED_SNAPSHOT_FILE_EDGE: err = _prepareRemoveOwnedSnapshotFileEdge(time, req.getRemoveOwnedSnapshotFileEdge(), logEntryBody.setRemoveOwnedSnapshotFileEdge()); break; case ShardMessageKind::SWAP_BLOCKS: throw EGGS_EXCEPTION("UNIMPLEMENTED"); default: throw EGGS_EXCEPTION("bad write shard message kind %s", req.kind()); } if (err == NO_ERROR) { LOG_DEBUG(_env, "prepared log entry of kind %s, for request of kind %s: %s", logEntryBody.kind(), req.kind(), logEntryBody); } else { LOG_INFO(_env, "could not prepare log entry for request of kind %s: %s", req.kind(), err); } return err; } // ---------------------------------------------------------------- // log application void _advanceLastAppliedLogEntry(rocksdb::WriteBatch& batch, uint64_t index) { uint64_t oldIndex = _lastAppliedLogEntry(); ALWAYS_ASSERT(oldIndex+1 == index, "old index is %s, expected %s, got %s", oldIndex, oldIndex+1, index); LOG_DEBUG(_env, "bumping log index from %s to %s", oldIndex, index); StaticValue v; v().setU64(index); ROCKS_DB_CHECKED(batch.Put({}, shardMetadataKey(&LAST_APPLIED_LOG_ENTRY_KEY), v.toSlice())); } EggsError _applyConstructFile(rocksdb::WriteBatch& batch, EggsTime time, const ConstructFileEntry& entry, ConstructFileResp& resp) { const auto nextFileId = [this, &batch](const ShardMetadataKey* key) -> InodeId { std::string value; ROCKS_DB_CHECKED(_db->Get({}, shardMetadataKey(key), &value)); ExternalValue inodeId(value); inodeId().setId(InodeId::FromU64(inodeId().id().u64 + 0x100)); ROCKS_DB_CHECKED(batch.Put(shardMetadataKey(key), inodeId.toSlice())); return inodeId().id(); }; InodeId id; if (entry.type == (uint8_t)InodeType::FILE) { id = nextFileId(&NEXT_FILE_ID_KEY); } else if (entry.type == (uint8_t)InodeType::SYMLINK) { id = nextFileId(&NEXT_SYMLINK_ID_KEY); } else { ALWAYS_ASSERT(false, "Bad type %s", (int)entry.type); } // write to rocks StaticValue transientFile; transientFile().setFileSize(0); transientFile().setMtime(time); transientFile().setDeadline(entry.deadlineTime); transientFile().setLastSpanState(SpanState::CLEAN); transientFile().setNote(entry.note.ref()); auto k = InodeIdKey::Static(id); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), transientFile.toSlice())); // prepare response resp.id = id; resp.cookie.data = _calcCookie(resp.id); return NO_ERROR; } EggsError _applyLinkFile(rocksdb::WriteBatch& batch, EggsTime time, const LinkFileEntry& entry, LinkFileResp& resp) { std::string fileValue; ExternalValue transientFile; { EggsError err = _getTransientFile({}, time, false /*allowPastDeadline*/, entry.fileId, fileValue, transientFile); if (err == EggsError::FILE_NOT_FOUND) { // Check if the file has already been linked to simplify the life of retrying // clients. uint64_t nameHash; // Return original error if the dir doens't exist, since this is some recovery mechanism anyway if (_getDirectoryAndHash({}, entry.ownerId, false /*allowSnapshot*/, entry.name.ref(), nameHash) != NO_ERROR) { LOG_DEBUG(_env, "could not find directory after FILE_NOT_FOUND for link file"); return err; } StaticValue edgeKey; edgeKey().setDirIdWithCurrent(entry.ownerId, true); edgeKey().setNameHash(nameHash); edgeKey().setName(entry.name.ref()); std::string edgeValue; { auto status = _db->Get({}, _edgesCf, edgeKey.toSlice(), &edgeValue); if (status.IsNotFound()) { LOG_DEBUG(_env, "could not find edge after FILE_NOT_FOUND for link file"); return err; } ROCKS_DB_CHECKED(status); } ExternalValue edge(edgeValue); if (edge().targetId() != entry.fileId) { LOG_DEBUG(_env, "mismatching file id after FILE_NOT_FOUND for link file"); return err; } resp.creationTime = edge().creationTime(); return NO_ERROR; } else if (err != NO_ERROR) { return err; } } if (transientFile().lastSpanState() != SpanState::CLEAN) { return EggsError::LAST_SPAN_STATE_NOT_CLEAN; } // move from transient to non-transient. auto fileKey = InodeIdKey::Static(entry.fileId); ROCKS_DB_CHECKED(batch.Delete(_transientCf, fileKey.toSlice())); StaticValue file; file().setMtime(time); file().setFileSize(transientFile().fileSize()); ROCKS_DB_CHECKED(batch.Put(_filesCf, fileKey.toSlice(), file.toSlice())); // create edge in owner. { EggsError err = ShardDBImpl::_createCurrentEdge(time, batch, entry.ownerId, entry.name, entry.fileId, false); if (err != NO_ERROR) { return err; } } resp.creationTime = time; return NO_ERROR; } EggsError _initiateDirectoryModification(EggsTime time, bool allowSnapshot, rocksdb::WriteBatch& batch, InodeId dirId, std::string& dirValue, ExternalValue& dir) { ExternalValue tmpDir; EggsError err = _getDirectory({}, dirId, allowSnapshot, dirValue, tmpDir); if (err != NO_ERROR) { return err; } // Don't go backwards in time. This is important amongst other things to ensure // that we have snapshot edges to be uniquely identified by name, hash, creationTime. // This should be very uncommon. if (tmpDir().mtime() >= time) { RAISE_ALERT(_env, "trying to modify dir %s going backwards in time, dir mtime is %s, log entry time is %s", dirId, tmpDir().mtime(), time); return EggsError::MTIME_IS_TOO_RECENT; } // Modify the directory mtime tmpDir().setMtime(time); { auto k = InodeIdKey::Static(dirId); ROCKS_DB_CHECKED(batch.Put(_directoriesCf, k.toSlice(), tmpDir.toSlice())); } dir = tmpDir; return NO_ERROR; } // When we just want to compute the hash of something when modifying the dir EggsError _initiateDirectoryModificationAndHash(EggsTime time, bool allowSnapshot, rocksdb::WriteBatch& batch, InodeId dirId, const BincodeBytesRef& name, uint64_t& nameHash) { ExternalValue dir; std::string dirValue; EggsError err = _initiateDirectoryModification(time, allowSnapshot, batch, dirId, dirValue, dir); if (err != NO_ERROR) { return err; } nameHash = computeHash(dir().hashMode(), name); return NO_ERROR; } // Note that we cannot expose an API which allows us to create non-locked current edges, // see comment for CreateLockedCurrentEdgeReq. // // `logEntryTime` and `creationTime` are separate since we need a different `creationTime` // for CreateDirectoryInodeReq. EggsError _createCurrentEdge(EggsTime logEntryTime, rocksdb::WriteBatch& batch, InodeId dirId, const BincodeBytes& name, InodeId targetId, bool locked) { EggsTime creationTime = logEntryTime; uint64_t nameHash; { // allowSnaphsot=false since we cannot create current edges in snapshot directories. EggsError err = _initiateDirectoryModificationAndHash(logEntryTime, false, batch, dirId, name.ref(), nameHash); if (err != NO_ERROR) { return err; } } // Next, we need to look at the current edge with the same name, if any. StaticValue edgeKey; edgeKey().setDirIdWithCurrent(dirId, true); // current=true edgeKey().setNameHash(nameHash); edgeKey().setName(name.ref()); std::string edgeValue; auto status = _db->Get({}, _edgesCf, edgeKey.toSlice(), &edgeValue); // in the block below, we exit the function early if something is off. if (status.IsNotFound()) { // we're the first one here -- we only need to check the time of // the snaphsot edges if the creation time is earlier than the // log entry time, otherwise we know that the snapshot edges are // all older, since they were all created before logEntryTime. StaticValue snapshotEdgeKey; snapshotEdgeKey().setDirIdWithCurrent(dirId, false); // snapshhot (current=false) snapshotEdgeKey().setNameHash(nameHash); snapshotEdgeKey().setName(name.ref()); snapshotEdgeKey().setCreationTime({std::numeric_limits::max()}); WrappedIterator it(_db->NewIterator({}, _edgesCf)); it->SeekForPrev(snapshotEdgeKey.toSlice()); if (it->Valid() && !it->status().IsNotFound()) { auto k = ExternalValue::FromSlice(it->key()); if (k().dirId() == dirId && !k().current() && k().nameHash() == nameHash && k().name() == name.ref()) { if (k().creationTime() >= creationTime) { return EggsError::MORE_RECENT_SNAPSHOT_EDGE; } } } ROCKS_DB_CHECKED(it->status()); } else { ROCKS_DB_CHECKED(status); ExternalValue existingEdge(edgeValue); if (existingEdge().targetIdWithLocked().extra()) { // locked // we have an existing locked edge, we need to make sure that it's the one we expect for // idempotency. if ( !locked || // the one we're trying to create isn't locked (targetId != existingEdge().targetIdWithLocked().id()) // we're trying to create a different locked edge ) { return EggsError::NAME_IS_LOCKED; } } else { // We're kicking out a non-locked current edge. The only circumstance where we allow // this automatically is if a file is overriding another file, which is also how it // works in linux/posix (see `man 2 rename`). if (existingEdge().creationTime() >= creationTime) { return EggsError::MORE_RECENT_CURRENT_EDGE; } if ( targetId.type() == InodeType::DIRECTORY || existingEdge().targetIdWithLocked().id().type() == InodeType::DIRECTORY ) { return EggsError::CANNOT_OVERRIDE_NAME; } // make what is now the current edge a snapshot edge -- no need to delete it, // it'll be overwritten below. { StaticValue k; k().setDirIdWithCurrent(dirId, false); // snapshot (current=false) k().setNameHash(nameHash); k().setName(name.ref()); k().setCreationTime(existingEdge().creationTime()); StaticValue v; // this was current, so it's now owned. v().setTargetIdWithOwned(InodeIdExtra(existingEdge().targetIdWithLocked().id(), true)); ROCKS_DB_CHECKED(batch.Put(_edgesCf, k.toSlice(), v.toSlice())); } } } // OK, we're now ready to insert the current edge StaticValue edgeBody; edgeBody().setTargetIdWithLocked(InodeIdExtra(targetId, locked)); edgeBody().setCreationTime(creationTime); ROCKS_DB_CHECKED(batch.Put(_edgesCf, edgeKey.toSlice(), edgeBody.toSlice())); return NO_ERROR; } EggsError _applySameDirectoryRename(EggsTime time, rocksdb::WriteBatch& batch, const SameDirectoryRenameEntry& entry, SameDirectoryRenameResp& resp) { // First, remove the old edge -- which won't be owned anymore, since we're renaming it. { EggsError err = _softUnlinkCurrentEdge(time, batch, entry.dirId, entry.oldName, entry.oldCreationTime, entry.targetId, false); if (err != NO_ERROR) { return err; } } // Now, create the new one { EggsError err = _createCurrentEdge(time, batch, entry.dirId, entry.newName, entry.targetId, false); if (err != NO_ERROR) { return err; } } resp.newCreationTime = time; return NO_ERROR; } EggsError _softUnlinkCurrentEdge(EggsTime time, rocksdb::WriteBatch& batch, InodeId dirId, const BincodeBytes& name, EggsTime creationTime, InodeId targetId, bool owned) { // compute hash uint64_t nameHash; { // allowSnaphsot=false since we can't have current edges in snapshot dirs EggsError err = _initiateDirectoryModificationAndHash(time, false, batch, dirId, name.ref(), nameHash); if (err != NO_ERROR) { return err; } } // get the edge StaticValue edgeKey; edgeKey().setDirIdWithCurrent(dirId, true); // current=true edgeKey().setNameHash(nameHash); edgeKey().setName(name.ref()); std::string edgeValue; auto status = _db->Get({}, _edgesCf, edgeKey.toSlice(), &edgeValue); if (status.IsNotFound()) { return EggsError::EDGE_NOT_FOUND; } ROCKS_DB_CHECKED(status); ExternalValue edgeBody(edgeValue); if (edgeBody().targetIdWithLocked().id() != targetId) { LOG_DEBUG(_env, "expecting target %s, but got %s", targetId, edgeBody().targetIdWithLocked().id()); return EggsError::MISMATCHING_TARGET; } if (edgeBody().creationTime() != creationTime) { return EggsError::MISMATCHING_CREATION_TIME; } if (edgeBody().targetIdWithLocked().extra()) { // locked return EggsError::EDGE_IS_LOCKED; } // delete the current edge batch.Delete(_edgesCf, edgeKey.toSlice()); // add the two snapshot edges, one for what was the current edge, // and another to signify deletion { StaticValue k; k().setDirIdWithCurrent(dirId, false); // snapshot (current=false) k().setNameHash(nameHash); k().setName(name.ref()); k().setCreationTime(edgeBody().creationTime()); StaticValue v; v().setTargetIdWithOwned(InodeIdExtra(targetId, owned)); ROCKS_DB_CHECKED(batch.Put(_edgesCf, k.toSlice(), v.toSlice())); k().setCreationTime(time); v().setTargetIdWithOwned(InodeIdExtra(NULL_INODE_ID, false)); ROCKS_DB_CHECKED(batch.Put(_edgesCf, k.toSlice(), v.toSlice())); } return NO_ERROR; } EggsError _applySoftUnlinkFile(EggsTime time, rocksdb::WriteBatch& batch, const SoftUnlinkFileEntry& entry, SoftUnlinkFileResp& resp) { return _softUnlinkCurrentEdge(time, batch, entry.ownerId, entry.name, entry.creationTime, entry.fileId, true); } EggsError _applyCreateDirectoryInode(EggsTime time, rocksdb::WriteBatch& batch, const CreateDirectoryInodeEntry& entry, CreateDirectoryInodeResp& resp) { // The assumption here is that only the CDC creates directories, and it doles out // inode ids per transaction, so that you'll never get competing creates here, but // we still check that the parent makes sense. { std::string dirValue; ExternalValue dir; // we never create directories as snapshot EggsError err = _getDirectory({}, entry.id, false, dirValue, dir); if (err == NO_ERROR) { if (dir().ownerId() != entry.ownerId) { return EggsError::MISMATCHING_OWNER; } else { return NO_ERROR; } } else if (err == EggsError::DIRECTORY_NOT_FOUND) { // we continue } else { return err; } } { auto dirKey = InodeIdKey::Static(entry.id); StaticValue dir; dir().setOwnerId(entry.ownerId); dir().setMtime(time); dir().setHashMode(HashMode::XXH3_63); dir().setInfoInherited(entry.info.inherited); LOG_DEBUG(_env, "inherited: %s, hasInfo: %s, body: %s", entry.info.inherited, dir().mustHaveInfo(), entry.info.body.ref()); dir().setInfo(entry.info.body.ref()); ROCKS_DB_CHECKED(batch.Put(_directoriesCf, dirKey.toSlice(), dir.toSlice())); } resp.mtime = time; return NO_ERROR; } EggsError _applyCreateLockedCurrentEdge(EggsTime time, rocksdb::WriteBatch& batch, const CreateLockedCurrentEdgeEntry& entry, CreateLockedCurrentEdgeResp& resp) { auto err = _createCurrentEdge(time, batch, entry.dirId, entry.name, entry.targetId, true); // locked=true if (err != NO_ERROR) { return err; } resp.creationTime = time; return NO_ERROR; } EggsError _applyUnlockCurrentEdge(EggsTime time, rocksdb::WriteBatch& batch, const UnlockCurrentEdgeEntry& entry, UnlockCurrentEdgeResp& resp) { uint64_t nameHash; { std::string dirValue; ExternalValue dir; // allowSnaphsot=false since no current edges in snapshot dirs EggsError err = _getDirectory({}, entry.dirId, false, dirValue, dir); if (err != NO_ERROR) { return err; } nameHash = computeHash(dir().hashMode(), entry.name.ref()); } StaticValue currentKey; currentKey().setDirIdWithCurrent(entry.dirId, true); // current=true currentKey().setNameHash(nameHash); currentKey().setName(entry.name.ref()); std::string edgeValue; { auto status = _db->Get({}, _edgesCf, currentKey.toSlice(), &edgeValue); if (status.IsNotFound()) { return EggsError::EDGE_NOT_FOUND; } ROCKS_DB_CHECKED(status); } ExternalValue edge(edgeValue); if (edge().creationTime() != entry.creationTime) { return EggsError::MISMATCHING_CREATION_TIME; } if (edge().locked()) { edge().setTargetIdWithLocked(InodeIdExtra(entry.targetId, false)); // locked=false ROCKS_DB_CHECKED(batch.Put(_edgesCf, currentKey.toSlice(), edge.toSlice())); } if (entry.wasMoved) { // We need to move the current edge to snapshot, and create a new snapshot // edge with the deletion. ROCKS_DB_CHECKED(batch.Delete(_edgesCf, currentKey.toSlice())); StaticValue snapshotKey; snapshotKey().setDirIdWithCurrent(entry.dirId, false); // snapshot (current=false) snapshotKey().setNameHash(nameHash); snapshotKey().setName(entry.name.ref()); snapshotKey().setCreationTime(edge().creationTime()); StaticValue snapshotBody; snapshotBody().setTargetIdWithOwned(InodeIdExtra(entry.targetId, false)); ROCKS_DB_CHECKED(batch.Put(_edgesCf, snapshotKey.toSlice(), snapshotBody.toSlice())); snapshotKey().setCreationTime(time); snapshotBody().setTargetIdWithOwned(InodeIdExtra(NULL_INODE_ID, false)); // deletion edges are never owned ROCKS_DB_CHECKED(batch.Put(_edgesCf, snapshotKey.toSlice(), snapshotBody.toSlice())); } return NO_ERROR; } EggsError _applyLockCurrentEdge(EggsTime time, rocksdb::WriteBatch& batch, const LockCurrentEdgeEntry& entry, LockCurrentEdgeResp& resp) { // TODO lots of duplication with _applyUnlockCurrentEdge uint64_t nameHash; { std::string dirValue; ExternalValue dir; // allowSnaphsot=false since no current edges in snapshot dirs EggsError err = _getDirectory({}, entry.dirId, false, dirValue, dir); if (err != NO_ERROR) { return err; } nameHash = computeHash(dir().hashMode(), entry.name.ref()); } StaticValue currentKey; currentKey().setDirIdWithCurrent(entry.dirId, true); // current=true currentKey().setNameHash(nameHash); currentKey().setName(entry.name.ref()); std::string edgeValue; { auto status = _db->Get({}, _edgesCf, currentKey.toSlice(), &edgeValue); if (status.IsNotFound()) { return EggsError::EDGE_NOT_FOUND; } ROCKS_DB_CHECKED(status); } ExternalValue edge(edgeValue); if (edge().creationTime() != entry.creationTime) { return EggsError::MISMATCHING_CREATION_TIME; } if (!edge().locked()) { edge().setTargetIdWithLocked({entry.targetId, true}); // locked=true ROCKS_DB_CHECKED(batch.Put(_edgesCf, currentKey.toSlice(), edge.toSlice())); } return NO_ERROR; } EggsError _applyRemoveDirectoryOwner(EggsTime time, rocksdb::WriteBatch& batch, const RemoveDirectoryOwnerEntry& entry, RemoveDirectoryOwnerResp& resp) { std::string dirValue; ExternalValue dir; { // allowSnapshot=true for idempotency (see below) EggsError err = _initiateDirectoryModification(time, true, batch, entry.dirId, dirValue, dir); if (err != NO_ERROR) { return err; } if (dir().ownerId() == NULL_INODE_ID) { return NO_ERROR; // already done } } // if we have any current edges, we can't proceed { StaticValue edgeKey; edgeKey().setDirIdWithCurrent(entry.dirId, true); // current=true edgeKey().setNameHash(0); edgeKey().setName({}); WrappedIterator it(_db->NewIterator({}, _edgesCf)); it->Seek(edgeKey.toSlice()); if (it->Valid()) { auto otherEdge = ExternalValue::FromSlice(it->key()); if (otherEdge().dirId() == entry.dirId && otherEdge().current()) { return EggsError::DIRECTORY_NOT_EMPTY; } } else if (it->status().IsNotFound()) { // nothing to do } else { ROCKS_DB_CHECKED(it->status()); } } // we need to create a new DirectoryBody rather than modify the old one, the materialized // info might have changed size { StaticValue newDir; newDir().setOwnerId(NULL_INODE_ID); newDir().setMtime(time); newDir().setHashMode(dir().hashMode()); newDir().setInfoInherited(dir().infoInherited()); newDir().setInfo(entry.info.ref()); auto k = InodeIdKey::Static(entry.dirId); ROCKS_DB_CHECKED(batch.Put(_directoriesCf, k.toSlice(), newDir.toSlice())); } return NO_ERROR; } EggsError _applyRemoveDirectoryInode(EggsTime time, rocksdb::WriteBatch& batch, const RemoveInodeEntry& entry, RemoveInodeResp& resp) { ALWAYS_ASSERT(entry.id.type() == InodeType::DIRECTORY); std::string dirValue; ExternalValue dir; { EggsError err = _getDirectory({}, entry.id, true, dirValue, dir); if (err == EggsError::DIRECTORY_NOT_FOUND) { return NO_ERROR; // we're already done } if (err != NO_ERROR) { return err; } } if (dir().ownerId() != NULL_INODE_ID) { return EggsError::DIRECTORY_HAS_OWNER; } // there can't be any outgoing edges when killing a directory definitively { StaticValue edgeKey; edgeKey().setDirIdWithCurrent(entry.id, false); edgeKey().setNameHash(0); edgeKey().setName({}); edgeKey().setCreationTime(0); WrappedIterator it(_db->NewIterator({}, _edgesCf)); it->Seek(edgeKey.toSlice()); if (it->Valid()) { auto otherEdge = ExternalValue::FromSlice(it->key()); if (otherEdge().dirId() == entry.id) { LOG_DEBUG(_env, "found edge %s when trying to remove directory %s", otherEdge(), entry.id); return EggsError::DIRECTORY_NOT_EMPTY; } } else if (it->status().IsNotFound()) { // nothing to do } else { ROCKS_DB_CHECKED(it->status()); } } // we can finally delete { auto dirKey = InodeIdKey::Static(entry.id); ROCKS_DB_CHECKED(batch.Delete(_directoriesCf, dirKey.toSlice())); } return NO_ERROR; } EggsError _applyRemoveFileInode(EggsTime time, rocksdb::WriteBatch& batch, const RemoveInodeEntry& entry, RemoveInodeResp& resp) { ALWAYS_ASSERT(entry.id.type() == InodeType::FILE || entry.id.type() == InodeType::DIRECTORY); // we demand for the file to be transient, for the deadline to have passed, and for it to have // no spans { std::string transientFileValue; ExternalValue transientFile; EggsError err = _getTransientFile({}, time, true /*allowPastDeadline*/, entry.id, transientFileValue, transientFile); if (err == EggsError::FILE_NOT_FOUND) { std::string fileValue; ExternalValue file; EggsError err = _getFile({}, entry.id, fileValue, file); if (err == NO_ERROR) { return EggsError::FILE_IS_NOT_TRANSIENT; } else if (err == EggsError::FILE_NOT_FOUND) { // In this case the inode is just gone. The best thing to do is // to just be OK with it, since we need to handle repeated calls // nicely. return NO_ERROR; } else { return err; } } else if (err == NO_ERROR) { // keep going } else { return err; } // check deadline if (transientFile().deadline() >= time) { return EggsError::DEADLINE_NOT_PASSED; } // check no spans { StaticValue spanKey; spanKey().setFileId(entry.id); spanKey().setOffset(0); WrappedIterator it(_db->NewIterator({}, _spansCf)); it->Seek(spanKey.toSlice()); if (it->Valid()) { auto otherSpan = ExternalValue::FromSlice(it->key()); if (otherSpan().fileId() == entry.id) { return EggsError::FILE_NOT_EMPTY; } } else { ROCKS_DB_CHECKED(it->status()); } } } // we can finally delete { auto fileKey = InodeIdKey::Static(entry.id); ROCKS_DB_CHECKED(batch.Delete(_transientCf, fileKey.toSlice())); } return NO_ERROR; } EggsError _applyRemoveInode(EggsTime time, rocksdb::WriteBatch& batch, const RemoveInodeEntry& entry, RemoveInodeResp& resp) { if (entry.id.type() == InodeType::DIRECTORY) { return _applyRemoveDirectoryInode(time, batch, entry, resp); } else { return _applyRemoveFileInode(time, batch, entry, resp); } } EggsError _applySetDirectoryOwner(EggsTime time, rocksdb::WriteBatch& batch, const SetDirectoryOwnerEntry& entry, SetDirectoryOwnerResp& resp) { std::string dirValue; ExternalValue dir; { EggsError err = _initiateDirectoryModification(time, true, batch, entry.dirId, dirValue, dir); if (err != NO_ERROR) { return err; } } // set the owner, and restore the directory entry if needed. StaticValue newDir; newDir().setOwnerId(entry.ownerId); newDir().setMtime(dir().mtime()); newDir().setHashMode(dir().hashMode()); newDir().setInfoInherited(dir().infoInherited()); // remove directory info if it was inherited, now that we have an // owner again. if (dir().infoInherited()) { newDir().setInfo({}); } { auto k = InodeIdKey::Static(entry.dirId); ROCKS_DB_CHECKED(batch.Put(_directoriesCf, k.toSlice(), newDir.toSlice())); } return NO_ERROR; } EggsError _applySetDirectoryInfo(EggsTime time, rocksdb::WriteBatch& batch, const SetDirectoryInfoEntry& entry, SetDirectoryInfoResp& resp) { std::string dirValue; ExternalValue dir; { // allowSnapshot=true since we might want to influence deletion policies for already deleted // directories. EggsError err = _initiateDirectoryModification(time, true, batch, entry.dirId, dirValue, dir); if (err != NO_ERROR) { return err; } } StaticValue newDir; newDir().setOwnerId(dir().ownerId()); newDir().setMtime(dir().mtime()); newDir().setHashMode(dir().hashMode()); newDir().setInfoInherited(entry.info.inherited); // For snapshot directories, we need to preserve the last known info if inherited. // It'll be reset when it's made non-snapshot again. if (!entry.info.inherited || (dir().ownerId() != NULL_INODE_ID)) { newDir().setInfo(entry.info.body.ref()); } { auto k = InodeIdKey::Static(entry.dirId); ROCKS_DB_CHECKED(batch.Put(_directoriesCf, k.toSlice(), newDir.toSlice())); } return NO_ERROR; } EggsError _applyRemoveNonOwnedEdge(EggsTime time, rocksdb::WriteBatch& batch, const RemoveNonOwnedEdgeEntry& entry, RemoveNonOwnedEdgeResp& resp) { uint64_t nameHash; { // allowSnapshot=true since GC needs to be able to remove non-owned edges from snapshot dir EggsError err = _initiateDirectoryModificationAndHash(time, true, batch, entry.dirId, entry.name.ref(), nameHash); if (err != NO_ERROR) { return err; } } // We check that edge is still not owned -- otherwise we might orphan a file. { StaticValue k; k().setDirIdWithCurrent(entry.dirId, false); // snapshot (current=false), we're deleting a non owned snapshot edge k().setNameHash(nameHash); k().setName(entry.name.ref()); k().setCreationTime(entry.creationTime); std::string edgeValue; auto status = _db->Get({}, _edgesCf, k.toSlice(), &edgeValue); if (status.IsNotFound()) { return NO_ERROR; // make the client's life easier } ROCKS_DB_CHECKED(status); ExternalValue edge(edgeValue); if (edge().targetIdWithOwned().extra()) { // TODO better error here? return EggsError::EDGE_NOT_FOUND; // unexpectedly owned } // we can go ahead and safely delete ROCKS_DB_CHECKED(batch.Delete(_edgesCf, k.toSlice())); } return NO_ERROR; } EggsError _applySameShardHardFileUnlink(EggsTime time, rocksdb::WriteBatch& batch, const SameShardHardFileUnlinkEntry& entry, SameShardHardFileUnlinkResp& resp) { // fetch the file std::string fileValue; ExternalValue file; { EggsError err = _getFile({}, entry.targetId, fileValue, file); if (err == EggsError::FILE_NOT_FOUND) { // if the file is already transient, we're done std::string transientFileValue; ExternalValue transientFile; EggsError err = _getTransientFile({}, time, true, entry.targetId, fileValue, transientFile); if (err == NO_ERROR) { return NO_ERROR; } else if (err == EggsError::FILE_NOT_FOUND) { return EggsError::FILE_NOT_FOUND; } else { return err; } } else if (err != NO_ERROR) { return err; } } // fetch dir, compute hash uint64_t nameHash; { std::string dirValue; ExternalValue dir; // allowSnapshot=true since GC needs to be able to do this in snapshot dirs EggsError err = _initiateDirectoryModification(time, true, batch, entry.ownerId, dirValue, dir); nameHash = computeHash(dir().hashMode(), entry.name.ref()); } // We need to check that the edge is still there, and that it still owns the // file. Maybe the file was re-owned by someone else in the meantime, in which case // we can't proceed making the file transient. { StaticValue k; // current=false since we can only delete k().setDirIdWithCurrent(entry.ownerId, false); k().setNameHash(nameHash); k().setName(entry.name.ref()); k().setCreationTime(entry.creationTime); std::string edgeValue; auto status = _db->Get({}, _edgesCf, k.toSlice(), &edgeValue); if (status.IsNotFound()) { return EggsError::EDGE_NOT_FOUND; // can't return NO_ERROR, since the transient file still exists } ROCKS_DB_CHECKED(status); ExternalValue edge(edgeValue); if (!edge().targetIdWithOwned().extra()) { // not owned return EggsError::EDGE_NOT_FOUND; } // we can proceed ROCKS_DB_CHECKED(batch.Delete(_edgesCf, k.toSlice())); } // make file transient { auto k = InodeIdKey::Static(entry.targetId); ROCKS_DB_CHECKED(batch.Delete(_filesCf, k.toSlice())); StaticValue v; v().setFileSize(file().fileSize()); v().setMtime(time); v().setDeadline(0); v().setLastSpanState(SpanState::CLEAN); v().setNote(entry.name.ref()); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), v.toSlice())); } return NO_ERROR; } EggsError _applyRemoveSpanInitiate(EggsTime time, rocksdb::WriteBatch& batch, const RemoveSpanInitiateEntry& entry, RemoveSpanInitiateResp& resp) { std::string fileValue; ExternalValue file; { EggsError err = _initiateTransientFileModification(time, true, batch, entry.fileId, fileValue, file); if (err != NO_ERROR) { return err; } } // Exit early if file is empty. Crucial to do this with the size, // otherwise we might spend a lot of time poring through the SSTs // making sure there are no spans. if (file().fileSize() == 0) { return EggsError::FILE_EMPTY; } LOG_DEBUG(_env, "deleting span from file %s of size %s", entry.fileId, file().fileSize()); // Fetch the last span WrappedIterator spanIt(_db->NewIterator({}, _spansCf)); ExternalValue spanKey; ExternalValue span; { StaticValue endKey; endKey().setFileId(entry.fileId); endKey().setOffset(file().fileSize()); spanIt->SeekForPrev(endKey.toSlice()); ROCKS_DB_CHECKED(spanIt->status()); ALWAYS_ASSERT(spanIt->Valid()); // we know the file isn't empty, we must have a span spanKey = ExternalValue::FromSlice(spanIt->key()); ALWAYS_ASSERT(spanKey().fileId() == entry.fileId); // again, we know the file isn't empty span = ExternalValue::FromSlice(spanIt->value()); } ALWAYS_ASSERT(span().storageClass() != EMPTY_STORAGE); resp.byteOffset = spanKey().offset(); // If the span is blockless, the only thing we need to to do is remove it if (span().storageClass() == INLINE_STORAGE) { ROCKS_DB_CHECKED(batch.Delete(_spansCf, spanKey.toSlice())); file().setFileSize(spanKey().offset()); { auto k = InodeIdKey::Static(entry.fileId); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), file.toSlice())); } return NO_ERROR; } // Otherwise, we need to condemn it first, and then certify the deletion. // Note that we allow to remove dirty spans -- this is important to deal well with // the case where a writer dies in the middle of adding a span. file().setLastSpanState(SpanState::CONDEMNED); { auto k = InodeIdKey::Static(entry.fileId); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), file.toSlice())); } // Fill in the response blocks resp.blocks.els.reserve(span().parity().blocks()); for (int i = 0; i < span().parity().blocks(); i++) { const auto& block = span().block(i); const auto& cache = _blockServicesCache.at(block.blockServiceId); auto& respBlock = resp.blocks.els.emplace_back(); respBlock.blockServiceIp = cache.ip; respBlock.blockServicePort = cache.port; respBlock.blockServiceId = block.blockServiceId; respBlock.blockId = block.blockId; respBlock.certificate = _blockDeleteCertificate(span().blockSize(), block, cache.secretKey); } return NO_ERROR; } // Important for this to never error: we rely on the write to go through since // we write _currentBlockServices/_blockServicesCache, which we rely to be in // sync with RocksDB. void _applyUpdateBlockServices(EggsTime time, rocksdb::WriteBatch& batch, const UpdateBlockServicesEntry& entry) { // The idea here is that we ask for some block services to use to shuckle once a minute, // and then just pick at random within them. We shouldn't have many to pick from at any // given point. ALWAYS_ASSERT(entry.blockServices.els.size() < 256); // TODO proper error OwnedValue currentBody(entry.blockServices.els.size()); _currentBlockServices.resize(entry.blockServices.els.size()); StaticValue blockKey; blockKey().setKey(BLOCK_SERVICE_KEY); StaticValue blockBody; for (int i = 0; i < entry.blockServices.els.size(); i++) { const auto& entryBlock = entry.blockServices.els[i]; currentBody().set(i, entryBlock.id); _currentBlockServices[i] = entryBlock.id; blockKey().setBlockServiceId(entryBlock.id); blockBody().setId(entryBlock.id); blockBody().setIp(entryBlock.ip.data); blockBody().setPort(entryBlock.port); blockBody().setStorageClass(entryBlock.storageClass); blockBody().setFailureDomain(entryBlock.failureDomain.data); blockBody().setSecretKey(entryBlock.secretKey.data); ROCKS_DB_CHECKED(batch.Put(_defaultCf, blockKey.toSlice(), blockBody.toSlice())); auto& cache = _blockServicesCache[entryBlock.id]; expandKey(entryBlock.secretKey.data, cache.secretKey); cache.ip = entryBlock.ip.data; cache.port = entryBlock.port; cache.storageClass = entryBlock.storageClass; } ROCKS_DB_CHECKED(batch.Put(_defaultCf, shardMetadataKey(&CURRENT_BLOCK_SERVICES_KEY), currentBody.toSlice())); } uint64_t _getNextBlockId() { std::string v; ROCKS_DB_CHECKED(_db->Get({}, _defaultCf, shardMetadataKey(&NEXT_BLOCK_ID_KEY), &v)); return ExternalValue(v)().u64(); } uint64_t _updateNextBlockId(EggsTime time, uint64_t& nextBlockId) { // time is embedded into the id, other than LSB which is shard nextBlockId = std::max(nextBlockId + 0x100, _shid.u8 | (time.ns & ~0xFFull)); return nextBlockId; } void _writeNextBlockId(rocksdb::WriteBatch& batch, uint64_t nextBlockId) { StaticValue v; v().setU64(nextBlockId); ROCKS_DB_CHECKED(batch.Put(_defaultCf, shardMetadataKey(&NEXT_BLOCK_ID_KEY), v.toSlice())); } void _fillInAddSpanInitiate(const SpanBody& spanBody, AddSpanInitiateResp& resp) { resp.blocks.els.reserve(spanBody.parity().blocks()); BlockBody block; for (int i = 0; i < spanBody.parity().blocks(); i++) { BlockBody block = spanBody.block(i); auto& respBlock = resp.blocks.els.emplace_back(); respBlock.blockServiceId = block.blockServiceId; respBlock.blockId = block.blockId; const auto& cache = _blockServicesCache.at(block.blockServiceId); respBlock.blockServiceIp = cache.ip; respBlock.blockServicePort = cache.port; respBlock.certificate.data = _blockAddCertificate(spanBody.blockSize(), block, cache.secretKey); } } EggsError _applyAddSpanInitiate(EggsTime time, rocksdb::WriteBatch& batch, const AddSpanInitiateEntry& entry, AddSpanInitiateResp& resp) { std::string fileValue; ExternalValue file; { EggsError err = _initiateTransientFileModification(time, false, batch, entry.fileId, fileValue, file); if (err != NO_ERROR) { return err; } } // Special case -- for empty spans we have nothing to do if (entry.size == 0) { return NO_ERROR; } StaticValue spanKey; spanKey().setFileId(entry.fileId); spanKey().setOffset(entry.byteOffset); // Check that the file is where where we expect it if (file().fileSize() != entry.byteOffset) { // Special case: we're trying to add the same span again. This is acceptable // in the name of idempotency, but we return the blocks we've previously // computed. // // We _must_ return the blocks we've first returned, and ignored whatever // comes next, even if a new blacklist might have influenced the decision. // // This is since we need to be certain that we don't leak blocks. So either // the client needs to migrate the blocks, or just throw away the file. if (file().fileSize() == entry.byteOffset+entry.size) { std::string spanValue; auto status = _db->Get({}, _spansCf, spanKey.toSlice(), &spanValue); if (status.IsNotFound()) { LOG_DEBUG(_env, "file size does not match, but could not find existing span"); return EggsError::SPAN_NOT_FOUND; } ROCKS_DB_CHECKED(status); ExternalValue existingSpan(spanValue); if ( existingSpan().spanSize() != entry.size || existingSpan().blockSize() != entry.blockSize || existingSpan().parity() != entry.parity || existingSpan().crc32() != entry.crc32 ) { LOG_DEBUG(_env, "file size does not match, and existing span does not match"); return EggsError::SPAN_NOT_FOUND; } _fillInAddSpanInitiate(existingSpan(), resp); return NO_ERROR; } LOG_DEBUG(_env, "expecting file size %s, but got %s, returning span not found", entry.byteOffset, file().fileSize()); return EggsError::SPAN_NOT_FOUND; } // We're actually adding a new span -- the span state must be clean. if (file().lastSpanState() != SpanState::CLEAN) { return EggsError::LAST_SPAN_STATE_NOT_CLEAN; } // Update the file with the new file size and set the last span state to dirty, // if we're not blockless file().setFileSize(entry.byteOffset+entry.size); if (entry.storageClass != EMPTY_STORAGE && entry.storageClass != INLINE_STORAGE) { file().setLastSpanState(SpanState::DIRTY); } { auto k = InodeIdKey::Static(entry.fileId); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), file.toSlice())); } // Now manufacture and add the span, also recording the blocks // in the block service -> files index. OwnedValue spanBody(entry.storageClass, entry.parity, entry.bodyBytes.ref()); { spanBody().setSpanSize(entry.size); spanBody().setBlockSize(entry.blockSize); spanBody().setCrc32(entry.crc32.data); uint64_t nextBlockId = _getNextBlockId(); BlockBody block; for (int i = 0; i < entry.parity.blocks(); i++) { auto& entryBlock = entry.bodyBlocks.els[i]; block.blockId = _updateNextBlockId(time, nextBlockId); block.blockServiceId = entryBlock.blockServiceId; block.crc32 = entryBlock.crc32.data; spanBody().setBlock(i, block); { StaticValue k; k().setBlockServiceId(entryBlock.blockServiceId); k().setFileId(entry.fileId); ROCKS_DB_CHECKED(batch.Put(_blockServicesToFilesCf, k.toSlice(), {})); } } _writeNextBlockId(batch, nextBlockId); ROCKS_DB_CHECKED(batch.Put(_spansCf, spanKey.toSlice(), spanBody.toSlice())); } // Fill in the response _fillInAddSpanInitiate(spanBody(), resp); return NO_ERROR; } std::array _blockAddCertificate(uint32_t blockSize, const BlockBody& block, const AES128Key& secretKey) { char buf[32]; memset(buf, 0, sizeof(buf)); BincodeBuf bbuf(buf, sizeof(buf)); // struct.pack_into('(block.blockServiceId); bbuf.packScalar('w'); bbuf.packScalar(block.blockId); bbuf.packFixedBytes<4>({block.crc32}); bbuf.packScalar(blockSize); return cbcmac(secretKey, (uint8_t*)buf, sizeof(buf)); } bool _checkBlockAddProof(uint64_t blockServiceId, const BlockProof& proof) { char buf[32]; memset(buf, 0, sizeof(buf)); BincodeBuf bbuf(buf, sizeof(buf)); // struct.pack_into('(blockServiceId); bbuf.packScalar('W'); bbuf.packScalar(proof.blockId); const auto& cache = _blockServicesCache.at(blockServiceId); auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf)); return proof.proof == expectedProof; } std::array _blockDeleteCertificate(uint32_t blockSize, const BlockBody& block, const AES128Key& secretKey) { char buf[32]; memset(buf, 0, sizeof(buf)); BincodeBuf bbuf(buf, sizeof(buf)); // struct.pack_into('(block.blockServiceId); bbuf.packScalar('e'); bbuf.packScalar(block.blockId); return cbcmac(secretKey, (uint8_t*)buf, sizeof(buf)); } bool _checkBlockDeleteProof(uint64_t blockServiceId, const BlockProof& proof) { char buf[32]; memset(buf, 0, sizeof(buf)); BincodeBuf bbuf(buf, sizeof(buf)); // struct.pack_into('(blockServiceId); bbuf.packScalar('E'); bbuf.packScalar(proof.blockId); const auto& cache = _blockServicesCache.at(blockServiceId); auto expectedProof = cbcmac(cache.secretKey, (uint8_t*)buf, sizeof(buf)); bool good = proof.proof == expectedProof; if (!good) { RAISE_ALERT(_env, "Bad block delete proof, expected %s, got %s", BincodeFixedBytes<8>(expectedProof), BincodeFixedBytes<8>(proof.proof)); } return good; } EggsError _applyAddSpanCertify(EggsTime time, rocksdb::WriteBatch& batch, const AddSpanCertifyEntry& entry, AddSpanCertifyResp& resp) { std::string fileValue; ExternalValue file; { EggsError err = _initiateTransientFileModification(time, false, batch, entry.fileId, fileValue, file); if (err != NO_ERROR) { return err; } } StaticValue spanKey; spanKey().setFileId(entry.fileId); spanKey().setOffset(entry.byteOffset); // Make sure the existing span exists, exit early if we're already done with // it, verify the proofs. { std::string spanValue; auto status = _db->Get({}, _spansCf, spanKey.toSlice(), &spanValue); if (status.IsNotFound()) { return EggsError::SPAN_NOT_FOUND; } ROCKS_DB_CHECKED(status); ExternalValue span(spanValue); // "Is the span still there" if (file().fileSize() > entry.byteOffset+span().spanSize()) { return NO_ERROR; // already certified (we're past it) } if (file().lastSpanState() == SpanState::CLEAN) { return NO_ERROR; // already certified } if (file().lastSpanState() == SpanState::CONDEMNED) { return EggsError::SPAN_NOT_FOUND; // we could probably have a better error here } ALWAYS_ASSERT(file().lastSpanState() == SpanState::DIRTY); // Now verify the proofs if (span().storageClass() == INLINE_STORAGE) { return EggsError::CANNOT_CERTIFY_BLOCKLESS_SPAN; } if (span().parity().blocks() != entry.proofs.els.size()) { return EggsError::BAD_NUMBER_OF_BLOCKS_PROOFS; } BlockBody block; for (int i = 0; i < span().parity().blocks(); i++) { BlockBody block = span().block(i); if (!_checkBlockAddProof(block.blockServiceId, entry.proofs.els[i])) { return EggsError::BAD_BLOCK_PROOF; } } } // Okay, now we can update the file to mark the last span as clean file().setLastSpanState(SpanState::CLEAN); { auto k = InodeIdKey::Static(entry.fileId); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), file.toSlice())); } // We're done. return NO_ERROR; } EggsError _applyMakeFileTransient(EggsTime time, rocksdb::WriteBatch& batch, const MakeFileTransientEntry& entry, MakeFileTransientResp& resp) { std::string fileValue; ExternalValue file; { EggsError err = _getFile({}, entry.id, fileValue, file); if (err == EggsError::FILE_NOT_FOUND) { // if it's already transient, we're done std::string transientFileValue; ExternalValue transientFile; EggsError err = _getTransientFile({}, time, true, entry.id, transientFileValue, transientFile); if (err == NO_ERROR) { return NO_ERROR; } } if (err != NO_ERROR) { return err; } } // delete the file auto k = InodeIdKey::Static(entry.id); ROCKS_DB_CHECKED(batch.Delete(_filesCf, k.toSlice())); // make a transient one StaticValue transientFile; transientFile().setFileSize(file().fileSize()); transientFile().setMtime(time); transientFile().setDeadline(0); transientFile().setLastSpanState(SpanState::CLEAN); transientFile().setNote(entry.note.ref()); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), transientFile.toSlice())); return NO_ERROR; } EggsError _applyRemoveSpanCertify(EggsTime time, rocksdb::WriteBatch& batch, const RemoveSpanCertifyEntry& entry, RemoveSpanCertifyResp& resp) { std::string fileValue; ExternalValue file; { EggsError err = _initiateTransientFileModification(time, true, batch, entry.fileId, fileValue, file); if (err != NO_ERROR) { return err; } } // Fetch span StaticValue spanKey; spanKey().setFileId(entry.fileId); spanKey().setOffset(entry.byteOffset); std::string spanValue; ExternalValue span; { auto status = _db->Get({}, _spansCf, spanKey.toSlice(), &spanValue); if (status.IsNotFound()) { LOG_DEBUG(_env, "skipping removal of span for file %s, offset %s, since we're already done", entry.fileId, entry.byteOffset); return NO_ERROR; // already done } ROCKS_DB_CHECKED(status); span = ExternalValue(spanValue); } ALWAYS_ASSERT(span().storageClass() != EMPTY_STORAGE); if (span().storageClass() == INLINE_STORAGE) { return EggsError::CANNOT_CERTIFY_BLOCKLESS_SPAN; } // Make sure we're condemned if (file().lastSpanState() != SpanState::CONDEMNED) { return EggsError::SPAN_NOT_FOUND; // TODO maybe better error? } // Verify proofs if (entry.proofs.els.size() != span().parity().blocks()) { return EggsError::BAD_NUMBER_OF_BLOCKS_PROOFS; } for (int i = 0; i < span().parity().blocks(); i++) { const auto& block = span().block(i); const auto& proof = entry.proofs.els[i]; if (block.blockId != proof.blockId) { RAISE_ALERT(_env, "bad block proof id, expected %s, got %s", block.blockId, proof.blockId); return EggsError::BAD_BLOCK_PROOF; } if (!_checkBlockDeleteProof(block.blockServiceId, proof)) { return EggsError::BAD_BLOCK_PROOF; } } // Delete span, set new size, and go back to clean state LOG_DEBUG(_env, "deleting span for file %s, at offset %s", entry.fileId, entry.byteOffset); ROCKS_DB_CHECKED(batch.Delete(_spansCf, spanKey.toSlice())); { auto k = InodeIdKey::Static(entry.fileId); file().setLastSpanState(SpanState::CLEAN); file().setFileSize(spanKey().offset()); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), file.toSlice())); } return NO_ERROR; } EggsError _applyRemoveOwnedSnapshotFileEdge(EggsTime time, rocksdb::WriteBatch& batch, const RemoveOwnedSnapshotFileEdgeEntry& entry, RemoveOwnedSnapshotFileEdgeResp& resp) { uint64_t nameHash; { // the GC needs to work on deleted dirs who might still have owned files, so allowSnapshot=true EggsError err = _initiateDirectoryModificationAndHash(time, true, batch, entry.ownerId, entry.name.ref(), nameHash); if (err != NO_ERROR) { return err; } } { StaticValue edgeKey; edgeKey().setDirIdWithCurrent(entry.ownerId, false); // snapshot (current=false) edgeKey().setNameHash(nameHash); edgeKey().setName(entry.name.ref()); edgeKey().setCreationTime(entry.creationTime); ROCKS_DB_CHECKED(batch.Delete(_edgesCf, edgeKey.toSlice())); } return NO_ERROR; } EggsError applyLogEntry(bool sync, uint64_t logIndex, const ShardLogEntry& logEntry, ShardRespContainer& resp) { // TODO figure out the story with what regards time monotonicity (possibly drop non-monotonic log // updates?) LOG_DEBUG(_env, "applying log at index %s", logIndex); auto locked = _applyLogEntryLock.lock(); resp.clear(); EggsError err = NO_ERROR; rocksdb::WriteBatch batch; _advanceLastAppliedLogEntry(batch, logIndex); // We set this savepoint since we still want to record the log index advancement // even if the application does _not_ go through. // // This gives us the freedom to write freely in the specific _apply* functions below, // and fail midway, without worrying that we could write an inconsistent state, since // it's all rolled back. batch.SetSavePoint(); std::string entryScratch; EggsTime time = logEntry.time; const auto& logEntryBody = logEntry.body; LOG_DEBUG(_env, "about to apply log entry %s", logEntryBody); switch (logEntryBody.kind()) { case ShardLogEntryKind::CONSTRUCT_FILE: err = _applyConstructFile(batch, time, logEntryBody.getConstructFile(), resp.setConstructFile()); break; case ShardLogEntryKind::LINK_FILE: err = _applyLinkFile(batch, time, logEntryBody.getLinkFile(), resp.setLinkFile()); break; case ShardLogEntryKind::SAME_DIRECTORY_RENAME: err = _applySameDirectoryRename(time, batch, logEntryBody.getSameDirectoryRename(), resp.setSameDirectoryRename()); break; case ShardLogEntryKind::SOFT_UNLINK_FILE: err = _applySoftUnlinkFile(time, batch, logEntryBody.getSoftUnlinkFile(), resp.setSoftUnlinkFile()); break; case ShardLogEntryKind::CREATE_DIRECTORY_INODE: err = _applyCreateDirectoryInode(time, batch, logEntryBody.getCreateDirectoryInode(), resp.setCreateDirectoryInode()); break; case ShardLogEntryKind::CREATE_LOCKED_CURRENT_EDGE: err = _applyCreateLockedCurrentEdge(time, batch, logEntryBody.getCreateLockedCurrentEdge(), resp.setCreateLockedCurrentEdge()); break; case ShardLogEntryKind::UNLOCK_CURRENT_EDGE: err = _applyUnlockCurrentEdge(time, batch, logEntryBody.getUnlockCurrentEdge(), resp.setUnlockCurrentEdge()); break; case ShardLogEntryKind::LOCK_CURRENT_EDGE: err = _applyLockCurrentEdge(time, batch, logEntryBody.getLockCurrentEdge(), resp.setLockCurrentEdge()); break; case ShardLogEntryKind::REMOVE_DIRECTORY_OWNER: err = _applyRemoveDirectoryOwner(time, batch, logEntryBody.getRemoveDirectoryOwner(), resp.setRemoveDirectoryOwner()); break; case ShardLogEntryKind::REMOVE_INODE: err = _applyRemoveInode(time, batch, logEntryBody.getRemoveInode(), resp.setRemoveInode()); break; case ShardLogEntryKind::SET_DIRECTORY_OWNER: err = _applySetDirectoryOwner(time, batch, logEntryBody.getSetDirectoryOwner(), resp.setSetDirectoryOwner()); break; case ShardLogEntryKind::SET_DIRECTORY_INFO: err = _applySetDirectoryInfo(time, batch, logEntryBody.getSetDirectoryInfo(), resp.setSetDirectoryInfo()); break; case ShardLogEntryKind::REMOVE_NON_OWNED_EDGE: err = _applyRemoveNonOwnedEdge(time, batch, logEntryBody.getRemoveNonOwnedEdge(), resp.setRemoveNonOwnedEdge()); break; case ShardLogEntryKind::SAME_SHARD_HARD_FILE_UNLINK: err = _applySameShardHardFileUnlink(time, batch, logEntryBody.getSameShardHardFileUnlink(), resp.setSameShardHardFileUnlink()); break; case ShardLogEntryKind::REMOVE_SPAN_INITIATE: err = _applyRemoveSpanInitiate(time, batch, logEntryBody.getRemoveSpanInitiate(), resp.setRemoveSpanInitiate()); break; case ShardLogEntryKind::UPDATE_BLOCK_SERVICES: // no resp here -- the callers are internal _applyUpdateBlockServices(time, batch, logEntryBody.getUpdateBlockServices()); break; case ShardLogEntryKind::ADD_SPAN_INITIATE: err = _applyAddSpanInitiate(time, batch, logEntryBody.getAddSpanInitiate(), resp.setAddSpanInitiate()); break; case ShardLogEntryKind::ADD_SPAN_CERTIFY: err = _applyAddSpanCertify(time, batch, logEntryBody.getAddSpanCertify(), resp.setAddSpanCertify()); break; case ShardLogEntryKind::MAKE_FILE_TRANSIENT: err = _applyMakeFileTransient(time, batch, logEntryBody.getMakeFileTransient(), resp.setMakeFileTransient()); break; case ShardLogEntryKind::REMOVE_SPAN_CERTIFY: err = _applyRemoveSpanCertify(time, batch, logEntryBody.getRemoveSpanCertify(), resp.setRemoveSpanCertify()); break; case ShardLogEntryKind::REMOVE_OWNED_SNAPSHOT_FILE_EDGE: err = _applyRemoveOwnedSnapshotFileEdge(time, batch, logEntryBody.getRemoveOwnedSnapshotFileEdge(), resp.setRemoveOwnedSnapshotFileEdge()); break; default: throw EGGS_EXCEPTION("bad log entry kind %s", logEntryBody.kind()); } if (err != NO_ERROR) { LOG_INFO(_env, "could not apply log entry %s, index %s, because of err %s", logEntryBody.kind(), logIndex, err); batch.RollbackToSavePoint(); } else { LOG_DEBUG(_env, "applied log entry of kind %s, index %s, writing changes", logEntryBody.kind(), logIndex); } { rocksdb::WriteOptions options; options.sync = sync; ROCKS_DB_CHECKED(_db->Write(options, &batch)); } return err; } // ---------------------------------------------------------------- // miscellanea std::array _calcCookie(InodeId id) { return cbcmac(_expandedSecretKey, (const uint8_t*)&id, sizeof(id)); } uint64_t _lastAppliedLogEntry() { std::string value; ROCKS_DB_CHECKED(_db->Get({}, shardMetadataKey(&LAST_APPLIED_LOG_ENTRY_KEY), &value)); ExternalValue v(value); return v().u64(); } EggsError _getDirectory(const rocksdb::ReadOptions& options, InodeId id, bool allowSnapshot, std::string& dirValue, ExternalValue& dir) { if (unlikely(id.type() != InodeType::DIRECTORY)) { return EggsError::TYPE_IS_NOT_DIRECTORY; } auto k = InodeIdKey::Static(id); auto status = _db->Get(options, _directoriesCf, k.toSlice(), &dirValue); if (status.IsNotFound()) { return EggsError::DIRECTORY_NOT_FOUND; } ROCKS_DB_CHECKED(status); auto tmpDir = ExternalValue(dirValue); if (!allowSnapshot && (tmpDir().ownerId() == NULL_INODE_ID && id != ROOT_DIR_INODE_ID)) { // root dir never has an owner return EggsError::DIRECTORY_NOT_FOUND; } dir = tmpDir; return NO_ERROR; } EggsError _getDirectoryAndHash(const rocksdb::ReadOptions& options, InodeId id, bool allowSnapshot, const BincodeBytesRef& name, uint64_t& nameHash) { std::string dirValue; ExternalValue dir; EggsError err = _getDirectory(options, id, allowSnapshot, dirValue, dir); if (err != NO_ERROR) { return err; } nameHash = computeHash(dir().hashMode(), name); return NO_ERROR; } EggsError _getFile(const rocksdb::ReadOptions& options, InodeId id, std::string& fileValue, ExternalValue& file) { if (unlikely(id.type() != InodeType::FILE && id.type() != InodeType::SYMLINK)) { return EggsError::TYPE_IS_DIRECTORY; } auto k = InodeIdKey::Static(id); auto status = _db->Get(options, _filesCf, k.toSlice(), &fileValue); if (status.IsNotFound()) { return EggsError::FILE_NOT_FOUND; } ROCKS_DB_CHECKED(status); file = ExternalValue(fileValue); return NO_ERROR; } EggsError _getTransientFile(const rocksdb::ReadOptions& options, EggsTime time, bool allowPastDeadline, InodeId id, std::string& value, ExternalValue& file) { if (id.type() != InodeType::FILE && id.type() != InodeType::SYMLINK) { return EggsError::TYPE_IS_DIRECTORY; } auto k = InodeIdKey::Static(id); auto status = _db->Get(options, _transientCf, k.toSlice(), &value); if (status.IsNotFound()) { return EggsError::FILE_NOT_FOUND; } ROCKS_DB_CHECKED(status); auto tmpFile = ExternalValue(value); if (!allowPastDeadline && time > tmpFile().deadline()) { // this should be fairly uncommon LOG_INFO(_env, "not picking up transient file %s since its deadline %s is past the log entry time %s", id, tmpFile().deadline(), time); return EggsError::FILE_NOT_FOUND; } file = tmpFile; return NO_ERROR; } EggsError _initiateTransientFileModification( EggsTime time, bool allowPastDeadline, rocksdb::WriteBatch& batch, InodeId id, std::string& tfValue, ExternalValue& tf ) { ExternalValue tmpTf; EggsError err = _getTransientFile({}, time, allowPastDeadline, id, tfValue, tmpTf); if (err != NO_ERROR) { return err; } // Like with dirs, don't go backwards in time. This is possibly less needed than // with directories, but still seems good hygiene. if (tmpTf().mtime() >= time) { RAISE_ALERT(_env, "trying to modify transient file %s going backwards in time, file mtime is %s, log entry time is %s", id, tmpTf().mtime(), time); return EggsError::MTIME_IS_TOO_RECENT; } tmpTf().setMtime(time); if (!allowPastDeadline) { tmpTf().setDeadline(time + DEADLINE_INTERVAL); } { auto k = InodeIdKey::Static(id); ROCKS_DB_CHECKED(batch.Put(_transientCf, k.toSlice(), tmpTf.toSlice())); } tf = tmpTf; return NO_ERROR; } }; BincodeBytes defaultDirectoryInfo() { char buf[255]; DirectoryInfoBody info; info.version = 0; // delete after 30 days info.deleteAfterTime = (30ull /*days*/ * 24 /*hours*/ * 60 /*minutes*/ * 60 /*seconds*/ * 1'000'000'000 /*ns*/) | (1ull<<63); // do not delete after N versions info.deleteAfterVersions = 0; // up to 1MiB flash, up to 10MiB HDD (should be 100MiB really, it's nicer to test with smaller // spans now). auto& flashPolicy = info.spanPolicies.els.emplace_back(); flashPolicy.maxSize = 1ull << 20; // 1MiB flashPolicy.storageClass = storageClassByName("FLASH"); flashPolicy.parity = Parity(1, 1); // RAID1 auto& hddPolicy = info.spanPolicies.els.emplace_back(); hddPolicy.maxSize = 10ull << 20; // 10MiB hddPolicy.storageClass = storageClassByName("HDD"); hddPolicy.parity = Parity(1, 1); // RAID1 BincodeBuf bbuf(buf, 255); info.pack(bbuf); return BincodeBytes(buf, bbuf.len()); } bool readOnlyShardReq(const ShardMessageKind kind) { switch (kind) { case ShardMessageKind::LOOKUP: case ShardMessageKind::STAT_FILE: case ShardMessageKind::STAT_TRANSIENT_FILE: case ShardMessageKind::STAT_DIRECTORY: case ShardMessageKind::READ_DIR: case ShardMessageKind::FULL_READ_DIR: case ShardMessageKind::FILE_SPANS: case ShardMessageKind::VISIT_DIRECTORIES: case ShardMessageKind::VISIT_FILES: case ShardMessageKind::VISIT_TRANSIENT_FILES: case ShardMessageKind::BLOCK_SERVICE_FILES: case ShardMessageKind::SNAPSHOT_LOOKUP: return true; case ShardMessageKind::CONSTRUCT_FILE: case ShardMessageKind::ADD_SPAN_INITIATE: case ShardMessageKind::ADD_SPAN_CERTIFY: case ShardMessageKind::LINK_FILE: case ShardMessageKind::SOFT_UNLINK_FILE: case ShardMessageKind::SAME_DIRECTORY_RENAME: case ShardMessageKind::SET_DIRECTORY_INFO: case ShardMessageKind::REMOVE_NON_OWNED_EDGE: case ShardMessageKind::SAME_SHARD_HARD_FILE_UNLINK: case ShardMessageKind::REMOVE_SPAN_INITIATE: case ShardMessageKind::REMOVE_SPAN_CERTIFY: case ShardMessageKind::SWAP_BLOCKS: case ShardMessageKind::CREATE_DIRECTORY_INODE: case ShardMessageKind::SET_DIRECTORY_OWNER: case ShardMessageKind::REMOVE_DIRECTORY_OWNER: case ShardMessageKind::CREATE_LOCKED_CURRENT_EDGE: case ShardMessageKind::LOCK_CURRENT_EDGE: case ShardMessageKind::UNLOCK_CURRENT_EDGE: case ShardMessageKind::REMOVE_INODE: case ShardMessageKind::REMOVE_OWNED_SNAPSHOT_FILE_EDGE: case ShardMessageKind::MAKE_FILE_TRANSIENT: return false; case ShardMessageKind::ERROR: throw EGGS_EXCEPTION("unexpected ERROR shard message kind"); } throw EGGS_EXCEPTION("bad message kind %s", kind); } ShardDB::ShardDB(Logger& logger, ShardId shid, const std::string& path) { _impl = new ShardDBImpl(logger, shid, path); } void ShardDB::close() { ((ShardDBImpl*)_impl)->close(); } ShardDB::~ShardDB() { delete (ShardDBImpl*)_impl; _impl = nullptr; } EggsError ShardDB::read(const ShardReqContainer& req, ShardRespContainer& resp) { return ((ShardDBImpl*)_impl)->read(req, resp); } EggsError ShardDB::prepareLogEntry(const ShardReqContainer& req, ShardLogEntry& logEntry) { return ((ShardDBImpl*)_impl)->prepareLogEntry(req, logEntry); } EggsError ShardDB::applyLogEntry(bool sync, uint64_t logEntryIx, const ShardLogEntry& logEntry, ShardRespContainer& resp) { return ((ShardDBImpl*)_impl)->applyLogEntry(sync, logEntryIx, logEntry, resp); } uint64_t ShardDB::lastAppliedLogEntry() { return ((ShardDBImpl*)_impl)->_lastAppliedLogEntry(); } const std::array& ShardDB::secretKey() const { return ((ShardDBImpl*)_impl)->_secretKey; }